diff --git a/.gitignore b/.gitignore index 5152f1aaf5..531f1a39c1 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,6 @@ buildNumber.properties cache/ libudpipe_java.dylib .DS_Store +.checkstyle +learning.log +.factorypath diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index f4b68d80b2..14b8aea4f5 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -43,6 +43,7 @@ Mateusz Parzonka Michael Unterkalmsteiner [mikmakmuk] Milen Kouylekov [kouylekov, kouylekov-usit] Nicolai Erbs [nicolaierbs] +Nicolas Paris [parisni] Niklas Jakob Nils Reimers [nreimers] Oliver Ferschke [ferschke] diff --git a/README.md b/README.md index 937647f383..f3b8dd6bf6 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # DKPro Core [![Build Status](https://zoidberg.ukp.informatik.tu-darmstadt.de:443/jenkins/job/DKPro%20Core%20(GitHub)/badge/icon)](https://zoidberg.ukp.informatik.tu-darmstadt.de:443/jenkins/job/DKPro%20Core%20(GitHub)/) -[![Maven Central](https://maven-badges.herokuapp.com/maven-central/de.tudarmstadt.ukp.dkpro.core/de.tudarmstadt.ukp.dkpro.core/badge.svg?style=plastic)](https://maven-badges.herokuapp.com/maven-central/de.tudarmstadt.ukp.dkpro.core/de.tudarmstadt.ukp.dkpro.core) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.dkpro.core/dkpro-core/badge.svg?style=plastic)](https://maven-badges.herokuapp.com/maven-central/org.dkpro.core/dkpro-core) [![Join the chat at https://gitter.im/dkpro/dkpro-core](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dkpro/dkpro-core?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) DKPro Core is a collection of software components for natural language processing (NLP) based on the @@ -9,4 +9,4 @@ Apache UIMA framework. For more information, visit the [DKPro Core website](https://dkpro.github.io/dkpro-core). -For usage examples, see the [DKPro Core Examples project](https://github.com/dkpro/dkpro-core-examples) \ No newline at end of file +For usage examples, see the [DKPro Core Examples project](https://github.com/dkpro/dkpro-core-examples) diff --git a/dkpro-core-api-anomaly-asl/pom.xml b/dkpro-core-api-anomaly-asl/pom.xml index 98dd1aa7ea..80c1a30af7 100644 --- a/dkpro-core-api-anomaly-asl/pom.xml +++ b/dkpro-core-api-anomaly-asl/pom.xml @@ -1,53 +1,54 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.api.anomaly-asl - jar - DKPro Core ASL - Anomaly API - - - org.apache.uima - uimaj-core - - - - - - false - src/main/resources - - desc/type/**/* - - - - true - src/main/resources - - desc/type/**/* - - - - + 4.0.0 + + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-api-anomaly-asl + jar + DKPro Core ASL - Anomaly API + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + + + + false + src/main/resources + + desc/type/**/* + + + + true + src/main/resources + + desc/type/**/* + + + + diff --git a/dkpro-core-api-anomaly-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/anomaly/package-info.java b/dkpro-core-api-anomaly-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/anomaly/package-info.java deleted file mode 100644 index d81af25abc..0000000000 --- a/dkpro-core-api-anomaly-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/anomaly/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Types for annotating spelling or grammar problems. - */ -package de.tudarmstadt.ukp.dkpro.core.api.anomaly; \ No newline at end of file diff --git a/dkpro-core-api-anomaly-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/anomaly/AnomalyApiConstants.java b/dkpro-core-api-anomaly-asl/src/main/java/org/dkpro/core/api/anomaly/AnomalyApiConstants.java similarity index 93% rename from dkpro-core-api-anomaly-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/anomaly/AnomalyApiConstants.java rename to dkpro-core-api-anomaly-asl/src/main/java/org/dkpro/core/api/anomaly/AnomalyApiConstants.java index 6266bb643b..0571f8228a 100644 --- a/dkpro-core-api-anomaly-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/anomaly/AnomalyApiConstants.java +++ b/dkpro-core-api-anomaly-asl/src/main/java/org/dkpro/core/api/anomaly/AnomalyApiConstants.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.anomaly; +package org.dkpro.core.api.anomaly; /** * Actually just an excuse to get a javadoc artifact. diff --git a/dkpro-core-api-anomaly-asl/src/main/java/org/dkpro/core/api/anomaly/package-info.java b/dkpro-core-api-anomaly-asl/src/main/java/org/dkpro/core/api/anomaly/package-info.java new file mode 100644 index 0000000000..2c2e24f03c --- /dev/null +++ b/dkpro-core-api-anomaly-asl/src/main/java/org/dkpro/core/api/anomaly/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Types for annotating spelling or grammar problems. + */ +package org.dkpro.core.api.anomaly; diff --git a/dkpro-core-api-anomaly-asl/suppressions.xml b/dkpro-core-api-anomaly-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-anomaly-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-coref-asl/pom.xml b/dkpro-core-api-coref-asl/pom.xml index d36cf994e9..5dcc65c272 100644 --- a/dkpro-core-api-coref-asl/pom.xml +++ b/dkpro-core-api-coref-asl/pom.xml @@ -1,53 +1,54 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.api.coref-asl - jar - DKPro Core ASL - Coreference API - - - org.apache.uima - uimaj-core - - - - - - false - src/main/resources - - desc/type/**/* - - - - true - src/main/resources - - desc/type/**/* - - - - - \ No newline at end of file + 4.0.0 + + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-api-coref-asl + jar + DKPro Core ASL - Coreference API + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + + + + false + src/main/resources + + desc/type/**/* + + + + true + src/main/resources + + desc/type/**/* + + + + + diff --git a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/package-info.java b/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/package-info.java deleted file mode 100644 index e8531d1303..0000000000 --- a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Types for coreference annotations. - */ -package de.tudarmstadt.ukp.dkpro.core.api.coref; \ No newline at end of file diff --git a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceChain.java b/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceChain.java index 04c5b26170..2a22af38cd 100644 --- a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceChain.java +++ b/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceChain.java @@ -1,61 +1,75 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* First created by JCasGen Sun Nov 20 19:36:17 CET 2011 */ + + + +/* Apache UIMA v3 - First created by JCasGen Sun Jan 28 11:36:00 CET 2018 */ + package de.tudarmstadt.ukp.dkpro.core.api.coref.type; +import java.lang.invoke.CallSite; +import java.lang.invoke.MethodHandle; import java.util.ArrayList; import java.util.List; -import org.apache.uima.jcas.JCas; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.TypeImpl; +import org.apache.uima.cas.impl.TypeSystemImpl; +import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.jcas.cas.TOP_Type; - import org.apache.uima.jcas.cas.AnnotationBase; /** Marks the beginning of a chain. - * Updated by JCasGen Fri Oct 31 23:38:08 CET 2014 - * XML source: /Users/bluefire/UKP/Workspaces/dkpro-juno/de.tudarmstadt.ukp.dkpro.core-asl/de.tudarmstadt.ukp.dkpro.core.api.coref-asl/src/main/resources/desc/type/coref.xml + * Updated by JCasGen Sun Jan 28 11:36:00 CET 2018 + * XML source: /Users/bluefire/git/dkpro-core/dkpro-core-api-coref-asl/src/main/resources/desc/type/coref.xml * @generated */ public class CoreferenceChain extends AnnotationBase { + + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static String _TypeName = "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain"; + /** @generated * @ordered */ + @SuppressWarnings ("hiding") public final static int typeIndexID = JCasRegistry.register(CoreferenceChain.class); /** @generated * @ordered */ + @SuppressWarnings ("hiding") public final static int type = typeIndexID; - /** @generated */ + /** @generated + * @return index of the type + */ @Override -public int getTypeIndexID() {return typeIndexID;} + public int getTypeIndexID() {return typeIndexID;} + + /* ******************* + * Feature Offsets * + * *******************/ + + public final static String _FeatName_first = "first"; + + + /* Feature Adjusted Offsets */ + private final static CallSite _FC_first = TypeSystemImpl.createCallSite(CoreferenceChain.class, "first"); + private final static MethodHandle _FH_first = _FC_first.dynamicInvoker(); + + /** Never called. Disable default constructor * @generated */ protected CoreferenceChain() {/* intentionally empty block */} /** Internal - constructor used by generator * @generated - * @param addr low level Feature Structure reference + * @param casImpl the CAS this Feature Structure belongs to * @param type the type of this Feature Structure */ - public CoreferenceChain(int addr, TOP_Type type) { - super(addr, type); + public CoreferenceChain(TypeImpl type, CASImpl casImpl) { + super(type, casImpl); readObject(); } @@ -67,14 +81,15 @@ public CoreferenceChain(JCas jcas) { readObject(); } + /** * - * Write your own initialization here - * - * + * Write your own initialization here + * + * * @generated modifiable */ - private void readObject() {} + private void readObject() {/*default - does nothing empty block */} @@ -85,27 +100,24 @@ private void readObject() {} * @generated * @return value of the feature */ - public CoreferenceLink getFirst() { - if (CoreferenceChain_Type.featOkTst && ((CoreferenceChain_Type)jcasType).casFeat_first == null) - jcasType.jcas.throwFeatMissing("first", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain"); - return (CoreferenceLink)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((CoreferenceChain_Type)jcasType).casFeatCode_first)));} + public CoreferenceLink getFirst() { return (CoreferenceLink)(_getFeatureValueNc(wrapGetIntCatchException(_FH_first)));} /** setter for first - sets This is the first corefernce link in coreference chain * @generated * @param v value to set into the feature */ public void setFirst(CoreferenceLink v) { - if (CoreferenceChain_Type.featOkTst && ((CoreferenceChain_Type)jcasType).casFeat_first == null) - jcasType.jcas.throwFeatMissing("first", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain"); - jcasType.ll_cas.ll_setRefValue(addr, ((CoreferenceChain_Type)jcasType).casFeatCode_first, jcasType.ll_cas.ll_getFSRef(v));} - public List links() { - List links = new ArrayList(); - CoreferenceLink l = getFirst(); - while (l != null) { + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_first), v); + } + + public List links() { + List links = new ArrayList(); + CoreferenceLink l = getFirst(); + while (l != null) { links.add(l); l = l.getNext(); - } - return links; + } + return links; } } diff --git a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceChain_Type.java b/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceChain_Type.java deleted file mode 100644 index ad306b33e2..0000000000 --- a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceChain_Type.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* First created by JCasGen Sun Nov 20 19:36:17 CET 2011 */ -package de.tudarmstadt.ukp.dkpro.core.api.coref.type; - -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.cas.impl.CASImpl; -import org.apache.uima.cas.impl.FSGenerator; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.cas.impl.TypeImpl; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.impl.FeatureImpl; -import org.apache.uima.cas.Feature; -import org.apache.uima.jcas.cas.AnnotationBase_Type; - -/** Marks the beginning of a chain. - * Updated by JCasGen Fri Oct 31 23:38:08 CET 2014 - * @generated */ -public class CoreferenceChain_Type extends AnnotationBase_Type { - /** @generated */ - @Override -protected FSGenerator getFSGenerator() {return fsGenerator;} - /** @generated */ - private final FSGenerator fsGenerator = - new FSGenerator() { - @Override - public FeatureStructure createFS(int addr, CASImpl cas) { - if (CoreferenceChain_Type.this.useExistingInstance) { - // Return eq fs instance if already created - FeatureStructure fs = CoreferenceChain_Type.this.jcas.getJfsFromCaddr(addr); - if (null == fs) { - fs = new CoreferenceChain(addr, CoreferenceChain_Type.this); - CoreferenceChain_Type.this.jcas.putJfsFromCaddr(addr, fs); - return fs; - } - return fs; - } else return new CoreferenceChain(addr, CoreferenceChain_Type.this); - } - }; - /** @generated */ - public final static int typeIndexID = CoreferenceChain.typeIndexID; - /** @generated - @modifiable */ - public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain"); - - /** @generated */ - final Feature casFeat_first; - /** @generated */ - final int casFeatCode_first; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getFirst(int addr) { - if (featOkTst && casFeat_first == null) - jcas.throwFeatMissing("first", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain"); - return ll_cas.ll_getRefValue(addr, casFeatCode_first); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setFirst(int addr, int v) { - if (featOkTst && casFeat_first == null) - jcas.throwFeatMissing("first", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain"); - ll_cas.ll_setRefValue(addr, casFeatCode_first, v);} - - - - - - /** initialize variables to correspond with Cas Type and Features - * @generated - * @param jcas JCas - * @param casType Type - */ - public CoreferenceChain_Type(JCas jcas, Type casType) { - super(jcas, casType); - casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); - - - casFeat_first = jcas.getRequiredFeatureDE(casType, "first", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink", featOkTst); - casFeatCode_first = (null == casFeat_first) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_first).getCode(); - - } -} - - - - \ No newline at end of file diff --git a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceLink.java b/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceLink.java index 771b5f8e9e..5fb930ab7b 100644 --- a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceLink.java +++ b/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceLink.java @@ -1,58 +1,81 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* First created by JCasGen Sun Nov 20 19:27:55 CET 2011 */ + + + +/* Apache UIMA v3 - First created by JCasGen Sun Jan 28 11:36:00 CET 2018 */ + package de.tudarmstadt.ukp.dkpro.core.api.coref.type; +import java.lang.invoke.CallSite; +import java.lang.invoke.MethodHandle; + +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.TypeImpl; +import org.apache.uima.cas.impl.TypeSystemImpl; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.jcas.cas.TOP_Type; + import org.apache.uima.jcas.tcas.Annotation; /** A link in the coreference chain. - * Updated by JCasGen Fri Oct 31 23:38:08 CET 2014 - * XML source: /Users/bluefire/UKP/Workspaces/dkpro-juno/de.tudarmstadt.ukp.dkpro.core-asl/de.tudarmstadt.ukp.dkpro.core.api.coref-asl/src/main/resources/desc/type/coref.xml + * Updated by JCasGen Sun Jan 28 11:36:00 CET 2018 + * XML source: /Users/bluefire/git/dkpro-core/dkpro-core-api-coref-asl/src/main/resources/desc/type/coref.xml * @generated */ public class CoreferenceLink extends Annotation { + + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static String _TypeName = "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"; + /** @generated * @ordered */ + @SuppressWarnings ("hiding") public final static int typeIndexID = JCasRegistry.register(CoreferenceLink.class); /** @generated * @ordered */ + @SuppressWarnings ("hiding") public final static int type = typeIndexID; - /** @generated */ + /** @generated + * @return index of the type + */ @Override -public int getTypeIndexID() {return typeIndexID;} + public int getTypeIndexID() {return typeIndexID;} + + /* ******************* + * Feature Offsets * + * *******************/ + + public final static String _FeatName_next = "next"; + public final static String _FeatName_referenceType = "referenceType"; + public final static String _FeatName_referenceRelation = "referenceRelation"; + + + /* Feature Adjusted Offsets */ + private final static CallSite _FC_next = TypeSystemImpl.createCallSite(CoreferenceLink.class, "next"); + private final static MethodHandle _FH_next = _FC_next.dynamicInvoker(); + private final static CallSite _FC_referenceType = TypeSystemImpl.createCallSite(CoreferenceLink.class, "referenceType"); + private final static MethodHandle _FH_referenceType = _FC_referenceType.dynamicInvoker(); + private final static CallSite _FC_referenceRelation = TypeSystemImpl.createCallSite(CoreferenceLink.class, "referenceRelation"); + private final static MethodHandle _FH_referenceRelation = _FC_referenceRelation.dynamicInvoker(); + + /** Never called. Disable default constructor * @generated */ protected CoreferenceLink() {/* intentionally empty block */} /** Internal - constructor used by generator * @generated - * @param addr low level Feature Structure reference + * @param casImpl the CAS this Feature Structure belongs to * @param type the type of this Feature Structure */ - public CoreferenceLink(int addr, TOP_Type type) { - super(addr, type); + public CoreferenceLink(TypeImpl type, CASImpl casImpl) { + super(type, casImpl); readObject(); } @@ -64,6 +87,7 @@ public CoreferenceLink(JCas jcas) { readObject(); } + /** @generated * @param jcas JCas to which this Feature Structure belongs * @param begin offset to the begin spot in the SofA @@ -78,12 +102,12 @@ public CoreferenceLink(JCas jcas, int begin, int end) { /** * - * Write your own initialization here - * - * + * Write your own initialization here + * + * * @generated modifiable */ - private void readObject() {} + private void readObject() {/*default - does nothing empty block */} @@ -94,19 +118,16 @@ private void readObject() {} * @generated * @return value of the feature */ - public CoreferenceLink getNext() { - if (CoreferenceLink_Type.featOkTst && ((CoreferenceLink_Type)jcasType).casFeat_next == null) - jcasType.jcas.throwFeatMissing("next", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - return (CoreferenceLink)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((CoreferenceLink_Type)jcasType).casFeatCode_next)));} + public CoreferenceLink getNext() { return (CoreferenceLink)(_getFeatureValueNc(wrapGetIntCatchException(_FH_next)));} /** setter for next - sets If there is one, it is the next coreference link to the current coreference link * @generated * @param v value to set into the feature */ public void setNext(CoreferenceLink v) { - if (CoreferenceLink_Type.featOkTst && ((CoreferenceLink_Type)jcasType).casFeat_next == null) - jcasType.jcas.throwFeatMissing("next", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - jcasType.ll_cas.ll_setRefValue(addr, ((CoreferenceLink_Type)jcasType).casFeatCode_next, jcasType.ll_cas.ll_getFSRef(v));} + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_next), v); + } + //*--------------* @@ -116,19 +137,16 @@ public void setNext(CoreferenceLink v) { * @generated * @return value of the feature */ - public String getReferenceType() { - if (CoreferenceLink_Type.featOkTst && ((CoreferenceLink_Type)jcasType).casFeat_referenceType == null) - jcasType.jcas.throwFeatMissing("referenceType", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - return jcasType.ll_cas.ll_getStringValue(addr, ((CoreferenceLink_Type)jcasType).casFeatCode_referenceType);} + public String getReferenceType() { return _getStringValueNc(wrapGetIntCatchException(_FH_referenceType));} /** setter for referenceType - sets The role or type which the covered text has in the coreference chain. * @generated * @param v value to set into the feature */ public void setReferenceType(String v) { - if (CoreferenceLink_Type.featOkTst && ((CoreferenceLink_Type)jcasType).casFeat_referenceType == null) - jcasType.jcas.throwFeatMissing("referenceType", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - jcasType.ll_cas.ll_setStringValue(addr, ((CoreferenceLink_Type)jcasType).casFeatCode_referenceType, v);} + _setStringValueNfc(wrapGetIntCatchException(_FH_referenceType), v); + } + //*--------------* @@ -138,19 +156,16 @@ public void setReferenceType(String v) { * @generated * @return value of the feature */ - public String getReferenceRelation() { - if (CoreferenceLink_Type.featOkTst && ((CoreferenceLink_Type)jcasType).casFeat_referenceRelation == null) - jcasType.jcas.throwFeatMissing("referenceRelation", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - return jcasType.ll_cas.ll_getStringValue(addr, ((CoreferenceLink_Type)jcasType).casFeatCode_referenceRelation);} + public String getReferenceRelation() { return _getStringValueNc(wrapGetIntCatchException(_FH_referenceRelation));} /** setter for referenceRelation - sets The type of relation between this link and the next link in the chain. * @generated * @param v value to set into the feature */ public void setReferenceRelation(String v) { - if (CoreferenceLink_Type.featOkTst && ((CoreferenceLink_Type)jcasType).casFeat_referenceRelation == null) - jcasType.jcas.throwFeatMissing("referenceRelation", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - jcasType.ll_cas.ll_setStringValue(addr, ((CoreferenceLink_Type)jcasType).casFeatCode_referenceRelation, v);} + _setStringValueNfc(wrapGetIntCatchException(_FH_referenceRelation), v); + } + } \ No newline at end of file diff --git a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceLink_Type.java b/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceLink_Type.java deleted file mode 100644 index 66d273dd4e..0000000000 --- a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/type/CoreferenceLink_Type.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* First created by JCasGen Sun Nov 20 19:27:55 CET 2011 */ -package de.tudarmstadt.ukp.dkpro.core.api.coref.type; - -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.cas.impl.CASImpl; -import org.apache.uima.cas.impl.FSGenerator; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.cas.impl.TypeImpl; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.impl.FeatureImpl; -import org.apache.uima.cas.Feature; -import org.apache.uima.jcas.tcas.Annotation_Type; - -/** A link in the coreference chain. - * Updated by JCasGen Fri Oct 31 23:38:08 CET 2014 - * @generated */ -public class CoreferenceLink_Type extends Annotation_Type { - /** @generated */ - @Override -protected FSGenerator getFSGenerator() {return fsGenerator;} - /** @generated */ - private final FSGenerator fsGenerator = - new FSGenerator() { - @Override - public FeatureStructure createFS(int addr, CASImpl cas) { - if (CoreferenceLink_Type.this.useExistingInstance) { - // Return eq fs instance if already created - FeatureStructure fs = CoreferenceLink_Type.this.jcas.getJfsFromCaddr(addr); - if (null == fs) { - fs = new CoreferenceLink(addr, CoreferenceLink_Type.this); - CoreferenceLink_Type.this.jcas.putJfsFromCaddr(addr, fs); - return fs; - } - return fs; - } else return new CoreferenceLink(addr, CoreferenceLink_Type.this); - } - }; - /** @generated */ - public final static int typeIndexID = CoreferenceLink.typeIndexID; - /** @generated - @modifiable */ - public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - - /** @generated */ - final Feature casFeat_next; - /** @generated */ - final int casFeatCode_next; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getNext(int addr) { - if (featOkTst && casFeat_next == null) - jcas.throwFeatMissing("next", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - return ll_cas.ll_getRefValue(addr, casFeatCode_next); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setNext(int addr, int v) { - if (featOkTst && casFeat_next == null) - jcas.throwFeatMissing("next", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - ll_cas.ll_setRefValue(addr, casFeatCode_next, v);} - - - - /** @generated */ - final Feature casFeat_referenceType; - /** @generated */ - final int casFeatCode_referenceType; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getReferenceType(int addr) { - if (featOkTst && casFeat_referenceType == null) - jcas.throwFeatMissing("referenceType", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - return ll_cas.ll_getStringValue(addr, casFeatCode_referenceType); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setReferenceType(int addr, String v) { - if (featOkTst && casFeat_referenceType == null) - jcas.throwFeatMissing("referenceType", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - ll_cas.ll_setStringValue(addr, casFeatCode_referenceType, v);} - - - - /** @generated */ - final Feature casFeat_referenceRelation; - /** @generated */ - final int casFeatCode_referenceRelation; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getReferenceRelation(int addr) { - if (featOkTst && casFeat_referenceRelation == null) - jcas.throwFeatMissing("referenceRelation", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - return ll_cas.ll_getStringValue(addr, casFeatCode_referenceRelation); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setReferenceRelation(int addr, String v) { - if (featOkTst && casFeat_referenceRelation == null) - jcas.throwFeatMissing("referenceRelation", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"); - ll_cas.ll_setStringValue(addr, casFeatCode_referenceRelation, v);} - - - - - - /** initialize variables to correspond with Cas Type and Features - * @generated - * @param jcas JCas - * @param casType Type - */ - public CoreferenceLink_Type(JCas jcas, Type casType) { - super(jcas, casType); - casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); - - - casFeat_next = jcas.getRequiredFeatureDE(casType, "next", "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink", featOkTst); - casFeatCode_next = (null == casFeat_next) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_next).getCode(); - - - casFeat_referenceType = jcas.getRequiredFeatureDE(casType, "referenceType", "uima.cas.String", featOkTst); - casFeatCode_referenceType = (null == casFeat_referenceType) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_referenceType).getCode(); - - - casFeat_referenceRelation = jcas.getRequiredFeatureDE(casType, "referenceRelation", "uima.cas.String", featOkTst); - casFeatCode_referenceRelation = (null == casFeat_referenceRelation) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_referenceRelation).getCode(); - - } -} - - - - \ No newline at end of file diff --git a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/CorefApiConstants.java b/dkpro-core-api-coref-asl/src/main/java/org/dkpro/core/api/coref/CorefApiConstants.java similarity index 93% rename from dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/CorefApiConstants.java rename to dkpro-core-api-coref-asl/src/main/java/org/dkpro/core/api/coref/CorefApiConstants.java index 0194850e79..76f840aa37 100644 --- a/dkpro-core-api-coref-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/coref/CorefApiConstants.java +++ b/dkpro-core-api-coref-asl/src/main/java/org/dkpro/core/api/coref/CorefApiConstants.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.coref; +package org.dkpro.core.api.coref; /** * Actually just an excuse to get a javadoc artifact. diff --git a/dkpro-core-api-coref-asl/src/main/java/org/dkpro/core/api/coref/package-info.java b/dkpro-core-api-coref-asl/src/main/java/org/dkpro/core/api/coref/package-info.java new file mode 100644 index 0000000000..d3a864823d --- /dev/null +++ b/dkpro-core-api-coref-asl/src/main/java/org/dkpro/core/api/coref/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Types for coreference annotations. + */ +package org.dkpro.core.api.coref; diff --git a/dkpro-core-api-coref-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map b/dkpro-core-api-coref-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map new file mode 100644 index 0000000000..4a85f6e7ea --- /dev/null +++ b/dkpro-core-api-coref-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map @@ -0,0 +1,2 @@ +de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain=http://w3id.org/meta-share/omtd-share/Coreference +de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink=http://w3id.org/meta-share/omtd-share/Coreference diff --git a/dkpro-core-api-coref-asl/suppressions.xml b/dkpro-core-api-coref-asl/suppressions.xml new file mode 100644 index 0000000000..80c10d505e --- /dev/null +++ b/dkpro-core-api-coref-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-datasets-asl/pom.xml b/dkpro-core-api-datasets-asl/pom.xml index 85aa82a848..ae24569259 100644 --- a/dkpro-core-api-datasets-asl/pom.xml +++ b/dkpro-core-api-datasets-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - DKPro Core ASL - Datasets - de.tudarmstadt.ukp.dkpro.core.api.datasets-asl + dkpro-core-api-datasets-asl jar + DKPro Core ASL - Datasets + https://dkpro.github.io/dkpro-core/ DKPro Core module for loading publicly available datasets. @@ -48,6 +49,10 @@ commons-logging commons-logging-api + + org.slf4j + slf4j-api + org.springframework spring-core @@ -69,7 +74,7 @@ com.github.junrar junrar - 0.7 + 4.0.0 org.apache.commons @@ -88,8 +93,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/ArtifactDescription.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/ArtifactDescription.java deleted file mode 100644 index c2c62a7647..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/ArtifactDescription.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; - -import java.util.List; - -public interface ArtifactDescription -{ - /** - * @return artifact name/ID - */ - String getName(); - - /** - * Instead of downloading the artifact, create a file with the given text as content. If text - * is set, all other settings are ignored. - * - * @return text content. - */ - String getText(); - - /** - * @return URL from which to obtain the artifact. - */ - String getUrl(); - - /** - * @return SHA1 hash of the artifact. - */ - String getSha1(); - - /** - * Whether this artifact is shared between multiple datasets. If this flag is enabled, the - * artifact may be stored in a special location within the cache, i.e. not under the dataset - * folder. - * - * @return shared status. - */ - boolean isShared(); - - /** - * Any post-download actions, e.g. to explode the artifact. - * - * @return list of actions. - */ - List getActions(); -} \ No newline at end of file diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetFactory.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetFactory.java deleted file mode 100644 index 35e2a7111f..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetFactory.java +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; - -import static java.util.Collections.unmodifiableList; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.Writer; -import java.net.URL; -import java.net.URLConnection; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.security.DigestInputStream; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import org.apache.commons.codec.binary.Hex; -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.FilenameUtils; -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.output.NullOutputStream; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.springframework.core.io.Resource; -import org.springframework.core.io.support.PathMatchingResourcePatternResolver; -import org.yaml.snakeyaml.TypeDescription; -import org.yaml.snakeyaml.Yaml; -import org.yaml.snakeyaml.constructor.Constructor; - -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.ActionDescriptionImpl; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.ArtifactDescriptionImpl; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.DatasetDescriptionImpl; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.LicenseDescriptionImpl; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.LoadedDataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.actions.Action_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.actions.Explode; - -public class DatasetFactory -{ - private Map datasets; - - private Map> actionRegistry; - - private final Log LOG = LogFactory.getLog(getClass()); - - private Path cacheRoot; - - { - actionRegistry = new HashMap<>(); - actionRegistry.put("explode", Explode.class); - } - - public DatasetFactory() - { - // Nothing to do - } - - public DatasetFactory(Path aCacheRoot) - { - cacheRoot = aCacheRoot; - } - - public DatasetFactory(File aCacheRoot) - { - this(aCacheRoot.toPath()); - } - - public Path getCacheRoot() - { - return cacheRoot; - } - - public List listIds() - throws IOException - { - return unmodifiableList(new ArrayList<>(registry().keySet())); - } - - public DatasetDescription getDescription(String aId) - throws IOException - { - return registry().get(aId); - } - - public Dataset load(String aId) - throws IOException - { - DatasetDescription desc = getDescription(aId); - if (desc == null) { - throw new IllegalArgumentException("Unknown dataset [" + aId + "]"); - } - materialize(desc); - return new LoadedDataset(this, desc); - } - - private Map registry() - throws IOException - { - // If no cache was set, create one and make sure to clean it up on exit - if (cacheRoot == null) { - cacheRoot = Files.createTempDirectory("dkpro-dataset-cache"); - cacheRoot.toFile().deleteOnExit(); - } - - // Load datesets only once - if (datasets == null) { - // Load the YAML descriptions - datasets = loadFromYaml(); - } - - return datasets; - } - - private Map loadFromYaml() - throws IOException - { - // Scan for locators - PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); - Resource[] locators = resolver - .getResources("classpath:META-INF/org.dkpro.core/datasets.txt"); - - // Read locators - Set patterns = new LinkedHashSet<>(); - for (Resource locator : locators) { - try (InputStream is = locator.getInputStream()) { - IOUtils.lineIterator(is, "UTF-8").forEachRemaining(l -> patterns.add(l)); - } - } - - // Scan for YAML dataset descriptions - List resources = new ArrayList<>(); - for (String pattern : patterns) { - for (Resource r : resolver.getResources(pattern)) { - resources.add(r); - } - } - - // Configure YAML deserialization - Constructor datasetConstructor = new Constructor(DatasetDescriptionImpl.class); - TypeDescription datasetDesc = new TypeDescription(DatasetDescriptionImpl.class); - datasetDesc.putMapPropertyType("artifacts", String.class, ArtifactDescriptionImpl.class); - datasetDesc.putListPropertyType("licenses", LicenseDescriptionImpl.class); - datasetConstructor.addTypeDescription(datasetDesc); - TypeDescription artifactDesc = new TypeDescription(ArtifactDescriptionImpl.class); - artifactDesc.putListPropertyType("actions", ActionDescriptionImpl.class); - datasetConstructor.addTypeDescription(artifactDesc); - Yaml yaml = new Yaml(datasetConstructor); - - // Ensure that there is a fixed order (at least if toString is correctly implemented) - Collections.sort(resources, (a, b) -> { - return a.toString().compareTo(b.toString()); - }); - - // Load the YAML descriptions - Map sets = new LinkedHashMap<>(); - for (Resource res : resources) { - LOG.debug("Loading [" + res + "]"); - try (InputStream is = res.getInputStream()) { - String id = FilenameUtils.getBaseName(res.getFilename()); - DatasetDescriptionImpl ds = yaml.loadAs(is, DatasetDescriptionImpl.class); - ds.setId(id); - ds.setOwner(this); - - // Inject artifact names into artifacts - for (Entry e : ds.getArtifacts().entrySet()) { - ((ArtifactDescriptionImpl) e.getValue()).setName(e.getKey()); - } - - sets.put(ds.getId(), ds); - } - } - - return sets; - } - - public Path resolve(DatasetDescription aDataset) - { - return cacheRoot.resolve(aDataset.getId()); - } - - /** - * Get the cache location for the given artifact. - */ - private Path resolve(DatasetDescription aDataset, ArtifactDescription aArtifact) - { - if (aArtifact.isShared()) { - // Shared artifacts stored in a folder named by their SHA1 - return cacheRoot.resolve("shared").resolve(aArtifact.getSha1()) - .resolve(aArtifact.getName()); - } - else { - // Unshared artifacts are stored in the dataset folder - return resolve(aDataset).resolve(aArtifact.getName()); - } - } - - /** - * Verify/download/update artifact in cache. Execute post-download actions. - */ - private void materialize(DatasetDescription aDataset) - throws IOException - { - Path root = resolve(aDataset); - Collection artifacts = aDataset.getArtifacts().values(); - - // First validate if local copies are still up-to-date - boolean reload = false; - packageValidationLoop: for (ArtifactDescription artifact : artifacts) { - Path cachedFile = resolve(aDataset, artifact); - if (!Files.exists(cachedFile)) { - continue; - } - - if (artifact.getSha1() != null) { - String actual = getDigest(cachedFile, "SHA1"); - if (!artifact.getSha1().equals(actual)) { - LOG.info("Local SHA1 hash mismatch on [" + cachedFile + "] - expected [" - + artifact.getSha1() + "] - actual [" + actual + "]"); - reload = true; - break packageValidationLoop; - } - else { - LOG.info("Local SHA1 hash verified on [" + cachedFile + "] - [" + actual + "]"); - } - } - } - - // If any of the packages are outdated, clear the cache and download again - if (reload) { - LOG.info("Clearing local cache for [" + root + "]"); - FileUtils.deleteQuietly(root.toFile()); - } - - for (ArtifactDescription artifact : artifacts) { - Path cachedFile = resolve(aDataset, artifact); - - if (Files.exists(cachedFile)) { - continue; - } - - - if (artifact.getText() != null) { - Files.createDirectories(cachedFile.getParent()); - - LOG.info("Creating [" + cachedFile + "]"); - try (Writer out = Files.newBufferedWriter(cachedFile, StandardCharsets.UTF_8)) { - out.write(artifact.getText()); - } - } - - if (artifact.getUrl() != null) { - Files.createDirectories(cachedFile.getParent()); - - MessageDigest sha1; - try { - sha1 = MessageDigest.getInstance("SHA1"); - } - catch (NoSuchAlgorithmException e) { - throw new IOException(e); - } - - URL source = new URL(artifact.getUrl()); - - LOG.info("Fetching [" + cachedFile + "]"); - - URLConnection connection = source.openConnection(); - connection.setRequestProperty("User-Agent", "Java"); - - try (InputStream is = connection.getInputStream()) { - DigestInputStream sha1Filter = new DigestInputStream(is, sha1); - Files.copy(sha1Filter, cachedFile); - - if (artifact.getSha1() != null) { - String sha1Hex = new String( - Hex.encodeHex(sha1Filter.getMessageDigest().digest())); - if (!artifact.getSha1().equals(sha1Hex)) { - String message = "SHA1 mismatch. Expected [" + artifact.getSha1() - + "] but got [" + sha1Hex + "]."; - LOG.error(message); - throw new IOException(message); - } - } - } - } - } - - // Perform a post-fetch action such as unpacking - Path postActionCompleteMarker = resolve(aDataset).resolve(".postComplete"); - if (!Files.exists(postActionCompleteMarker)) { - for (ArtifactDescription artifact : artifacts) { - Path cachedFile = resolve(aDataset, artifact); - - List actions = artifact.getActions(); - if (actions != null && !actions.isEmpty()) { - try { - for (ActionDescription action : actions) { - LOG.info("Post-download action [" + action.getAction() + "]"); - Class implClass = actionRegistry - .get(action.getAction()); - - if (implClass == null) { - throw new IllegalStateException( - "Unknown or unsupported action [" + action.getAction() + "]"); - } - - Action_ImplBase impl = implClass.newInstance(); - impl.apply(action, aDataset, artifact, cachedFile); - } - } - catch (IllegalStateException e) { - throw e; - } - catch (IOException e) { - throw e; - } - catch (Exception e) { - throw new IllegalStateException(e); - } - } - } - Files.createFile(postActionCompleteMarker); - } - } - - private String getDigest(Path aFile, String aDigest) throws IOException - { - MessageDigest digest; - try { - digest = MessageDigest.getInstance(aDigest); - } - catch (NoSuchAlgorithmException e) { - throw new IOException(e); - } - try (InputStream is = Files.newInputStream(aFile)) { - DigestInputStream digestFilter = new DigestInputStream(is, digest); - IOUtils.copy(digestFilter, new NullOutputStream()); - return new String(Hex.encodeHex(digestFilter.getMessageDigest().digest())); - } - } -} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java deleted file mode 100644 index 84dc6b6a5f..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal; - -import java.util.List; - -import de.tudarmstadt.ukp.dkpro.core.api.datasets.ActionDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.ArtifactDescription; - -public class ArtifactDescriptionImpl - implements ArtifactDescription -{ - private String name; - private String text; - private String url; - private String sha1; - private boolean shared; - private List actions; - - @Override - public String getName() - { - return name; - } - - public void setName(String aName) - { - name = aName; - } - - @Override - public String getText() - { - return text; - } - - public void setText(String aText) - { - text = aText; - } - - @Override - public String getUrl() - { - return url; - } - - public void setUrl(String aUrl) - { - url = aUrl; - } - - @Override - public String getSha1() - { - return sha1; - } - - public void setSha1(String aSha1) - { - sha1 = aSha1; - } - - @Override - public List getActions() - { - return actions; - } - - public void setActions(List aActions) - { - actions = aActions; - } - - @Override - public boolean isShared() - { - return shared; - } - - public void setShared(boolean aShared) - { - shared = aShared; - } -} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java b/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java deleted file mode 100644 index 0451ef38ab..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Explode.java +++ /dev/null @@ -1,305 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.actions; - -import static java.util.Arrays.asList; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.List; -import java.util.Locale; -import java.util.Map; - -import org.apache.commons.compress.archivers.ArchiveEntry; -import org.apache.commons.compress.archivers.ArchiveInputStream; -import org.apache.commons.compress.archivers.ArchiveStreamFactory; -import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry; -import org.apache.commons.compress.archivers.sevenz.SevenZFile; -import org.apache.commons.compress.compressors.CompressorException; -import org.apache.commons.compress.compressors.CompressorStreamFactory; -import org.apache.commons.io.FilenameUtils; -import org.apache.commons.io.IOUtils; - -import com.github.junrar.Archive; -import com.github.junrar.exception.RarException; -import com.github.junrar.impl.FileVolumeManager; -import com.github.junrar.rarfile.FileHeader; - -import de.tudarmstadt.ukp.dkpro.core.api.datasets.ActionDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.ArtifactDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.DatasetDescriptionImpl; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.util.AntFileFilter; - -public class Explode - extends Action_ImplBase -{ - @Override - public void apply(ActionDescription aAction, DatasetDescription aDataset, - ArtifactDescription aPack, Path aCachedFile) - throws Exception - { - DatasetDescriptionImpl dsi = (DatasetDescriptionImpl) aDataset; - - Map cfg = aAction.getConfiguration(); - // Sometimes, we have to explode a file that was created as the result of exploding the - // main artifact. Thus, we can override the target - Path targetFile = cfg.containsKey("file") - ? dsi.getOwner().resolve(dsi).resolve((String) cfg.get("file")) : aCachedFile; - - // Apache Commons Compress does not handle RAR files, so we handle them separately - if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".rar")) { - extractRar(aAction, targetFile, dsi.getOwner().resolve(dsi)); - } - if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".7z")) { - // 7z does not support streaming in Apache Commons Compress - extract7z(aAction, targetFile, dsi.getOwner().resolve(dsi)); - } - else { - // Auto-detect the archive format using Apache Commons Compress - try (InputStream is = new BufferedInputStream(Files.newInputStream(targetFile))) { - InputStream uncompressed; - - try { - uncompressed = new BufferedInputStream( - new CompressorStreamFactory().createCompressorInputStream(is)); - } - catch (CompressorException e) { - // If the compressor is not detected, we may be dealing with an archive format that - // compresses internally, e.g. ZIP. - uncompressed = is; - } - - ArchiveInputStream archive = new ArchiveStreamFactory() - .createArchiveInputStream(uncompressed); - extract(aAction, targetFile, archive, dsi.getOwner().resolve(dsi)); - } - } - } - - private void extract7z(ActionDescription aAction, Path aCachedFile, Path aTarget) - throws IOException, RarException - { - // We always extract archives into a subfolder. Figure out the name of the folder. - String base = getBase(aCachedFile.getFileName().toString()); - - Map cfg = aAction.getConfiguration(); - int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; - - AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), - coerceToList(cfg.get("excludes"))); - - try (SevenZFile archive = new SevenZFile(aCachedFile.toFile())) { - SevenZArchiveEntry entry = archive.getNextEntry(); - while (entry != null) { - String name = stripLeadingFolders(entry.getName(), strip); - - if (name == null) { - // Stripped to null - nothing left to extract - continue; - continue; - } - - if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name); - if (entry.isDirectory()) { - Files.createDirectories(out); - } - else { - Files.createDirectories(out.getParent()); - try (OutputStream os = Files.newOutputStream(out)) { - InputStream is = new SevenZEntryInputStream(archive, entry); - IOUtils.copyLarge(is, os); - } - } - } - - entry = archive.getNextEntry(); - } - } - } - - private void extractRar(ActionDescription aAction, Path aCachedFile, Path aTarget) - throws IOException, RarException - { - // We always extract archives into a subfolder. Figure out the name of the folder. - String base = getBase(aCachedFile.getFileName().toString()); - - Map cfg = aAction.getConfiguration(); - int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; - - AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), - coerceToList(cfg.get("excludes"))); - - try (Archive archive = new Archive(new FileVolumeManager(aCachedFile.toFile()))) { - FileHeader fh = archive.nextFileHeader(); - while (fh != null) { - String name = stripLeadingFolders(fh.getFileNameString(), strip); - - if (name == null) { - // Stripped to null - nothing left to extract - continue; - continue; - } - - if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name); - if (fh.isDirectory()) { - Files.createDirectories(out); - } - else { - Files.createDirectories(out.getParent()); - try (OutputStream os = Files.newOutputStream(out)) { - archive.extractFile(fh, os); - } - } - } - - - fh = archive.nextFileHeader(); - } - } - } - - private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStream aAStream, - Path aTarget) - throws IOException - { - // We always extract archives into a subfolder. Figure out the name of the folder. - String base = getBase(aArchive.getFileName().toString()); - - Map cfg = aAction.getConfiguration(); - int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; - - AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), - coerceToList(cfg.get("excludes"))); - - ArchiveEntry entry = null; - while ((entry = aAStream.getNextEntry()) != null) { - String name = stripLeadingFolders(entry.getName(), strip); - - if (name == null) { - // Stripped to null - nothing left to extract - continue; - continue; - } - - if (filter.accept(name)) { - Path out = aTarget.resolve(base).resolve(name); - if (entry.isDirectory()) { - Files.createDirectories(out); - } - else { - Files.createDirectories(out.getParent()); - Files.copy(aAStream, out); - } - } - } - } - - private String stripLeadingFolders(String aName, int aLevels) - { - if (aLevels > 0) { - Path p = Paths.get(aName); - if (p.getNameCount() <= aLevels) { - return null; - } - else { - p = p.subpath(aLevels, p.getNameCount()); - aName = p.toString(); - return aName; - } - } - else { - return aName; - } - } - - public static String getBase(String aFilename) - { - // We always extract archives into a subfolder. Figure out the name of the folder. - String base = aFilename; - while (base.contains(".")) { - base = FilenameUtils.removeExtension(base); - } - return base; - } - - @SuppressWarnings("unchecked") - public static List coerceToList(Object aRaw) - { - List cooked; - if (aRaw == null) { - return null; - } - else if (aRaw instanceof String) { - cooked = asList((String) aRaw); - } - else if (aRaw instanceof List) { - cooked = (List) aRaw; - } - else { - throw new IllegalArgumentException("Cannot coerce to String list: [" + aRaw + "]"); - } - return cooked; - } - - private static class SevenZEntryInputStream - extends InputStream - { - private SevenZFile archive; - private SevenZArchiveEntry entry; - private int totalRead; - - public SevenZEntryInputStream(SevenZFile aArchive, SevenZArchiveEntry aEnty) - { - archive = aArchive; - entry = aEnty; - } - - @Override - public int read() - throws IOException - { - if (totalRead < entry.getSize()) { - totalRead++; - return archive.read(); - } - else { - return -1; - } - } - - @Override - public int read(byte[] aB, int aOff, int aLen) - throws IOException - { - if (totalRead < entry.getSize()) { - int blocksize = (int) Math.min(aLen, entry.getSize() - totalRead); - int read = archive.read(aB, aOff, blocksize); - totalRead += read; - return read; - } - else { - return -1; - } - } - } -} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/ActionDescription.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ActionDescription.java similarity index 93% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/ActionDescription.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ActionDescription.java index 7368c14140..589ddc1677 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/ActionDescription.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ActionDescription.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; import java.util.Map; @@ -24,5 +24,4 @@ public interface ActionDescription String getAction(); Map getConfiguration(); - -} \ No newline at end of file +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ArtifactDescription.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ArtifactDescription.java new file mode 100644 index 0000000000..48b6390dcd --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/ArtifactDescription.java @@ -0,0 +1,85 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.datasets; + +import java.util.List; + +public interface ArtifactDescription +{ + /** + * @return the dataset to which this artifact belongs. + */ + DatasetDescription getDataset(); + + /** + * @return artifact name/ID + */ + String getName(); + + /** + * Instead of downloading the artifact, create a file with the given text as content. If text is + * set, all other settings are ignored. + * + * @return text content. + */ + String getText(); + + /** + * @return URL from which to obtain the artifact. + */ + String getUrl(); + + /** + * @return SHA1 hash of the artifact. + */ + String getSha1(); + + /** + * @return SHA512 hash of the artifact. + */ + String getSha512(); + + /** + * @return the verification mode. + */ + VerificationMode getVerificationMode(); + + /** + * Whether this artifact is shared between multiple datasets. If this flag is enabled, the + * artifact may be stored in a special location within the cache, i.e. not under the dataset + * folder. + * + * @return shared status. + */ + boolean isShared(); + + /** + * Any post-download actions, e.g. to explode the artifact. + * + * @return list of actions. + */ + List getActions(); + + /** + * Whether this artifact is optional. If an optional artifact cannot be located or downloaded + * (e.g. due to a network problem), then the rest materializes still. + * + * @return optional status. + */ + boolean isOptional(); +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DataPackage.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DataPackage.java similarity index 98% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DataPackage.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DataPackage.java index edd911ebd8..0d48c0b05b 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DataPackage.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DataPackage.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; @Deprecated public class DataPackage diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/Dataset.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/Dataset.java similarity index 94% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/Dataset.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/Dataset.java index c3a5596da1..dab3ff0483 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/Dataset.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/Dataset.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; import java.io.File; import java.util.Arrays; @@ -23,8 +23,7 @@ import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; - -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.SplitImpl; +import org.dkpro.core.api.datasets.internal.SplitImpl; public interface Dataset { diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetDescription.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetDescription.java similarity index 94% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetDescription.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetDescription.java index a5df41c77c..92d899e431 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetDescription.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetDescription.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; import java.util.List; import java.util.Map; @@ -30,10 +30,10 @@ public interface DatasetDescription String getId(); String getLanguage(); - + String getEncoding(); Map> getRoles(); Map getArtifacts(); -} \ No newline at end of file +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java new file mode 100644 index 0000000000..c3a3d99329 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetFactory.java @@ -0,0 +1,508 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.datasets; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Collections.unmodifiableList; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.Writer; +import java.net.URLConnection; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.DigestInputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.output.NullOutputStream; +import org.apache.commons.lang3.StringUtils; +import org.dkpro.core.api.datasets.internal.ActionDescriptionImpl; +import org.dkpro.core.api.datasets.internal.ArtifactDescriptionImpl; +import org.dkpro.core.api.datasets.internal.DatasetDescriptionImpl; +import org.dkpro.core.api.datasets.internal.LicenseDescriptionImpl; +import org.dkpro.core.api.datasets.internal.LoadedDataset; +import org.dkpro.core.api.datasets.internal.actions.Action_ImplBase; +import org.dkpro.core.api.datasets.internal.actions.Explode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.core.io.DefaultResourceLoader; +import org.springframework.core.io.Resource; +import org.springframework.core.io.ResourceLoader; +import org.springframework.core.io.support.PathMatchingResourcePatternResolver; +import org.yaml.snakeyaml.TypeDescription; +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.constructor.Constructor; + +public class DatasetFactory +{ + public static final String PROP_DATASET_VERIFICATION_POLICY = "dkpro.dataset.verification.policy"; + + private static final DatasetValidationPolicy defaultVerificationPolicy = DatasetValidationPolicy + .valueOf(System.getProperty(PROP_DATASET_VERIFICATION_POLICY, + DatasetValidationPolicy.STRICT.name())); + + private Map datasets; + + private final Map> actionRegistry; + + private final Logger log = LoggerFactory.getLogger(getClass()); + + private Path cacheRoot; + + private ClassLoader classLoader; + + { + actionRegistry = new HashMap<>(); + actionRegistry.put("explode", Explode.class); + } + + public DatasetFactory() + { + // Nothing to do + } + + public DatasetFactory(File aCacheRoot) + { + this(aCacheRoot.toPath()); + } + + public DatasetFactory(Path aCacheRoot) + { + this(aCacheRoot, null); + } + + public DatasetFactory(Path aCacheRoot, ClassLoader aClassLoader) + { + cacheRoot = aCacheRoot; + classLoader = aClassLoader; + } + + public Path getCacheRoot() + { + return cacheRoot; + } + + public List listIds() + throws IOException + { + return unmodifiableList(new ArrayList<>(registry().keySet())); + } + + public DatasetDescription getDescription(String aId) + throws IOException + { + return registry().get(aId); + } + + public Dataset load(String aId) + throws IOException + { + return load(aId, defaultVerificationPolicy); + } + + public Dataset load(String aId, DatasetValidationPolicy aPolicy) + throws IOException + { + DatasetDescription desc = getDescription(aId); + if (desc == null) { + throw new IllegalArgumentException("Unknown dataset [" + aId + "]"); + } + materialize(desc, aPolicy); + return new LoadedDataset(this, desc); + } + + private Map registry() + throws IOException + { + // If no cache was set, create one and make sure to clean it up on exit + if (cacheRoot == null) { + cacheRoot = Files.createTempDirectory("dkpro-dataset-cache"); + cacheRoot.toFile().deleteOnExit(); + } + + // Load datesets only once + if (datasets == null) { + // Load the YAML descriptions + datasets = loadFromYaml(); + } + + return datasets; + } + + private Map loadFromYaml() + throws IOException + { + // Scan for locators + PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); + Resource[] locators = resolver + .getResources("classpath:META-INF/org.dkpro.core/datasets.txt"); + + // Read locators + Set patterns = new LinkedHashSet<>(); + for (Resource locator : locators) { + try (InputStream is = locator.getInputStream()) { + IOUtils.lineIterator(is, "UTF-8").forEachRemaining(l -> patterns.add(l)); + } + } + + // Scan for YAML dataset descriptions + List resources = new ArrayList<>(); + for (String pattern : patterns) { + for (Resource r : resolver.getResources(pattern)) { + resources.add(r); + } + } + + // Configure YAML deserialization + Constructor datasetConstructor = new Constructor(DatasetDescriptionImpl.class); + TypeDescription datasetDesc = new TypeDescription(DatasetDescriptionImpl.class); + datasetDesc.putMapPropertyType("artifacts", String.class, ArtifactDescriptionImpl.class); + datasetDesc.putListPropertyType("licenses", LicenseDescriptionImpl.class); + datasetConstructor.addTypeDescription(datasetDesc); + TypeDescription artifactDesc = new TypeDescription(ArtifactDescriptionImpl.class); + artifactDesc.putListPropertyType("actions", ActionDescriptionImpl.class); + datasetConstructor.addTypeDescription(artifactDesc); + Yaml yaml = new Yaml(datasetConstructor); + + // Ensure that there is a fixed order (at least if toString is correctly implemented) + Collections.sort(resources, (a, b) -> { + return a.toString().compareTo(b.toString()); + }); + + // Load the YAML descriptions + Map sets = new LinkedHashMap<>(); + for (Resource res : resources) { + log.debug("Loading [{}]", res); + try (InputStream is = res.getInputStream()) { + String id = FilenameUtils.getBaseName(res.getFilename()); + DatasetDescriptionImpl ds = yaml.loadAs(is, DatasetDescriptionImpl.class); + ds.setId(id); + ds.setOwner(this); + + // Inject artifact names into artifacts + for (Entry e : ds.getArtifacts().entrySet()) { + ((ArtifactDescriptionImpl) e.getValue()).setName(e.getKey()); + ((ArtifactDescriptionImpl) e.getValue()).setDataset(ds); + } + + sets.put(ds.getId(), ds); + } + } + + log.debug("Loaded [{}] dataset description", sets.size()); + + + return sets; + } + + public Path resolve(DatasetDescription aDataset) + { + return cacheRoot.resolve(aDataset.getId()); + } + + /** + * Get the cache location for the given artifact. + */ + private Path resolve(DatasetDescription aDataset, ArtifactDescription aArtifact) + { + if (aArtifact.isShared()) { + // Shared artifacts stored in a folder named by their hash + // Prefere SHA1 for the time being to avoid users having to re-download too much as + // we slowly switch over to SHA512 + if (aArtifact.getSha1() != null) { + return cacheRoot.resolve("shared").resolve(aArtifact.getSha1()) + .resolve(aArtifact.getName()); + } + else { + return cacheRoot.resolve("shared").resolve(aArtifact.getSha512()) + .resolve(aArtifact.getName()); + } + } + else { + // Unshared artifacts are stored in the dataset folder + return resolve(aDataset).resolve(aArtifact.getName()); + } + } + + /** + * Verify/download/update artifact in cache. Execute post-download actions. + */ + private void materialize(DatasetDescription aDataset, DatasetValidationPolicy aPolicy) + throws IOException + { + Path root = resolve(aDataset); + Collection artifacts = aDataset.getArtifacts().values(); + + // First validate if local copies are still up-to-date + boolean reload = false; + packageValidationLoop: for (ArtifactDescription artifact : artifacts) { + Path cachedFile = resolve(aDataset, artifact); + if (!Files.exists(cachedFile)) { + continue; + } + + if (artifact.getUrl() != null) { + boolean verificationOk = checkDigest(cachedFile, artifact); + if (!verificationOk) { + reload = true; + break packageValidationLoop; + } + } + } + + // If any of the packages are outdated, clear the cache and download again + if (reload) { + if (!DatasetValidationPolicy.DESPERATE.equals(aPolicy)) { + log.info("Clearing local cache for [{}]", root); + FileUtils.deleteQuietly(root.toFile()); + } + else { + log.info("DESPERATE policy in effect. Not clearing local cache for [{}]", root); + } + } + + for (ArtifactDescription artifact : artifacts) { + if (artifact.getText() != null) { + materializeEmbeddedText(artifact); + } + else if (artifact.getUrl() != null) { + try { + materializeRemoteFileArtifact(artifact, aPolicy); + } + catch (Exception e) { + if (artifact.isOptional()) { + if (log.isDebugEnabled()) { + log.warn("Skipping optional artifact [{}]", + artifact.getName(), e); + } + else { + log.warn("Skipping optional artifact [{}]: {}", + artifact.getName(), e.getMessage()); + } + } + else { + throw e; + } + } + } + } + + // Perform a post-fetch action such as unpacking + Path postActionCompleteMarker = resolve(aDataset).resolve(".postComplete"); + if (!Files.exists(postActionCompleteMarker)) { + for (ArtifactDescription artifact : artifacts) { + Path cachedFile = resolve(aDataset, artifact); + + List actions = artifact.getActions(); + if (actions != null && !actions.isEmpty()) { + try { + for (ActionDescription action : actions) { + log.info("Post-download action [{}]", action.getAction()); + Class implClass = actionRegistry + .get(action.getAction()); + + if (implClass == null) { + throw new IllegalStateException( + "Unknown or unsupported action [" + action.getAction() + "]"); + } + + Action_ImplBase impl = implClass.newInstance(); + impl.apply(action, aDataset, artifact, cachedFile); + } + } + catch (IllegalStateException | IOException e) { + throw e; + } + catch (Exception e) { + throw new IllegalStateException(e); + } + } + } + Files.createFile(postActionCompleteMarker); + } + } + + private void materializeRemoteFileArtifact(ArtifactDescription artifact, + DatasetValidationPolicy aPolicy) + throws IOException + { + Path cachedFile = resolve(artifact.getDataset(), artifact); + + if (Files.exists(cachedFile)) { + return; + } + + Files.createDirectories(cachedFile.getParent()); + + ResourceLoader resourceLoader = new DefaultResourceLoader(classLoader); + Resource res = resourceLoader.getResource(artifact.getUrl()); + if (!res.exists()) { + throw new FileNotFoundException("Resource not found at [" + artifact.getUrl() + "]"); + } + + if (log.isDebugEnabled()) { + log.debug("Fetching [{}] from [{}]", cachedFile, artifact.getUrl()); + } + else { + log.info("Fetching [{}]", cachedFile); + } + + URLConnection connection = res.getURL().openConnection(); + connection.setRequestProperty("User-Agent", "Java"); + + try (InputStream is = connection.getInputStream()) { + Files.copy(is, cachedFile); + } + + boolean verificationOk = checkDigest(cachedFile, artifact); + if (!verificationOk) { + switch (aPolicy) { + case STRICT: + throw new IOException("Checksum verification failed on [" + cachedFile + + "] STRICT policy in effect. Bailing out."); + case CONTINUE: + log.warn( + "Checksum verification failed on [{}] CONTINUE policy in effect. Ignoring mismatch.", + cachedFile); + break; + case DESPERATE: + log.warn( + "Checksum verification failed on [{}] DESPERATE policy in effect. Ignoring mismatch.", + cachedFile); + break; + default: + throw new IllegalArgumentException("Unknown policy: " + aPolicy); + } + } + } + + private void materializeEmbeddedText(ArtifactDescription artifact) throws IOException + { + Path cachedFile = resolve(artifact.getDataset(), artifact); + + // Check if file on disk corresponds to text stored in artifact description + if (Files.exists(cachedFile)) { + String text = FileUtils.readFileToString(cachedFile.toFile(), UTF_8); + text = StringUtils.normalizeSpace(text); + if (StringUtils.normalizeSpace(artifact.getText()).equals(text)) { + return; + } + } + + Files.createDirectories(cachedFile.getParent()); + + log.info("Creating [{}]", cachedFile); + try (Writer out = Files.newBufferedWriter(cachedFile, StandardCharsets.UTF_8)) { + out.write(artifact.getText()); + } + } + + private InputStream getDigestInputStream(Path aFile, ArtifactDescription aArtifact) + throws IOException + { + switch (aArtifact.getVerificationMode()) { + case BINARY: + return Files.newInputStream(aFile); + case TEXT: + String text = FileUtils.readFileToString(aFile.toFile(), UTF_8); + text = StringUtils.normalizeSpace(text); + return IOUtils.toInputStream(text, UTF_8); + default: + throw new IllegalArgumentException( + "Unknown verification mode [" + aArtifact.getVerificationMode() + "]"); + } + } + + private boolean checkDigest(Path aFile, ArtifactDescription aArtifact) throws IOException + { + MessageDigest sha1; + MessageDigest sha512; + try { + sha1 = MessageDigest.getInstance("SHA-1"); + sha512 = MessageDigest.getInstance("SHA-512"); + } + catch (NoSuchAlgorithmException e) { + throw new IOException(e); + } + + try (InputStream is = getDigestInputStream(aFile, aArtifact)) { + DigestInputStream sha1Filter = new DigestInputStream(is, sha1); + DigestInputStream sha512Filter = new DigestInputStream(sha1Filter, sha512); + IOUtils.copy(sha512Filter, new NullOutputStream()); + String sha1Hash = new String(Hex.encodeHex(sha1Filter.getMessageDigest().digest())); + String sha512Hash = new String(Hex.encodeHex(sha512Filter.getMessageDigest().digest())); + + if (aArtifact.getSha1() != null) { + if (!sha1Hash.equals(aArtifact.getSha1())) { + log.info( + "Local SHA1 hash mismatch for artifact [{}] in dataset [{}] - expected [{}] - actual [{}] (mode: {})", + aArtifact.getName(), aArtifact.getDataset().getId(), + aArtifact.getSha1(), sha1Hash, aArtifact.getVerificationMode()); + return false; + } + else if (aArtifact.getSha512() == null) { + log.info( + "Local SHA1 hash verified for artifact [{}] in dataset [{}] (mode: {})", + aArtifact.getName(), aArtifact.getDataset().getId(), + aArtifact.getVerificationMode()); + } + } + + if (aArtifact.getSha512() != null) { + if (!sha512Hash.equals(aArtifact.getSha512())) { + log.info( + "Local SHA512 hash mismatch for artifact [{}] in dataset [{}] - expected [{}] - actual [{}] (mode: {})", + aArtifact.getName(), aArtifact.getDataset().getId(), + aArtifact.getSha512(), sha512Hash, aArtifact.getVerificationMode()); + return false; + } + else { + log.info( + "Local SHA512 hash verified for artifact [{}] in dataset [{}] (mode: {})", + aArtifact.getName(), aArtifact.getDataset().getId(), + aArtifact.getVerificationMode()); + } + } + else { + log.info( + "No SHA512 hash for artifact [{}] in dataset [{}] - it is recommended to add it: [{}]", + aArtifact.getName(), aArtifact.getDataset().getId(), sha512Hash); + } + + return true; + } + } +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoader.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetLoader.java similarity index 89% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoader.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetLoader.java index 31aa0eb894..bf7fb9d6ca 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoader.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetLoader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; import java.io.BufferedInputStream; import java.io.File; @@ -24,6 +24,7 @@ import java.io.InputStream; import java.net.URL; import java.net.URLConnection; +import java.nio.file.Path; import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -41,14 +42,13 @@ import org.apache.commons.io.output.NullOutputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; - -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.ud.UDDataset; +import org.dkpro.core.api.datasets.internal.ud.UDDataset; @Deprecated public class DatasetLoader { private final Log LOG = LogFactory.getLog(getClass()); - + private File cacheRoot; public DatasetLoader() @@ -70,19 +70,18 @@ public File getCacheRoot() return cacheRoot; } - public List loadUniversalDependencyTreebankV1_3() - throws IOException + public List loadUniversalDependencyTreebankV1_3() throws IOException { File dataDir = new File(cacheRoot, "ud-treebanks-v1.3"); DataPackage data = new DataPackage.Builder() .url("https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/" + "1-1699/ud-treebanks-v1.3.tgz?sequence=1&isAllowed=y") - .sha1("44367112880cf0af3f293cb3f0cc6ce50c0e65c0") - .target("ud-treebanks-v1.3.tgz") - .postAction((d) -> { untgz(new File(dataDir, d.getTarget()), dataDir); }) - .build(); - + .sha1("44367112880cf0af3f293cb3f0cc6ce50c0e65c0").target("ud-treebanks-v1.3.tgz") + .postAction((d) -> { + untgz(new File(dataDir, d.getTarget()), dataDir); + }).build(); + fetch(dataDir, data); List sets = new ArrayList<>(); @@ -91,9 +90,8 @@ public List loadUniversalDependencyTreebankV1_3() } return sets; } - - private void fetch(File aTarget, DataPackage... aPackages) - throws IOException + + private void fetch(File aTarget, DataPackage... aPackages) throws IOException { // First validate if local copies are still up-to-date boolean reload = false; @@ -102,7 +100,7 @@ private void fetch(File aTarget, DataPackage... aPackages) if (!cachedFile.exists()) { continue; } - + if (pack.getSha1() != null) { String actual = getDigest(cachedFile, "SHA1"); if (!pack.getSha1().equals(actual)) { @@ -115,7 +113,7 @@ private void fetch(File aTarget, DataPackage... aPackages) LOG.info("Local SHA1 hash verified on [" + cachedFile + "] - [" + actual + "]"); } } - + if (pack.getMd5() != null) { String actual = getDigest(cachedFile, "MD5"); if (!pack.getMd5().equals(actual)) { @@ -128,23 +126,22 @@ private void fetch(File aTarget, DataPackage... aPackages) LOG.info("Local MD5 hash verified on [" + cachedFile + "] - [" + actual + "]"); } } - + } - + // If any of the packages are outdated, clear the cache and download again if (reload) { LOG.info("Clearing local cache for [" + aTarget + "]"); FileUtils.deleteQuietly(aTarget); } - + for (DataPackage pack : aPackages) { File cachedFile = new File(aTarget, pack.getTarget()); - + if (cachedFile.exists()) { continue; } - MessageDigest md5; try { md5 = MessageDigest.getInstance("MD5"); @@ -165,10 +162,10 @@ private void fetch(File aTarget, DataPackage... aPackages) URL source = new URL(pack.getUrl()); LOG.info("Fetching [" + cachedFile + "]"); - + URLConnection connection = source.openConnection(); connection.setRequestProperty("User-Agent", "Java"); - + try (InputStream is = connection.getInputStream()) { DigestInputStream md5Filter = new DigestInputStream(is, md5); DigestInputStream sha1Filter = new DigestInputStream(md5Filter, sha1); @@ -184,7 +181,7 @@ private void fetch(File aTarget, DataPackage... aPackages) throw new IOException(message); } } - + if (pack.getSha1() != null) { String sha1Hex = new String( Hex.encodeHex(sha1Filter.getMessageDigest().digest())); @@ -197,11 +194,11 @@ private void fetch(File aTarget, DataPackage... aPackages) } } } - + // Perform a post-fetch action such as unpacking for (DataPackage pack : aPackages) { File cachedFile = new File(aTarget, pack.getTarget()); - File postActionCompleteMarker = new File(cachedFile.getPath()+".postComplete"); + File postActionCompleteMarker = new File(cachedFile.getPath() + ".postComplete"); if (pack.getPostAction() != null && !postActionCompleteMarker.exists()) { try { pack.getPostAction().run(pack); @@ -211,12 +208,12 @@ private void fetch(File aTarget, DataPackage... aPackages) throw e; } catch (Exception e) { - throw new IllegalStateException(e); + throw new IllegalStateException(e); } } } } - + private String getDigest(File aFile, String aDigest) throws IOException { MessageDigest digest; @@ -232,34 +229,41 @@ private String getDigest(File aFile, String aDigest) throws IOException return new String(Hex.encodeHex(digestFilter.getMessageDigest().digest())); } } - - private void untgz(File aArchive, File aTarget) - throws IOException + + private void untgz(File aArchive, File aTarget) throws IOException { try (ArchiveInputStream archive = new TarArchiveInputStream(new GzipCompressorInputStream( new BufferedInputStream(new FileInputStream(aArchive))))) { extract(aArchive, archive, aTarget); } } - + private void extract(File aArchive, ArchiveInputStream aArchiveStream, File aTarget) throws IOException { ArchiveEntry entry = null; while ((entry = aArchiveStream.getNextEntry()) != null) { String name = entry.getName(); - + // Ensure that the filename will not break the manifest if (name.contains("\n")) { throw new IllegalStateException("Filename must not contain line break"); } + + Path base = aTarget.toPath().toAbsolutePath(); + Path out = base.resolve(name).toAbsolutePath(); + + if (!out.startsWith(base)) { + // Ignore attempts to write outside the base + continue; + } - File out = new File(aTarget, name); if (entry.isDirectory()) { - FileUtils.forceMkdir(out); + FileUtils.forceMkdir(out.toFile()); } else { - FileUtils.copyInputStreamToFile(new CloseShieldInputStream(aArchiveStream), out); + FileUtils.copyInputStreamToFile(new CloseShieldInputStream(aArchiveStream), + out.toFile()); } } } diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetValidationPolicy.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetValidationPolicy.java new file mode 100644 index 0000000000..a0b9e9fb66 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/DatasetValidationPolicy.java @@ -0,0 +1,40 @@ +/* + * Copyright 2018 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.datasets; + +public enum DatasetValidationPolicy +{ + /** + * If the local hash does not match or if there is no local data, download it. If the + * freshly downloaded data does not match, fail. + */ + STRICT, + + /** + * If the local hash does not match if there is no local data, download it. If the freshly + * downloaded data does not match, continue. + */ + CONTINUE, + + /** + * Use the local cached version, even if its hash does not match. Do not try to download it + * again. If there is no cached version, try downloading the data and use it whether it + * matches the hash or not. + */ + DESPERATE; +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/FileRole.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/FileRole.java similarity index 96% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/FileRole.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/FileRole.java index f5c6fcfbcb..66727fe710 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/FileRole.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/FileRole.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; public final class FileRole { diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/LicenseDescription.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/LicenseDescription.java similarity index 93% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/LicenseDescription.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/LicenseDescription.java index d1eb83b827..689c3a0c32 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/LicenseDescription.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/LicenseDescription.java @@ -15,14 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; public interface LicenseDescription { - String getName(); String getUrl(); String getComment(); -} \ No newline at end of file +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/Split.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/Split.java similarity index 94% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/Split.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/Split.java index 60d0adb60c..ed0600fabe 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/Split.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/Split.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; import java.io.File; diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/VerificationMode.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/VerificationMode.java new file mode 100644 index 0000000000..6ecf72ef97 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/VerificationMode.java @@ -0,0 +1,31 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.datasets; + +public enum VerificationMode +{ + /** + * Calculate the hash based on the binary content of the file. + */ + BINARY, + + /** + * Normalize whitespace before calculating the hash. + */ + TEXT; +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ActionDescriptionImpl.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ActionDescriptionImpl.java similarity index 91% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ActionDescriptionImpl.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ActionDescriptionImpl.java index 5ab54b9770..7fd515aedf 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ActionDescriptionImpl.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ActionDescriptionImpl.java @@ -15,14 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal; +package org.dkpro.core.api.datasets.internal; import static java.util.Collections.emptyMap; import static java.util.Collections.unmodifiableMap; import java.util.Map; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.ActionDescription; +import org.dkpro.core.api.datasets.ActionDescription; public class ActionDescriptionImpl implements ActionDescription diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java new file mode 100644 index 0000000000..7b62582cb7 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ArtifactDescriptionImpl.java @@ -0,0 +1,152 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.datasets.internal; + +import static org.dkpro.core.api.datasets.VerificationMode.BINARY; + +import java.util.List; + +import org.dkpro.core.api.datasets.ActionDescription; +import org.dkpro.core.api.datasets.ArtifactDescription; +import org.dkpro.core.api.datasets.DatasetDescription; +import org.dkpro.core.api.datasets.VerificationMode; + +public class ArtifactDescriptionImpl + implements ArtifactDescription +{ + private DatasetDescription dataset; + private String name; + private String text; + private String url; + private String sha1; + private String sha512; + private VerificationMode verificationMode = BINARY; + private boolean shared; + private boolean optional; + private List actions; + + @Override + public DatasetDescription getDataset() + { + return dataset; + } + + public void setDataset(DatasetDescription aDataset) + { + dataset = aDataset; + } + + @Override + public String getName() + { + return name; + } + + public void setName(String aName) + { + name = aName; + } + + @Override + public String getText() + { + return text; + } + + public void setText(String aText) + { + text = aText; + } + + @Override + public String getUrl() + { + return url; + } + + public void setUrl(String aUrl) + { + url = aUrl; + } + + @Override + public String getSha1() + { + return sha1; + } + + public void setSha1(String aSha1) + { + sha1 = aSha1; + } + + @Override + public String getSha512() + { + return sha512; + } + + public void setSha512(String aSha512) + { + sha512 = aSha512; + } + + @Override + public VerificationMode getVerificationMode() + { + return verificationMode; + } + + public void setVerificationMode(VerificationMode aVerificationMode) + { + verificationMode = aVerificationMode; + } + + @Override + public List getActions() + { + return actions; + } + + public void setActions(List aActions) + { + actions = aActions; + } + + @Override + public boolean isShared() + { + return shared; + } + + public void setShared(boolean aShared) + { + shared = aShared; + } + + @Override + public boolean isOptional() + { + return optional; + } + + public void setOptional(boolean aOptional) + { + optional = aOptional; + } +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/DatasetDescriptionImpl.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/DatasetDescriptionImpl.java similarity index 91% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/DatasetDescriptionImpl.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/DatasetDescriptionImpl.java index b8c2b01cad..aed2301aff 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/DatasetDescriptionImpl.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/DatasetDescriptionImpl.java @@ -15,17 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal; +package org.dkpro.core.api.datasets.internal; -import static java.util.Collections.*; +import static java.util.Collections.emptyMap; +import static java.util.Collections.unmodifiableMap; import java.util.List; import java.util.Map; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.ArtifactDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.LicenseDescription; +import org.dkpro.core.api.datasets.ArtifactDescription; +import org.dkpro.core.api.datasets.DatasetDescription; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.LicenseDescription; public class DatasetDescriptionImpl implements DatasetDescription @@ -208,6 +209,7 @@ public void setLicenses(List aLicenses) licenses = aLicenses; } + @Override public String getEncoding() { return encoding; diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/LicenseDescriptionImpl.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/LicenseDescriptionImpl.java similarity index 91% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/LicenseDescriptionImpl.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/LicenseDescriptionImpl.java index a87ace7dd6..b921d4dd9c 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/LicenseDescriptionImpl.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/LicenseDescriptionImpl.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal; +package org.dkpro.core.api.datasets.internal; import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.LicenseDescription; +import org.dkpro.core.api.datasets.LicenseDescription; public class LicenseDescriptionImpl implements LicenseDescription diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/LoadedDataset.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/LoadedDataset.java similarity index 89% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/LoadedDataset.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/LoadedDataset.java index 71e12b25a9..d842a4c7ee 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/LoadedDataset.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/LoadedDataset.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal; +package org.dkpro.core.api.datasets.internal; import static java.util.Arrays.asList; @@ -30,13 +30,12 @@ import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.TrueFileFilter; - -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.FileRole; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.util.AntFileFilter; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetDescription; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.FileRole; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.api.datasets.internal.util.AntFileFilter; public class LoadedDataset implements Dataset diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/SplitImpl.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/SplitImpl.java similarity index 93% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/SplitImpl.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/SplitImpl.java index 13507d2cb8..380de167b7 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/SplitImpl.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/SplitImpl.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal; +package org.dkpro.core.api.datasets.internal; import java.io.File; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; +import org.dkpro.core.api.datasets.Split; public class SplitImpl implements Split diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Action_ImplBase.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Action_ImplBase.java similarity index 76% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Action_ImplBase.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Action_ImplBase.java index 26f20a5958..b7bbb53420 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/actions/Action_ImplBase.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Action_ImplBase.java @@ -15,13 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.actions; +package org.dkpro.core.api.datasets.internal.actions; import java.nio.file.Path; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.ActionDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.ArtifactDescription; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetDescription; +import org.dkpro.core.api.datasets.ActionDescription; +import org.dkpro.core.api.datasets.ArtifactDescription; +import org.dkpro.core.api.datasets.DatasetDescription; public abstract class Action_ImplBase { diff --git a/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Explode.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Explode.java new file mode 100644 index 0000000000..7bede6f9f1 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/actions/Explode.java @@ -0,0 +1,347 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.datasets.internal.actions; + +import static java.util.Arrays.asList; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.ArchiveException; +import org.apache.commons.compress.archivers.ArchiveInputStream; +import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry; +import org.apache.commons.compress.archivers.sevenz.SevenZFile; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dkpro.core.api.datasets.ActionDescription; +import org.dkpro.core.api.datasets.ArtifactDescription; +import org.dkpro.core.api.datasets.DatasetDescription; +import org.dkpro.core.api.datasets.internal.DatasetDescriptionImpl; +import org.dkpro.core.api.datasets.internal.util.AntFileFilter; + +import com.github.junrar.Archive; +import com.github.junrar.exception.RarException; +import com.github.junrar.impl.FileVolumeManager; +import com.github.junrar.rarfile.FileHeader; + +public class Explode + extends Action_ImplBase +{ + private final Log LOG = LogFactory.getLog(getClass()); + + @Override + public void apply(ActionDescription aAction, DatasetDescription aDataset, + ArtifactDescription aPack, Path aCachedFile) + throws Exception + { + DatasetDescriptionImpl dsi = (DatasetDescriptionImpl) aDataset; + + Map cfg = aAction.getConfiguration(); + // Sometimes, we have to explode a file that was created as the result of exploding the + // main artifact. Thus, we can override the target + Path targetFile = cfg.containsKey("file") + ? dsi.getOwner().resolve(dsi).resolve((String) cfg.get("file")) + : aCachedFile; + + // Apache Commons Compress does not handle RAR files, so we handle them separately + if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".rar")) { + extractRar(aAction, targetFile, dsi.getOwner().resolve(dsi)); + } + else if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".7z")) { + // 7z does not support streaming in Apache Commons Compress + extract7z(aAction, targetFile, dsi.getOwner().resolve(dsi)); + } + else { + // Auto-detect the archive format using Apache Commons Compress + try (InputStream is = new BufferedInputStream(Files.newInputStream(targetFile))) { + InputStream uncompressed; + + try { + uncompressed = new BufferedInputStream( + new CompressorStreamFactory().createCompressorInputStream(is)); + } + catch (CompressorException e) { + // If the compressor is not detected, we may be dealing with an archive format + // that + // compresses internally, e.g. ZIP. + uncompressed = is; + } + + ArchiveInputStream archive = new ArchiveStreamFactory() + .createArchiveInputStream(uncompressed); + extract(aAction, targetFile, archive, dsi.getOwner().resolve(dsi)); + } + catch (ArchiveException e) { + throw new ArchiveException("Unable to extract files from [" + targetFile + "]", e); + } + } + } + + private void extract7z(ActionDescription aAction, Path aArchive, Path aTarget) + throws IOException, RarException + { + // We always extract archives into a subfolder. Figure out the name of the folder. + Path base = aTarget.resolve(getPathWithoutFileExtension(aArchive)).toAbsolutePath(); + + Map cfg = aAction.getConfiguration(); + int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; + + AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), + coerceToList(cfg.get("excludes"))); + + LOG.info("Extracting files of [" + aArchive.getFileName() + "] to [" + aTarget.resolve(base) + + "]"); + + try (SevenZFile archive = new SevenZFile(aArchive.toFile())) { + SevenZArchiveEntry entry = archive.getNextEntry(); + while (entry != null) { + String name = stripLeadingFolders(entry.getName(), strip); + + if (name == null) { + // Stripped to null - nothing left to extract - continue; + continue; + } + + if (filter.accept(name)) { + Path out = base.resolve(name).toAbsolutePath(); + if (!out.startsWith(base)) { + throw new IOException( + "Archive tries to generate file outside target folder: [" + name + + "]"); + } + + if (entry.isDirectory()) { + Files.createDirectories(out); + } + else { + Files.createDirectories(out.getParent()); + try (OutputStream os = Files.newOutputStream(out)) { + InputStream is = new SevenZEntryInputStream(archive, entry); + IOUtils.copyLarge(is, os); + } + } + } + + entry = archive.getNextEntry(); + } + } + } + + private void extractRar(ActionDescription aAction, Path aArchive, Path aTarget) + throws IOException, RarException + { + // We always extract archives into a subfolder. Figure out the name of the folder. + Path base = aTarget.resolve(getPathWithoutFileExtension(aArchive)).toAbsolutePath(); + + Map cfg = aAction.getConfiguration(); + int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; + + AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), + coerceToList(cfg.get("excludes"))); + + LOG.info("Extracting files of [" + aArchive.getFileName() + "] to [" + aTarget.resolve(base) + + "]"); + + try (Archive archive = new Archive(new FileVolumeManager(aArchive.toFile()))) { + FileHeader fh = archive.nextFileHeader(); + while (fh != null) { + String name = stripLeadingFolders(fh.getFileNameString(), strip); + + if (name == null) { + // Stripped to null - nothing left to extract - continue; + continue; + } + + if (filter.accept(name)) { + Path out = base.resolve(name).toAbsolutePath(); + if (!out.startsWith(base)) { + throw new IOException( + "Archive tries to generate file outside target folder: [" + name + + "]"); + } + + if (fh.isDirectory()) { + Files.createDirectories(out); + } + else { + Files.createDirectories(out.getParent()); + try (OutputStream os = Files.newOutputStream(out)) { + archive.extractFile(fh, os); + } + } + } + + fh = archive.nextFileHeader(); + } + } + } + + private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStream aAStream, + Path aTarget) + throws IOException + { + // We always extract archives into a subfolder. Figure out the name of the folder. + Path base = aTarget.resolve(getPathWithoutFileExtension(aArchive)).toAbsolutePath(); + + Map cfg = aAction.getConfiguration(); + int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0; + + AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")), + coerceToList(cfg.get("excludes"))); + + LOG.info("Extracting files of [" + aArchive.getFileName() + "] to [" + aTarget.resolve(base) + + "]"); + + ArchiveEntry entry = null; + while ((entry = aAStream.getNextEntry()) != null) { + String name = stripLeadingFolders(entry.getName(), strip); + + if (name == null) { + // Stripped to null - nothing left to extract - continue; + continue; + } + + if (filter.accept(name)) { + Path out = base.resolve(name).toAbsolutePath(); + if (!out.startsWith(base)) { + throw new IOException( + "Archive tries to generate file outside target folder: [" + name + "]"); + } + + if (entry.isDirectory()) { + Files.createDirectories(out); + } + else { + Files.createDirectories(out.getParent()); + Files.copy(aAStream, out); + } + } + } + } + + private String stripLeadingFolders(String aName, int aLevels) + { + if (aName == null) { + return null; + } + + if (aLevels > 0) { + Path p = Paths.get(aName); + if (p.getNameCount() <= aLevels) { + return null; + } + else { + p = p.subpath(aLevels, p.getNameCount()); + aName = p.toString(); + return aName; + } + } + else { + return aName; + } + } + + /** + * The the name of the archive without any extensions (e.g. in the case of multiple extensions + * such as .tar.gz). + */ + public static String getPathWithoutFileExtension(Path aFilename) + { + + + // We always extract archives into a subfolder. Figure out the name of the folder. + String base = aFilename.getFileName().toString(); + while (base.contains(".")) { + base = FilenameUtils.removeExtension(base); + } + return base; + } + + @SuppressWarnings("unchecked") + public static List coerceToList(Object aRaw) + { + List cooked; + if (aRaw == null) { + return null; + } + else if (aRaw instanceof String) { + cooked = asList((String) aRaw); + } + else if (aRaw instanceof List) { + cooked = (List) aRaw; + } + else { + throw new IllegalArgumentException("Cannot coerce to String list: [" + aRaw + "]"); + } + return cooked; + } + + private static class SevenZEntryInputStream + extends InputStream + { + private SevenZFile archive; + private SevenZArchiveEntry entry; + private int totalRead; + + public SevenZEntryInputStream(SevenZFile aArchive, SevenZArchiveEntry aEnty) + { + archive = aArchive; + entry = aEnty; + } + + @Override + public int read() throws IOException + { + if (totalRead < entry.getSize()) { + totalRead++; + return archive.read(); + } + else { + return -1; + } + } + + @Override + public int read(byte[] aB, int aOff, int aLen) throws IOException + { + if (totalRead < entry.getSize()) { + int blocksize = (int) Math.min(aLen, entry.getSize() - totalRead); + int read = archive.read(aB, aOff, blocksize); + totalRead += read; + return read; + } + else { + return -1; + } + } + } +} diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ud/UDDataset.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ud/UDDataset.java similarity index 91% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ud/UDDataset.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ud/UDDataset.java index fa642fc3d0..c57a043f2d 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/ud/UDDataset.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/ud/UDDataset.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.ud; +package org.dkpro.core.api.datasets.internal.ud; import static java.util.Arrays.asList; @@ -24,9 +24,9 @@ import java.util.HashSet; import java.util.Set; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.SplitImpl; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.api.datasets.internal.SplitImpl; public class UDDataset implements Dataset diff --git a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/util/AntFileFilter.java b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/util/AntFileFilter.java similarity index 93% rename from dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/util/AntFileFilter.java rename to dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/util/AntFileFilter.java index 3a7efcdc36..886fc58f27 100644 --- a/dkpro-core-api-datasets-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/internal/util/AntFileFilter.java +++ b/dkpro-core-api-datasets-asl/src/main/java/org/dkpro/core/api/datasets/internal/util/AntFileFilter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.util; +package org.dkpro.core.api.datasets.internal.util; import java.io.File; import java.nio.file.Path; @@ -25,19 +25,20 @@ import org.apache.commons.io.filefilter.AbstractFileFilter; import org.springframework.util.AntPathMatcher; -public class AntFileFilter extends AbstractFileFilter +public class AntFileFilter + extends AbstractFileFilter { private AntPathMatcher matcher = new AntPathMatcher(); - + private Path baseDir; private List includes; private List excludes; - + public AntFileFilter(List aIncludes, List aExcludes) { this(null, aIncludes, aExcludes); } - + public AntFileFilter(Path aBaseDir, List aIncludes, List aExcludes) { baseDir = aBaseDir; @@ -64,14 +65,15 @@ public boolean accept(File aFile) public boolean accept(String aPath) { boolean ok = true; - - // Ant matcher uses slashes as separator by default and that is also what we do in the YAML files. + + // Ant matcher uses slashes as separator by default and that is also what we do in the YAML + // files. // Thus we need to transform system paths to UNIX-style if necessary. String path = aPath; if (File.separatorChar == '\\') { path = FilenameUtils.separatorsToUnix(path); } - + // If includes are set, we only consider stuff that is included if (includes != null) { ok = false; @@ -95,4 +97,4 @@ public boolean accept(String aPath) return ok; } -} \ No newline at end of file +} diff --git a/dkpro-core-api-datasets-asl/src/main/resources/META-INF/org.dkpro.core/datasets.txt b/dkpro-core-api-datasets-asl/src/main/resources/META-INF/org.dkpro.core/datasets.txt index f0ad2ed909..60d43d72fa 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/META-INF/org.dkpro.core/datasets.txt +++ b/dkpro-core-api-datasets-asl/src/main/resources/META-INF/org.dkpro.core/datasets.txt @@ -1 +1 @@ -classpath*:de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/*.yaml +classpath*:org/dkpro/core/api/datasets/lib/*.yaml diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml deleted file mode 100644 index 77df8f7a8f..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml +++ /dev/null @@ -1,32 +0,0 @@ -groupId: org.dkpro.core.datasets.alpino -datasetId: alpino-alpino2conll -version: 20100114 -language: nl -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: Alpino2conll -url: http://www.let.rug.nl/~bplank/alpino2conll/ -attribution: | - Barbara Plank. Improved statistical measures to assess natural language parser performance across - domains. In Proceedings of the 7th International Conference on Language Resources and Evaluation - (LREC2010), Valletta, Malta, May 2010. -description: | - Training and test datasets for Dutch in retagged CoNLL format. The data was converted from Alpino - XML into CoNLL format based on an adapted version of Erwin Marsi's conversion software, but PoS - tags were replaced by automatically assigned Alpino tags. - - (This description has been sourced from the corpus website). - -artifacts: - cdb.conll.utf8: - url: http://www.let.rug.nl/~bplank/alpino2conll/data/cdb.conll.utf8 - sha1: 11313d405abb0f268247a2d5420afa413eb244e7 - conll2006-test.conll: - url: http://www.let.rug.nl/~bplank/alpino2conll/data/conll2006-test.conll - sha1: 11313d405abb0f268247a2d5420afa413eb244e7 - -roles: - data: - - "cdb.conll.utf8" - - "conll2006-test.conll" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml deleted file mode 100644 index 41e37f3168..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml +++ /dev/null @@ -1,39 +0,0 @@ -groupId: org.dkpro.core.datasets.aqmar -datasetId: aqmar -version: 1.0 -language: ar -mediaType: text/x.org.dkpro.conll-2000 -encoding: UTF-8 - -name: AQMAR Arabic Wikipedia Named Entity Corpus -url: http://www.cs.cmu.edu/~ark/ArabicNER/ -description: | - 73,853 tokens in 28 Arabic Wikipedia articles hand-annotated for named entities. - - (This description has been partially copied from the corpus website). - -attribution: | - By Behrang Mohit, Nathan Schneider, Rishav Bhowmick, Kemal Oflazer, and Noah Smith as part of the - AQMAR project. - -licenses: - - name: CC-BY-SA 3.0 - url: https://creativecommons.org/licenses/by-sa/3.0/ - -artifacts: - LICENSE.txt: - url: http://www.cs.cmu.edu/~ark/ArabicNER/corpus/LICENSE - sha1: 43f4082fb8432ad86d927bdff687f9406db43d0f - data.zip: - url: "http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip" - sha1: 4fa2c37d7673bb456c6e382566a091545531d85f - actions: - - action: explode - configuration: { includes: "*.txt" } - -roles: - licenses: - - LICENSE.txt - data: - - "data/*.txt" - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2006-pt.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2006-pt.yaml deleted file mode 100644 index 1fe6efb046..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2006-pt.yaml +++ /dev/null @@ -1,46 +0,0 @@ -groupId: org.dkpro.core.datasets.conll2006 -datasetId: conll2006 -# Didn't find any version information at the source, falling back to date of the corpus files -version: 20100302 -language: pt -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: CoNLL-2006 Shared Task (Portuguese) -url: http://ilk.uvt.nl/conll/ -attribution: Diana Santos, Eckhard Bick -description: | - This is the Portuguese part of the CONLL-X Shared Task. The was derived from the Floresta - Sintá(c)tica Bosque 7.3 by Sabine Buchholz. - - (This description has been partially sourced from the README file included with the corpus). - - NOTE: We dd not find license information for this dataset. One might assume the license of this - dataset is equivalent to that of the Floresta Sintá(c)tica from which it was derived. - -licenses: - - name: Floresta Sintá(c)tica License - url: http://www.linguateca.pt/floresta/info_floresta_English.html - -artifacts: - README.txt: - url: http://www.linguateca.pt/floresta/CoNLL-X/readme.conll - sha1: 7afe672cba645d22fc037d8f6e2bf9d501d0aee6 - portuguese_bosque_train.conll: - url: http://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_train.conll - sha1: 29e630e207c74a42e0d2999193aa25d73f262920 - portuguese_bosque_test_blind.conll: - url: http://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_test_blind.conll - sha1: fabcfbd73a531e21786af9b8233f1a4aa78dfddb - portuguese_bosque_test.conll: - url: http://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_test.conll - sha1: e399cdc1203df1ff43816f3f934223cb9a625992 - -roles: - training: - - portuguese_bosque_train.conll - testing: - - portuguese_bosque_test_blind.conll - development: - - portuguese_bosque_test.conll - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml deleted file mode 100644 index b8d3462585..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml +++ /dev/null @@ -1,33 +0,0 @@ -groupId: org.dkpro.core.datasets.coptictb -datasetId: coptictb -version: 1.0 -# There is no ISO 639-1 language code for Coptic. Have to use ISO 639-3 -language: cop -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: Coptic Treebank -url: http://copticscriptorium.org -attribution: Amir Zeldes -description: | - The Coptic Treebank from the Coptic SCRIPTORIUM corpora (http://copticscriptorium.org/). - -licenses: - - name: CC-BY 4.0 - url: https://creativecommons.org/licenses/by/4.0/ - -artifacts: - LICENSE.txt: - url: "https://github.com/CopticScriptorium/corpora/raw/3506b54ce769639c382145161da4f99fd3d6156b/coptic-treebank/LICENSE.txt" - sha1: fc0bdc662ce901ac2c631f9574c9aa8b54ebf8c7 - coptic.treebank.conll10: - url: "https://github.com/CopticScriptorium/corpora/raw/3506b54ce769639c382145161da4f99fd3d6156b/coptic-treebank/coptic.treebank.conll10" - sha1: 8c363df27408cb14cb42f3869916c1575fe1625a - -roles: - licenses: - - LICENSE.txt - data: - - coptic.treebank.conll10 - - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml deleted file mode 100644 index 6c933ec3a9..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml +++ /dev/null @@ -1,38 +0,0 @@ -groupId: org.dkpro.core.datasets.finntb -datasetId: finntb -version: 3.1 -language: nfi -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: FinnTreeBank -url: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/ -description: | - The FinnTreeBank project is creating a treebank and a parsebank for Finnish. This work is licensed - under a Creative Commons Attribution 3.0. - - The first and second version of the treebank is annotated by hand and based on 17.000 model - senctences in the Large Grammar of Finnish VISK - Iso Suomen Kielioppi. Brief samples of text - from other sources, e.g. news items and literature, are also available in the second version. A - parsebank for Finnish based on the Europarl and the JRC-Aquis will be available in June 2012. - - (This description has been sourced from the dataset website). - -licenses: - - name: CC-BY 3.0 - url: http://creativecommons.org/licenses/by/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - ftb3.1.conllx.gz: - url: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/sources/ftb3.1.conllx.gz - sha1: 7c58064bf9995980cea08e84035c0414adc54f06 - -roles: - licenses: - - LICENSE.txt - data: - - ftb3.1.conllx.gz - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/germeval2014-de.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/germeval2014-de.yaml deleted file mode 100644 index 6370a52d99..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/germeval2014-de.yaml +++ /dev/null @@ -1,53 +0,0 @@ -groupId: org.dkpro.core.datasets.germeval2014 -datasetId: germeval2014 -# There is no uniform version for the whole datased, using the date of the most recent artifact -version: 20140911 -language: de -mediaType: text/x.org.dkpro.germeval-2014 -encoding: UTF-8 - -name: GermEval 2014 Named Entity Recognition Shared Task -url: https://sites.google.com/site/germeval2014ner/ -attribution: | - D. Benikova, C. Biemann, M. Reznicek. NoSta-D Named Entity Annotation for German: Guidelines and - Dataset. Proceedings of LREC 2014, Reykjavik, Iceland -description: | - The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation - with the following properties: - - * The data was sampled from German Wikipedia and News Corpora as a collection of citations. - * The dataset covers over 31,000 sentences corresponding to over 590,000 tokens. - * The NER annotation uses the NoSta-D guidelines, which extend the Tübingen Treebank guidelines, - using four main NER categories with sub-structure, and annotating embeddings among NEs such as - `[ORG FC Kickers [LOC Darmstadt]]`. - - (This description has been sourced from the dataset website). - -licenses: - - name: CC-BY 4.0 - url: http://creativecommons.org/licenses/by/4.0/ - -artifacts: - LICENSE.txt: - url: "https://creativecommons.org/licenses/by/4.0/legalcode.txt" - sha1: 1167f0e28fe2db01e38e883aaf1e749fb09f9ceb - NER-de-dev.tsv: - url: "https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1" - sha1: 70aba5d247f51ec22e0bcc671c7fb325e4ff4277 - NER-de-test.tsv: - url: "https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1" - sha1: 214deaf091e01567af2e958aac87863bf685342a - NER-de-train.tsv: - url: "https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1" - sha1: 7644cb09676050c0a2836e06fa0aeb8509b9e1cb - -roles: - training: - - NER-de-train.tsv - testing: - - NER-de-test.tsv - development: - - NER-de-dev.tsv - licenses: - - LICENSE.txt - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml deleted file mode 100644 index 34a9df038f..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml +++ /dev/null @@ -1,43 +0,0 @@ -groupId: org.dkpro.core.datasets.jos -datasetId: jos100k-conll -version: 2.0 -language: sl -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: JOS - jos100k -url: http://nl.ijs.si/jos/jos100k-en.html -attribution: | - Tomaž Erjavec, Darja Fišer, Simon Krek, Nina Ledinek: The JOS Linguistically Tagged Corpus of - Slovene. Proceedings of the Seventh International Conference on Language Resources and Evaluation - (LREC'10), Malta, 2010. - (link:http://www.lrec-conf.org/proceedings/lrec2010/summaries/139.html[PDF]) -description: | - The jos100k corpus contains 100,000 words of sampled paragraphs from the FidaPLUS corpus. It is - meant to serve as a reference annotated corpus of Slovene: its manually-validated annotations - cover three level of linguistic description. - - (This description has been sourced from the corpus website). - -licenses: - - name: CC-BY-NC 3.0 - url: http://creativecommons.org/licenses/by-nc/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by-nc/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - data.zip: - url: http://nl.ijs.si/jos/download/jos100kv2_0.zip - sha1: 9f330ffd102cc5d5734fdaecbbf67751c84a1339 - actions: - - action: explode - configuration: { strip: 1, includes: [ "00README.txt", "jos100kv2_0-sl.conll" ] } - -roles: - licenses: - - LICENSE.txt - - data/00README.txt - data: - - data/jos100kv2_0-sl.conll - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml deleted file mode 100644 index 77d02f6d0f..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml +++ /dev/null @@ -1,40 +0,0 @@ -groupId: org.dkpro.core.datasets.ndt -datasetId: ndt -version: 1.01 -language: nb -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: Norwegian Dependency Treebank (Norwegian Bokmål) -url: http://www.nb.no/sprakbanken/show?serial=sbr-10 -attribution: CLARINO NB – Språkbanken -description: | - The Norwegian Dependency Treebank (NDT) consists of text which is manually annotated with - morphological features, syntactic functions and hierarchical structure. The formalism used for the - syntactic annotation is dependency grammar. With a few exceptions, the syntactic analysis follows - Norsk referensegrammatikk ‘Norwegian Reference Grammar'. - - (This description has been sourced from the dataset website). - -licenses: - - name: CC0 1.0 - url: http://creativecommons.org/publicdomain/zero/1.0/ - -artifacts: - LICENSE_NDT.txt: - url: http://www.nb.no/sbfil/dok/LICENSE_NDT.txt - sha1: a2f433206f421c0d630b3bec5fad01334673b765 - 20140328_NDT_1-01.tar.gz: - url: http://www.nb.no/sbfil/tekst/20140328_NDT_1-01.tar.gz - sha1: 97935c225f98119aa94d53f37aa64762cba332f3 - shared: true - actions: - - action: explode - configuration: { strip: 1, includes: "nob/conll/*.conll" } - -roles: - licenses: - - LICENSE_NDT.txt - data: - - "20140328_NDT_1-01/nob/conll/*.conll" - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml deleted file mode 100644 index fb1cf2bea1..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml +++ /dev/null @@ -1,40 +0,0 @@ -groupId: org.dkpro.core.datasets.ndt -datasetId: ndt -version: 1.01 -language: nn -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: Norwegian Dependency Treebank (Norwegian Nynorsk) -url: http://www.nb.no/sprakbanken/show?serial=sbr-10 -attribution: CLARINO NB – Språkbanken -description: | - The Norwegian Dependency Treebank (NDT) consists of text which is manually annotated with - morphological features, syntactic functions and hierarchical structure. The formalism used for the - syntactic annotation is dependency grammar. With a few exceptions, the syntactic analysis follows - Norsk referensegrammatikk ‘Norwegian Reference Grammar'. - - (This description has been sourced from the dataset website). - -licenses: - - name: CC0 1.0 - url: http://creativecommons.org/publicdomain/zero/1.0/ - -artifacts: - LICENSE_NDT.txt: - url: http://www.nb.no/sbfil/dok/LICENSE_NDT.txt - sha1: a2f433206f421c0d630b3bec5fad01334673b765 - 20140328_NDT_1-01.tar.gz: - url: http://www.nb.no/sbfil/tekst/20140328_NDT_1-01.tar.gz - sha1: 97935c225f98119aa94d53f37aa64762cba332f3 - shared: true - actions: - - action: explode - configuration: { strip: 1, includes: "nno/conll/*.conll" } - -roles: - licenses: - - LICENSE_NDT.txt - data: - - "20140328_NDT_1-01/nno/conll/*.conll" - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml deleted file mode 100644 index 5f246c8acd..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml +++ /dev/null @@ -1,35 +0,0 @@ -groupId: org.dkpro.core.datasets.nemgp -datasetId: nemgp -version: 0.1 -language: de -# mediaType: text/x.org.dkpro.opennlp-ne -encoding: UTF-8 - -name: Named Entity Model for German, Politics (NEMGP) -url: http://www.thomas-zastrow.de/nlp/ -attribution: Thomas Zastrow -description: | - The Named Entity Model for German, Politics (NEMGP) is a collection of texts from Wikipedia and - WikiNews, manually annotated with named entity information. - - (This description has been sourced from the dataset website). - -licenses: - - name: CC-BY-SA 3.0 - url: https://creativecommons.org/licenses/by-sa/3.0/ - -artifacts: - LICENSE.txt: - url: https://creativecommons.org/licenses/by-sa/3.0/legalcode.txt - sha1: fb41626a3005c2b6e14b8b3f5d9d0b19b5faaa51 - data.zip: - url: "http://www.thomas-zastrow.de/nlp/nemgp_trainingdata_01.txt.zip" - sha1: f2a1fd54df9232741a3a1892d1ffb0a4d7205991 - actions: - - action: explode - -roles: - licenses: - - LICENSE.txt - training: - - data/nemgp_trainingdata_01.txt diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml deleted file mode 100644 index 7e1c7c0f4c..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml +++ /dev/null @@ -1,38 +0,0 @@ -groupId: org.dkpro.core.datasets.agdt -datasetId: agdt -version: 2.1 -language: el -# mediaType: unknown -encoding: UTF-8 - -name: Ancient Greek and Latin Dependency Treebank (Greek) -url: https://perseusdl.github.io/treebank_data/ -attribution: Giuseppe G. A. Celano, Gregory Crane, Bridget Almas et al. -description: | - The Ancient Greek and Latin Dependency Treebank (AGLDT) is the earliest treebank for Ancient Greek - and Latin. The project started at Tufts University in 2006 and is currently under development and - maintenance at Leipzig University-Tufts University. - - (This description has been sourced from the dataset website). - -licenses: - - name: CC-BY-SA 3.0 - url: https://creativecommons.org/licenses/by-sa/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by-sa/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - perseus.zip: - url: "https://github.com/PerseusDL/treebank_data/archive/f56a35f65ef15ac454f6fbd2cfc6ea97bf2ca9b8.zip" - sha1: 140eee6d2e3e83745f95d3d5274d9e965d898980 - shared: true - actions: - - action: explode - configuration: { strip: 1, includes: [ "README.md", "v2.1/Greek/**/*" ] } - -roles: - licenses: - - LICENSE.txt - data: - - "perseus/v2.1/Greek/texts/*.xml" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml deleted file mode 100644 index fe3a1020fd..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml +++ /dev/null @@ -1,39 +0,0 @@ -groupId: org.dkpro.core.datasets.agdt -datasetId: agdt -version: 2.1 -language: la -# mediaType: unknown -encoding: ISO-8859-1 - -name: Ancient Greek and Latin Dependency Treebank (Latin) -url: https://perseusdl.github.io/treebank_data/ -attribution: Giuseppe G. A. Celano, Gregory Crane, Bridget Almas et al. -description: | - The Ancient Greek and Latin Dependency Treebank (AGLDT) is the earliest treebank for Ancient Greek - and Latin. The project started at Tufts University in 2006 and is currently under development and - maintenance at Leipzig University-Tufts University. - - (This description has been sourced from the dataset website). - -licenses: - - name: CC-BY-SA 3.0 - url: https://creativecommons.org/licenses/by-sa/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by-sa/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - perseus.zip: - url: "https://github.com/PerseusDL/treebank_data/archive/f56a35f65ef15ac454f6fbd2cfc6ea97bf2ca9b8.zip" - sha1: 140eee6d2e3e83745f95d3d5274d9e965d898980 - shared: true - actions: - - action: explode - configuration: { strip: 1, includes: [ "README.md", "v2.1/Latin/**/*" ] } - -roles: - licenses: - - LICENSE.txt - data: - - "perseus/v2.1/Latin/texts/*.xml" - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml deleted file mode 100644 index c54243e850..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml +++ /dev/null @@ -1,32 +0,0 @@ -groupId: org.dkpro.core.datasets.poldb -datasetId: poldb -version: 0.5 -language: pl -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: Polish Dependency Bank -url: http://zil.ipipan.waw.pl/Składnica -description: | - The dependency treebank (Składnica zależnościowa), version 0.5, is a result of an automatic - conversion of manually disambiguated constituency trees into dependency structures. - - (This description has been sourced from the corpus website). - -licenses: - - name: GPL 3.0 - url: https://www.gnu.org/licenses/gpl-3.0.html - -artifacts: - LICENSE.txt: - url: https://www.gnu.org/licenses/gpl-3.0.txt - sha1: 8624bcdae55baeef00cd11d5dfcfa60f68710a02 - poldb-0.5.conll.gz: - url: "http://zil.ipipan.waw.pl/Składnica?action=AttachFile&do=get&target=Składnica-zależnościowa-0.5.conll.gz" - sha1: 187424608e91b271957dabcf140a7274f1c88d63 - -roles: - licenses: - - LICENSE.txt - data: - - poldb-0.5.conll.gz diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml deleted file mode 100644 index 260a1c3e21..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml +++ /dev/null @@ -1,32 +0,0 @@ -groupId: org.dkpro.core.datasets.poltb -datasetId: poltb -version: 0.5 -language: pl -mediaType: application/x.org.dkpro.tiger+xml -encoding: UTF-8 - -name: Polish Constituency Treebank -url: http://zil.ipipan.waw.pl/Składnica -description: | - The Polish constituency treebank (Składnica frazowa), version 0.5. Trees in the Tiger XML format - containing only parse trees selected by dendrologists (one interpretation per sentence). - - (This description has been sourced from the corpus website). - -licenses: - - name: GPL 3.0 - url: https://www.gnu.org/licenses/gpl-3.0.html - -artifacts: - LICENSE.txt: - url: https://www.gnu.org/licenses/gpl-3.0.txt - sha1: 8624bcdae55baeef00cd11d5dfcfa60f68710a02 - poltb-0.5-tiger.xml.gz: - url: "http://zil.ipipan.waw.pl/Składnica?action=AttachFile&do=get&target=Składnica-frazowa-0.5-TigerXML.xml.gz" - sha1: c8977d436d218b726d657224305bced178071dcf - -roles: - licenses: - - LICENSE.txt - data: - - poltb-0.5-tiger.xml.gz diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sethr-hr-1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sethr-hr-1.yaml deleted file mode 100644 index f3ad25971d..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sethr-hr-1.yaml +++ /dev/null @@ -1,31 +0,0 @@ -groupId: org.dkpro.core.datasets.sethr -datasetId: sethr -version: 1 -language: hr -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: SETimes.HR dependency treebank -url: http://nlp.ffzg.hr/resources/corpora/setimes-hr/ -description: | - The corpus is based on the Croatian part of the SETimes parallel corpus. - - (This description has been sourced from the corpus website). - -licenses: - - name: CC-BY-SA 3.0 - url: http://creativecommons.org/licenses/by-sa/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by-sa/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - setimes.hr.v1.conllx.gz: - url: http://nlp.ffzg.hr/data/corpora/setimes.hr.v1.conllx.gz - sha1: 0faebfe55136692f83dcddd4cf659a8b59655d62 - -roles: - licenses: - - LICENSE.txt - data: - - setimes.hr.v1.conllx.gz diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml deleted file mode 100644 index 635123c5f0..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml +++ /dev/null @@ -1,49 +0,0 @@ -groupId: org.dkpro.core.datasets.sethr -datasetId: sethrplus -version: 20160613 -language: hr -mediaType: text/x.org.dkpro.conll-u -encoding: UTF-8 - -name: SETimes.HR+ Croatian dependency treebank -url: https://github.com/ffnlp/sethr -attribution: | - Agić and Ljubešić (2014) - (link:http://www.lrec-conf.org/proceedings/lrec2014/pdf/690_Paper.pdf[PDF]) - (link:http://aclweb.org/anthology/L/L14/L14-1542.bib[bib]) -description: | - The treebank is a result of an effort in providing free-culture language resources for Croatian by - the NLP group at FF Zagreb. - - (This description has been sourced from the corpus website). - -licenses: - - name: CC-BY 4.0 - url: http://creativecommons.org/licenses/by/4.0/ - comment: SETimes.HR dataset (set.hr.conll) - - name: CC-BY-NC-SA 4.0 - url: https://creativecommons.org/licenses/by-nc-sa/4.0/ - comment: web.hr.conll and news.hr.conll datasets - -artifacts: - LICENSE-CC-BY.txt: - url: "https://creativecommons.org/licenses/by/4.0/legalcode.txt" - sha1: 1167f0e28fe2db01e38e883aaf1e749fb09f9ceb - LICENSE-CC-BY-NC-SA.txt: - url: "https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt" - sha1: 5d572362228001e9dbc0c8802f49121ceb78ace2 - data.zip: - url: https://github.com/ffnlp/sethr/archive/c50697a81ee588b70328952dd56175da4c298c7c.zip - sha1: a52d13cfa91589c0d93fe0a90333a4f0e997b7cf - actions: - - action: explode - configuration: { strip: 1, includes: [ "LICENSE.md", "README.md", "*.hr*.conll" ] } - -roles: - licenses: - - LICENSE-CC-BY.txt - - LICENSE-CC-BY-NC-SA.txt - training: - - "data/*.hr.conll" - testing: - - "data/*.hr.test.conll" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml deleted file mode 100644 index a0de5bcac6..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml +++ /dev/null @@ -1,47 +0,0 @@ -groupId: org.dkpro.core.datasets.tedtreebank -datasetId: tedtreebank-conll -version: 1.0 -language: en -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: NAIST/NTT TED Treebank -url: http://ahclab.naist.jp/resource/tedtreebank/ -attribution: | - Graham Neubig, Katsuhito Sudoh, Yusuke Oda, Kevin Duh, Hajime Tsukada, Masaaki Nagata. - The NAIST-NTT Ted Talk Treebank. In proceedings of International Workshop on Spoken Language - Translation (IWSLT). Lake Tahoe, USA. December 2014. - (link:http://www.phontron.com/paper/neubig14iwslt.pdf[PDF]) - (link:http://phontron.com/bibtex.php?n=222[bib]) - -licenses: - - name: CC-BY-NC-SA 3.0 (?) - url: https://creativecommons.org/licenses/by-nc-sa/3.0/ - -description: | - The NAIST-NTT Ted Talk Treebank is a manually annotated treebank of TED talks that was created - through a joint research project of NAIST and the NTT CS Lab. All treebank annotation follows the - Penn Treebank standard. - - (This description has been sourced from the corpus website/README file in the corpus). - - NOTE: The website does not state which version of the CC-BY-SA-NC applies. One might consider - it is the version 3.0 which is also used for the TED talks themselves. - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by-nc-sa/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - data.tar.gz: - url: http://ahclab.naist.jp/resource/tedtreebank/naist-ntt-ted-treebank-v1.tar.gz - sha1: 89c6495bd64c4b3e699b4c478b47a0c827ea46ea - actions: - - action: explode - configuration: { strip: 1, includes: [ "README.md", "en-dep/*.dep" ] } - -roles: - licenses: - - LICENSE.txt - - data/README.md - data: - - "data/en-dep/*.dep" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml deleted file mode 100644 index a8a3233f1a..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml +++ /dev/null @@ -1,51 +0,0 @@ -groupId: org.dkpro.core.datasets.tut -datasetId: tut -version: 20101122 -language: it -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: Turin University Treebank -url: http://www.di.unito.it/~tutreeb/treebanks.html -attribution: | - Cristina Bosco, Leonardo Lesmo, Vincenzo Lombardo, Alessandro Mazzei, Livio Robaldo -description: | - TUT is a morpho-syntactically annotated collection of Italian sentences, which includes texts - from different text genres and domains, released in several annotation formats. - - (This description has been sourced from the corpus website). - -licenses: - - name: CC-BY-NC-SA 2.5 - url: http://creativecommons.org/licenses/by-nc-sa/2.5/it/ - -artifacts: - NEWS.zip: - url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/NEWS-22nov2010.conl.zip - sha1: 3d9b22d8ebf533aa1d6d39d417316c30900b9a0e - actions: - - action: explode - VEDCH.zip: - url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/VEDCH-22nov2010.conl.zip - sha1: 2278e6e770ddc4a8eea5e045c4a77a5df2ae0977 - actions: - - action: explode - CODICECIVILE.zip: - url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/CODICECIVILE-22nov2010.conl.zip - sha1: 9cf9c0a9c652b3df6564d1fa0ca97c2d7905faa3 - actions: - - action: explode - EUDIR.zip: - url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/EUDIR-22nov2010.conl.zip - sha1: 72a6e55627481ff99930b59714cfc0909ccf60e1 - actions: - - action: explode - WIKI.zip: - url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/WIKI-22nov2010.conl.zip - sha1: a421f488859324e3e12687b9a3067652248eb8df - actions: - - action: explode - -roles: - data: - - "**/*.conl" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml deleted file mode 100644 index f7e4f743bf..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml +++ /dev/null @@ -1,50 +0,0 @@ -groupId: org.dkpro.core.datasets.sequoia -datasetId: sequoia -version: 1.3 -language: fa -mediaType: text/x.org.dkpro.conll-2006 -encoding: UTF-8 - -name: Uppsala Persian Dependency Treebank -url: http://stp.lingfil.uu.se/%7Emojgan/UPDT.html -attribution: Mojgan Seraji, under the supervision of Joakim Nivre and Carina Jahani. -description: | - Uppsala Persian Dependency Treebank (UPDT) (Seraji, 2015, Chapter 5, pp. 97-146) is a - dependency-based syntactically annotated corpus. - - (This description has been sourced from the dataset website). - -licenses: - - name: CC-BY 3.0 - url: http://creativecommons.org/licenses/by/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - train-conll.tar.gz: - url: "http://stp.lingfil.uu.se/~mojgan/train.conll.tar.gz" - sha1: 6ace1d1132b121b09d0b88f53749d28a59843cd5 - actions: - - action: explode - dev-conll.tar.gz: - url: "http://stp.lingfil.uu.se/~mojgan/dev.conll.tar.gz" - sha1: e96a06b399bb1f565e16e49fb4dfe7da241f5d75 - actions: - - action: explode - test-conll.tar.gz: - url: "http://stp.lingfil.uu.se/~mojgan/test.conll.tar.gz" - sha1: ec79e91413dd2c49883bfbbd1a207f68377ac683 - actions: - - action: explode - -roles: - licenses: - - LICENSE.txt - training: - - train-conll/train.conll - testing: - - test-conll/test.conll - development: - - dev-conll/dev.conll - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml deleted file mode 100644 index 31254a6153..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml +++ /dev/null @@ -1,41 +0,0 @@ -groupId: org.dkpro.core.datasets.wasr -datasetId: wasr-l-en -version: 1.00 -language: en -mediaType: text/x.org.dkpro.conll-2009 -encoding: UTF-8 - -name: English Word Sense and Semantic Role Datasets (WaSR) -url: https://www.ukp.tu-darmstadt.de/data/semantic-role-resources/knowledge-based-semantic-role-labeling/ -attribution: | - Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic - Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the - Association for Computational Linguistics, vol. 4, no. 1, p. (to appear), 2016. - (link:https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/publikationen/2016/717-cameraready.pdf[PDF]) -description: | - German Frame and Role Annotations. - - (This description has been sourced from the README file included with the corpus). - -licenses: - - name: CC-BY-NC-ND 3.0 - url: https://creativecommons.org/licenses/by-nc-nd/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by-nc-nd/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - data.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-de_v1.tar.bz2" - sha1: b706711ae6fffc94409f80b635595bd45d8c2ece - actions: - - action: explode - configuration: { strip: 1 } - - action: explode - configuration: { file: "data/WaSR-de_all.tar.bz2" } - -roles: - licenses: - - LICENSE.txt - data: - - "WaSR-de_all/WaSR-de_all.tsv" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml deleted file mode 100644 index ec9577c2bb..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml +++ /dev/null @@ -1,42 +0,0 @@ -groupId: org.dkpro.core.datasets.wasr -datasetId: wasr-l-en -version: 1.00 -language: en -mediaType: text/x.org.dkpro.conll-2009 -encoding: UTF-8 - -name: English Word Sense and Semantic Role Datasets (WaSR) -url: https://www.ukp.tu-darmstadt.de/data/semantic-role-resources/knowledge-based-semantic-role-labeling/ -attribution: | - Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic - Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the - Association for Computational Linguistics, vol. 4, no. 1, p. (to appear), 2016. - (link:https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/publikationen/2016/717-cameraready.pdf[PDF]) -description: | - English Frame and Role Annotations. - - (This description has been sourced from the README file included with the corpus). - -licenses: - - name: CC-BY-NC-ND 3.0 - url: https://creativecommons.org/licenses/by-nc-nd/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by-nc-nd/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - part1.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-en_v1_part1.tar.bz2" - sha1: ef7ccf5cb23da63003bdb19d99b15b0ea2821e55 - shared: true - actions: - - action: explode - configuration: { strip: 1, includes: [ "README", "WaSR_L_all.7z" ] } - - action: explode - configuration: { file: "part1/WaSR_L_all.7z" } - -roles: - licenses: - - LICENSE.txt - data: - - "WaSR_L_all/WaSR_L_all.tsv" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml deleted file mode 100644 index e1657a8e9f..0000000000 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml +++ /dev/null @@ -1,63 +0,0 @@ -groupId: org.dkpro.core.datasets.wasr -datasetId: wasr-xl-en -version: 1.00 -language: en -mediaType: text/x.org.dkpro.conll-2009 -encoding: UTF-8 - -name: English Word Sense and Semantic Role Datasets (WaSR) -url: https://www.ukp.tu-darmstadt.de/data/semantic-role-resources/knowledge-based-semantic-role-labeling/ -attribution: | - Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic - Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the - Association for Computational Linguistics, vol. 4, no. 1, p. (to appear), 2016. - (link:https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/publikationen/2016/717-cameraready.pdf[PDF]) -description: | - English Frame and Role Annotations. - - (This description has been sourced from the README file included with the corpus). - -licenses: - - name: CC-BY-NC-ND 3.0 - url: https://creativecommons.org/licenses/by-nc-nd/3.0/ - -artifacts: - LICENSE.txt: - url: http://creativecommons.org/licenses/by-nc-nd/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 - part1.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-en_v1_part1.tar.bz2" - sha1: ef7ccf5cb23da63003bdb19d99b15b0ea2821e55 - shared: true - actions: - - action: explode - configuration: { strip: 1, includes: [ "README", "WaSR_XL_part1_3.7z" ] } - - action: explode - configuration: { file: "part1/WaSR_XL_part1_3.7z" } - part2.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-en_v1_part2.tar.bz2" - sha1: 0a9c98cbf1fe02841edf52e963444a7e38986577 - shared: true - actions: - - action: explode - configuration: { strip: 1 } - - action: explode - configuration: { file: "part2/WaSR_XL_part2_3.7z" } - part3.tar.bz2: - url: "https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/data/semantic_role_resources/WaSR-en_v1_part3.tar.bz2" - sha1: 9c0cc79ecab9140f82683d39ed6acb51b148f9f7 - shared: true - actions: - - action: explode - configuration: { strip: 1 } - - action: explode - configuration: { file: "part3/WaSR_XL_part3_3.7z" } - -roles: - licenses: - - LICENSE.txt - data: - - "WaSR_XL_part1_3/WaSR_XL_part1_3.tsv" - - "WaSR_XL_part2_3/WaSR_XL_part2_3.tsv" - - "WaSR_XL_part3_3/WaSR_XL_part3_3.tsv" - diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml new file mode 100644 index 0000000000..c4fcfc94fb --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/alpino-conll-nl-20100114.yaml @@ -0,0 +1,34 @@ +groupId: org.dkpro.core.datasets.alpino +datasetId: alpino-alpino2conll +version: 20100114 +language: nl +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: Alpino2conll +url: http://www.let.rug.nl/~bplank/alpino2conll/ +attribution: | + Barbara Plank. Improved statistical measures to assess natural language parser performance across + domains. In Proceedings of the 7th International Conference on Language Resources and Evaluation + (LREC2010), Valletta, Malta, May 2010. +description: | + Training and test datasets for Dutch in retagged CoNLL format. The data was converted from Alpino + XML into CoNLL format based on an adapted version of Erwin Marsi's conversion software, but PoS + tags were replaced by automatically assigned Alpino tags. + + (This description has been sourced from the corpus website). + +artifacts: + cdb.conll.utf8: + url: http://www.let.rug.nl/~bplank/alpino2conll/data/cdb.conll.utf8 + sha1: f5e1517383f4489c8cb0c75ad202ac57c21874fc + sha512: d3702175a3a233cd3b158ae5854ec28ae0bc058108b7c0dac071eff01383501d79c098ae3eeb04c516e105d07bb3dc69bbb7aef6eb19f97607ad73931ee80a48 + conll2006-test.conll: + url: http://www.let.rug.nl/~bplank/alpino2conll/data/conll2006-test.conll + sha1: c055154ae56dfa8c29d304ed852af90aedf00a5d + sha512: 34792f773363d4b25b396748fd78b14c0e88fc46793800148fa59855835077b1a2a67f56d4989fb58407a4dc79afdc8e09ab803b9dc32ceac22bd9f9d9a4725b + +roles: + data: + - "cdb.conll.utf8" + - "conll2006-test.conll" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml new file mode 100644 index 0000000000..e966f196e9 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/aqmar-ar-1.0.yaml @@ -0,0 +1,42 @@ +groupId: org.dkpro.core.datasets.aqmar +datasetId: aqmar +version: 1.0 +language: ar +mediaType: text/x.org.dkpro.conll-2000 +encoding: UTF-8 + +name: AQMAR Arabic Wikipedia Named Entity Corpus +url: http://www.cs.cmu.edu/~ark/ArabicNER/ +description: | + 73,853 tokens in 28 Arabic Wikipedia articles hand-annotated for named entities. + + (This description has been partially copied from the corpus website). + +attribution: | + By Behrang Mohit, Nathan Schneider, Rishav Bhowmick, Kemal Oflazer, and Noah Smith as part of the + AQMAR project. + +licenses: + - name: CC-BY-SA 3.0 + url: https://creativecommons.org/licenses/by-sa/3.0/ + +artifacts: + LICENSE.txt: + url: http://www.cs.cmu.edu/~ark/ArabicNER/corpus/LICENSE + sha1: 54977f4065ec070057e99b4b446273e5c8f071d2 + sha512: 10ebe8ff7e3e41c65ff1ce412c6af0dc5bde5eedd1847440e82d50629f102ed7f8d1af24e551ea5c7b2bb846f186edeeda5d0bc853774e41cdb70b78a5158180 + verificationMode: TEXT + data.zip: + url: "http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip" + sha1: 4fa2c37d7673bb456c6e382566a091545531d85f + sha512: 3936cbc9a0e8f07090cab1cac27b348352bafccf427a47d5257b6975e0231b27c7e62c8d86d22a0c533310bdcbebd7cbc1ae91c727265727bc1ca0dd540a6b4c + actions: + - action: explode + configuration: { includes: "*.txt" } + +roles: + licenses: + - LICENSE.txt + data: + - "data/*.txt" + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/brown-en-teixml.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/brown-en-teixml.yaml similarity index 91% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/brown-en-teixml.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/brown-en-teixml.yaml index 77f68e3ab7..9d00638e78 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/brown-en-teixml.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/brown-en-teixml.yaml @@ -38,6 +38,7 @@ artifacts: brown.zip: url: "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown_tei.zip" sha1: 1e4eadeb358f6f7e6ac9b3677a82f4353bbe91ed + sha512: f3dcc36bcab63d481e4d833c8946f10163f732166114c8fdd63932fff9fba3c236593a082ebcdf96f74aea6d33e424b7be4c645fd0f5ee5090f0335544c02c47 actions: - action: explode configuration: { excludes: "Corpus.xml", strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml similarity index 78% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml index 18cbf66717..682ba7ff79 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/cdt-conll-da-1.yaml @@ -28,11 +28,14 @@ licenses: artifacts: LICENSE.txt: - url: https://www.gnu.org/licenses/gpl-2.0.txt - sha1: 4cc77b90af91e615a64ae04893fdffa7939db84c + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/gpl-2.0.txt + sha1: 0e5aad9553dc0ed784ec220bb09e22d52fefbb8b + sha512: 7881dbc2d75fd63161fa31c2209b21c7858e9664c3cab00fcae14bddac91bded9eba2f34252b5d734fa3f98c35e6ed3a388044eba3bbb746233aafac182cc442 + verificationMode: TEXT data.zip: url: https://github.com/mbkromann/copenhagen-dependency-treebank/archive/2fa64f811364db42407fb4bcdd2189d4ee33bda1.zip sha1: 11313d405abb0f268247a2d5420afa413eb244e7 + sha512: 9184e3bb3e07caffd932f38060a37d80aa294f6b3c05cd68754ed46d8a82cb892b94a348e1ccf17104740daf900a2f7a7adda4bce35615e8138970ee949e3da5 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-3.0.txt b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-3.0.txt new file mode 100644 index 0000000000..bd32fa8477 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-3.0.txt @@ -0,0 +1,319 @@ +Creative Commons Legal Code + +Attribution 3.0 Unported + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR + DAMAGES RESULTING FROM ITS USE. + +License + +THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE +COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY +COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS +AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. + +BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE +TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY +BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS +CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND +CONDITIONS. + +1. Definitions + + a. "Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + b. "Collection" means a collection of literary or artistic works, such as + encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(f) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. A work that constitutes a Collection will not be + considered an Adaptation (as defined above) for the purposes of this + License. + c. "Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + d. "Licensor" means the individual, individuals, entity or entities that + offer(s) the Work under the terms of this License. + e. "Original Author" means, in the case of a literary or artistic work, + the individual, individuals, entity or entities who created the Work + or if no individual or entity can be identified, the publisher; and in + addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + f. "Work" means the literary and/or artistic work offered under the terms + of this License including without limitation any production in the + literary, scientific and artistic domain, whatever may be the mode or + form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + g. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + h. "Publicly Perform" means to perform public recitations of the Work and + to communicate to the public those public recitations, by any means or + process, including by wire or wireless means or public digital + performances; to make available to the public Works in such a way that + members of the public may access these Works from a place and at a + place individually chosen by them; to perform the Work to the public + by any means or process and the communication to the public of the + performances of the Work, including by public digital performance; to + broadcast and rebroadcast the Work by any means including signs, + sounds or images. + i. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + +2. Fair Dealing Rights. Nothing in this License is intended to reduce, +limit, or restrict any uses free from copyright or rights arising from +limitations or exceptions that are provided for in connection with the +copyright protection under copyright law or other applicable laws. + +3. License Grant. Subject to the terms and conditions of this License, +Licensor hereby grants You a worldwide, royalty-free, non-exclusive, +perpetual (for the duration of the applicable copyright) license to +exercise the rights in the Work as stated below: + + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + b. to create and Reproduce Adaptations provided that any such Adaptation, + including any translation in any medium, takes reasonable steps to + clearly label, demarcate or otherwise identify that changes were made + to the original Work. For example, a translation could be marked "The + original work was translated from English to Spanish," or a + modification could indicate "The original work has been modified."; + c. to Distribute and Publicly Perform the Work including as incorporated + in Collections; and, + d. to Distribute and Publicly Perform Adaptations. + e. For the avoidance of doubt: + + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor + reserves the exclusive right to collect such royalties for any + exercise by You of the rights granted under this License; + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor waives the + exclusive right to collect such royalties for any exercise by You + of the rights granted under this License; and, + iii. Voluntary License Schemes. The Licensor waives the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise + by You of the rights granted under this License. + +The above rights may be exercised in all media and formats whether now +known or hereafter devised. The above rights include the right to make +such modifications as are technically necessary to exercise the rights in +other media and formats. Subject to Section 8(f), all rights not expressly +granted by Licensor are hereby reserved. + +4. Restrictions. The license granted in Section 3 above is expressly made +subject to and limited by the following restrictions: + + a. You may Distribute or Publicly Perform the Work only under the terms + of this License. You must include a copy of, or the Uniform Resource + Identifier (URI) for, this License with every copy of the Work You + Distribute or Publicly Perform. You may not offer or impose any terms + on the Work that restrict the terms of this License or the ability of + the recipient of the Work to exercise the rights granted to that + recipient under the terms of the License. You may not sublicense the + Work. You must keep intact all notices that refer to this License and + to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(b), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(b), as requested. + b. If You Distribute, or Publicly Perform the Work or any Adaptations or + Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and (iv) , consistent with Section 3(b), in the case of an Adaptation, + a credit identifying the use of the Work in the Adaptation (e.g., + "French translation of the Work by Original Author," or "Screenplay + based on original Work by Original Author"). The credit required by + this Section 4 (b) may be implemented in any reasonable manner; + provided, however, that in the case of a Adaptation or Collection, at + a minimum such credit will appear, if a credit for all contributing + authors of the Adaptation or Collection appears, then as part of these + credits and in a manner at least as prominent as the credits for the + other contributing authors. For the avoidance of doubt, You may only + use the credit required by this Section for the purpose of attribution + in the manner set out above and, by exercising Your rights under this + License, You may not implicitly or explicitly assert or imply any + connection with, sponsorship or endorsement by the Original Author, + Licensor and/or Attribution Parties, as appropriate, of You or Your + use of the Work, without the separate, express prior written + permission of the Original Author, Licensor and/or Attribution + Parties. + c. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + +5. Representations, Warranties and Disclaimer + +UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR +OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY +KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, +INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, +FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF +LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, +WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION +OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. + +6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE +LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR +ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES +ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS +BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. Termination + + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this License. + Individuals or entities who have received Adaptations or Collections + from You under this License, however, will not have their licenses + terminated provided such individuals or entities remain in full + compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will + survive any termination of this License. + b. Subject to the above terms and conditions, the license granted here is + perpetual (for the duration of the applicable copyright in the Work). + Notwithstanding the above, Licensor reserves the right to release the + Work under different license terms or to stop distributing the Work at + any time; provided, however that any such election will not serve to + withdraw this License (or any other license that has been, or is + required to be, granted under the terms of this License), and this + License will continue in full force and effect unless terminated as + stated above. + +8. Miscellaneous + + a. Each time You Distribute or Publicly Perform the Work or a Collection, + the Licensor offers to the recipient a license to the Work on the same + terms and conditions as the license granted to You under this License. + b. Each time You Distribute or Publicly Perform an Adaptation, Licensor + offers to the recipient a license to the original Work on the same + terms and conditions as the license granted to You under this License. + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + e. This License constitutes the entire agreement between the parties with + respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + f. The rights granted under, and the subject matter referenced, in this + License were drafted utilizing the terminology of the Berne Convention + for the Protection of Literary and Artistic Works (as amended on + September 28, 1979), the Rome Convention of 1961, the WIPO Copyright + Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 + and the Universal Copyright Convention (as revised on July 24, 1971). + These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. + + +Creative Commons Notice + + Creative Commons is not a party to this License, and makes no warranty + whatsoever in connection with the Work. Creative Commons will not be + liable to You or any party on any legal theory for any damages + whatsoever, including without limitation any general, special, + incidental or consequential damages arising in connection to this + license. Notwithstanding the foregoing two (2) sentences, if Creative + Commons has expressly identified itself as the Licensor hereunder, it + shall have all rights and obligations of Licensor. + + Except for the limited purpose of indicating to the public that the + Work is licensed under the CCPL, Creative Commons does not authorize + the use by either party of the trademark "Creative Commons" or any + related trademark or logo of Creative Commons without the prior + written consent of Creative Commons. Any permitted use will be in + compliance with Creative Commons' then-current trademark usage + guidelines, as may be published on its website or otherwise made + available upon request from time to time. For the avoidance of doubt, + this trademark restriction does not form part of this License. + + Creative Commons may be contacted at https://creativecommons.org/. \ No newline at end of file diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-4.0.txt b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-4.0.txt new file mode 100644 index 0000000000..2fb94156cd --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-4.0.txt @@ -0,0 +1,396 @@ +Attribution 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution 4.0 International Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution 4.0 International Public License ("Public License"). To the +extent this Public License may be interpreted as a contract, You are +granted the Licensed Rights in consideration of Your acceptance of +these terms and conditions, and the Licensor grants You such rights in +consideration of benefits the Licensor receives from making the +Licensed Material available under these terms and conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + j. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + k. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-3.0.txt b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-3.0.txt new file mode 100644 index 0000000000..197ec4de65 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-3.0.txt @@ -0,0 +1,334 @@ +Creative Commons Legal Code + +Attribution-NonCommercial 3.0 Unported + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR + DAMAGES RESULTING FROM ITS USE. + +License + +THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE +COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY +COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS +AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. + +BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE +TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY +BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS +CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND +CONDITIONS. + +1. Definitions + + a. "Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + b. "Collection" means a collection of literary or artistic works, such as + encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(f) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. A work that constitutes a Collection will not be + considered an Adaptation (as defined above) for the purposes of this + License. + c. "Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + d. "Licensor" means the individual, individuals, entity or entities that + offer(s) the Work under the terms of this License. + e. "Original Author" means, in the case of a literary or artistic work, + the individual, individuals, entity or entities who created the Work + or if no individual or entity can be identified, the publisher; and in + addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + f. "Work" means the literary and/or artistic work offered under the terms + of this License including without limitation any production in the + literary, scientific and artistic domain, whatever may be the mode or + form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + g. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + h. "Publicly Perform" means to perform public recitations of the Work and + to communicate to the public those public recitations, by any means or + process, including by wire or wireless means or public digital + performances; to make available to the public Works in such a way that + members of the public may access these Works from a place and at a + place individually chosen by them; to perform the Work to the public + by any means or process and the communication to the public of the + performances of the Work, including by public digital performance; to + broadcast and rebroadcast the Work by any means including signs, + sounds or images. + i. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + +2. Fair Dealing Rights. Nothing in this License is intended to reduce, +limit, or restrict any uses free from copyright or rights arising from +limitations or exceptions that are provided for in connection with the +copyright protection under copyright law or other applicable laws. + +3. License Grant. Subject to the terms and conditions of this License, +Licensor hereby grants You a worldwide, royalty-free, non-exclusive, +perpetual (for the duration of the applicable copyright) license to +exercise the rights in the Work as stated below: + + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + b. to create and Reproduce Adaptations provided that any such Adaptation, + including any translation in any medium, takes reasonable steps to + clearly label, demarcate or otherwise identify that changes were made + to the original Work. For example, a translation could be marked "The + original work was translated from English to Spanish," or a + modification could indicate "The original work has been modified."; + c. to Distribute and Publicly Perform the Work including as incorporated + in Collections; and, + d. to Distribute and Publicly Perform Adaptations. + +The above rights may be exercised in all media and formats whether now +known or hereafter devised. The above rights include the right to make +such modifications as are technically necessary to exercise the rights in +other media and formats. Subject to Section 8(f), all rights not expressly +granted by Licensor are hereby reserved, including but not limited to the +rights set forth in Section 4(d). + +4. Restrictions. The license granted in Section 3 above is expressly made +subject to and limited by the following restrictions: + + a. You may Distribute or Publicly Perform the Work only under the terms + of this License. You must include a copy of, or the Uniform Resource + Identifier (URI) for, this License with every copy of the Work You + Distribute or Publicly Perform. You may not offer or impose any terms + on the Work that restrict the terms of this License or the ability of + the recipient of the Work to exercise the rights granted to that + recipient under the terms of the License. You may not sublicense the + Work. You must keep intact all notices that refer to this License and + to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(c), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(c), as requested. + b. You may not exercise any of the rights granted to You in Section 3 + above in any manner that is primarily intended for or directed toward + commercial advantage or private monetary compensation. The exchange of + the Work for other copyrighted works by means of digital file-sharing + or otherwise shall not be considered to be intended for or directed + toward commercial advantage or private monetary compensation, provided + there is no payment of any monetary compensation in connection with + the exchange of copyrighted works. + c. If You Distribute, or Publicly Perform the Work or any Adaptations or + Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and, (iv) consistent with Section 3(b), in the case of an Adaptation, + a credit identifying the use of the Work in the Adaptation (e.g., + "French translation of the Work by Original Author," or "Screenplay + based on original Work by Original Author"). The credit required by + this Section 4(c) may be implemented in any reasonable manner; + provided, however, that in the case of a Adaptation or Collection, at + a minimum such credit will appear, if a credit for all contributing + authors of the Adaptation or Collection appears, then as part of these + credits and in a manner at least as prominent as the credits for the + other contributing authors. For the avoidance of doubt, You may only + use the credit required by this Section for the purpose of attribution + in the manner set out above and, by exercising Your rights under this + License, You may not implicitly or explicitly assert or imply any + connection with, sponsorship or endorsement by the Original Author, + Licensor and/or Attribution Parties, as appropriate, of You or Your + use of the Work, without the separate, express prior written + permission of the Original Author, Licensor and/or Attribution + Parties. + d. For the avoidance of doubt: + + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor + reserves the exclusive right to collect such royalties for any + exercise by You of the rights granted under this License; + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor reserves + the exclusive right to collect such royalties for any exercise by + You of the rights granted under this License if Your exercise of + such rights is for a purpose or use which is otherwise than + noncommercial as permitted under Section 4(b) and otherwise waives + the right to collect royalties through any statutory or compulsory + licensing scheme; and, + iii. Voluntary License Schemes. The Licensor reserves the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise + by You of the rights granted under this License that is for a + purpose or use which is otherwise than noncommercial as permitted + under Section 4(c). + e. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + +5. Representations, Warranties and Disclaimer + +UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR +OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY +KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, +INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, +FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF +LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, +WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION +OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. + +6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE +LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR +ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES +ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS +BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. Termination + + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this License. + Individuals or entities who have received Adaptations or Collections + from You under this License, however, will not have their licenses + terminated provided such individuals or entities remain in full + compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will + survive any termination of this License. + b. Subject to the above terms and conditions, the license granted here is + perpetual (for the duration of the applicable copyright in the Work). + Notwithstanding the above, Licensor reserves the right to release the + Work under different license terms or to stop distributing the Work at + any time; provided, however that any such election will not serve to + withdraw this License (or any other license that has been, or is + required to be, granted under the terms of this License), and this + License will continue in full force and effect unless terminated as + stated above. + +8. Miscellaneous + + a. Each time You Distribute or Publicly Perform the Work or a Collection, + the Licensor offers to the recipient a license to the Work on the same + terms and conditions as the license granted to You under this License. + b. Each time You Distribute or Publicly Perform an Adaptation, Licensor + offers to the recipient a license to the original Work on the same + terms and conditions as the license granted to You under this License. + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + e. This License constitutes the entire agreement between the parties with + respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + f. The rights granted under, and the subject matter referenced, in this + License were drafted utilizing the terminology of the Berne Convention + for the Protection of Literary and Artistic Works (as amended on + September 28, 1979), the Rome Convention of 1961, the WIPO Copyright + Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 + and the Universal Copyright Convention (as revised on July 24, 1971). + These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. + + +Creative Commons Notice + + Creative Commons is not a party to this License, and makes no warranty + whatsoever in connection with the Work. Creative Commons will not be + liable to You or any party on any legal theory for any damages + whatsoever, including without limitation any general, special, + incidental or consequential damages arising in connection to this + license. Notwithstanding the foregoing two (2) sentences, if Creative + Commons has expressly identified itself as the Licensor hereunder, it + shall have all rights and obligations of Licensor. + + Except for the limited purpose of indicating to the public that the + Work is licensed under the CCPL, Creative Commons does not authorize + the use by either party of the trademark "Creative Commons" or any + related trademark or logo of Creative Commons without the prior + written consent of Creative Commons. Any permitted use will be in + compliance with Creative Commons' then-current trademark usage + guidelines, as may be published on its website or otherwise made + available upon request from time to time. For the avoidance of doubt, + this trademark restriction does not form part of the License. + + Creative Commons may be contacted at https://creativecommons.org/. diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-3.0.txt b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-3.0.txt new file mode 100644 index 0000000000..a71ea7947d --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-3.0.txt @@ -0,0 +1,361 @@ +Creative Commons Legal Code + +Attribution-NonCommercial-ShareAlike 3.0 Unported + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR + DAMAGES RESULTING FROM ITS USE. + +License + +THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE +COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY +COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS +AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. + +BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE +TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY +BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS +CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND +CONDITIONS. + +1. Definitions + + a. "Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + b. "Collection" means a collection of literary or artistic works, such as + encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(g) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. A work that constitutes a Collection will not be + considered an Adaptation (as defined above) for the purposes of this + License. + c. "Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + d. "License Elements" means the following high-level license attributes + as selected by Licensor and indicated in the title of this License: + Attribution, Noncommercial, ShareAlike. + e. "Licensor" means the individual, individuals, entity or entities that + offer(s) the Work under the terms of this License. + f. "Original Author" means, in the case of a literary or artistic work, + the individual, individuals, entity or entities who created the Work + or if no individual or entity can be identified, the publisher; and in + addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + g. "Work" means the literary and/or artistic work offered under the terms + of this License including without limitation any production in the + literary, scientific and artistic domain, whatever may be the mode or + form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + h. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + i. "Publicly Perform" means to perform public recitations of the Work and + to communicate to the public those public recitations, by any means or + process, including by wire or wireless means or public digital + performances; to make available to the public Works in such a way that + members of the public may access these Works from a place and at a + place individually chosen by them; to perform the Work to the public + by any means or process and the communication to the public of the + performances of the Work, including by public digital performance; to + broadcast and rebroadcast the Work by any means including signs, + sounds or images. + j. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + +2. Fair Dealing Rights. Nothing in this License is intended to reduce, +limit, or restrict any uses free from copyright or rights arising from +limitations or exceptions that are provided for in connection with the +copyright protection under copyright law or other applicable laws. + +3. License Grant. Subject to the terms and conditions of this License, +Licensor hereby grants You a worldwide, royalty-free, non-exclusive, +perpetual (for the duration of the applicable copyright) license to +exercise the rights in the Work as stated below: + + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + b. to create and Reproduce Adaptations provided that any such Adaptation, + including any translation in any medium, takes reasonable steps to + clearly label, demarcate or otherwise identify that changes were made + to the original Work. For example, a translation could be marked "The + original work was translated from English to Spanish," or a + modification could indicate "The original work has been modified."; + c. to Distribute and Publicly Perform the Work including as incorporated + in Collections; and, + d. to Distribute and Publicly Perform Adaptations. + +The above rights may be exercised in all media and formats whether now +known or hereafter devised. The above rights include the right to make +such modifications as are technically necessary to exercise the rights in +other media and formats. Subject to Section 8(f), all rights not expressly +granted by Licensor are hereby reserved, including but not limited to the +rights described in Section 4(e). + +4. Restrictions. The license granted in Section 3 above is expressly made +subject to and limited by the following restrictions: + + a. You may Distribute or Publicly Perform the Work only under the terms + of this License. You must include a copy of, or the Uniform Resource + Identifier (URI) for, this License with every copy of the Work You + Distribute or Publicly Perform. You may not offer or impose any terms + on the Work that restrict the terms of this License or the ability of + the recipient of the Work to exercise the rights granted to that + recipient under the terms of the License. You may not sublicense the + Work. You must keep intact all notices that refer to this License and + to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(d), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(d), as requested. + b. You may Distribute or Publicly Perform an Adaptation only under: (i) + the terms of this License; (ii) a later version of this License with + the same License Elements as this License; (iii) a Creative Commons + jurisdiction license (either this or a later license version) that + contains the same License Elements as this License (e.g., + Attribution-NonCommercial-ShareAlike 3.0 US) ("Applicable License"). + You must include a copy of, or the URI, for Applicable License with + every copy of each Adaptation You Distribute or Publicly Perform. You + may not offer or impose any terms on the Adaptation that restrict the + terms of the Applicable License or the ability of the recipient of the + Adaptation to exercise the rights granted to that recipient under the + terms of the Applicable License. You must keep intact all notices that + refer to the Applicable License and to the disclaimer of warranties + with every copy of the Work as included in the Adaptation You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Adaptation, You may not impose any effective technological + measures on the Adaptation that restrict the ability of a recipient of + the Adaptation from You to exercise the rights granted to that + recipient under the terms of the Applicable License. This Section 4(b) + applies to the Adaptation as incorporated in a Collection, but this + does not require the Collection apart from the Adaptation itself to be + made subject to the terms of the Applicable License. + c. You may not exercise any of the rights granted to You in Section 3 + above in any manner that is primarily intended for or directed toward + commercial advantage or private monetary compensation. The exchange of + the Work for other copyrighted works by means of digital file-sharing + or otherwise shall not be considered to be intended for or directed + toward commercial advantage or private monetary compensation, provided + there is no payment of any monetary compensation in con-nection with + the exchange of copyrighted works. + d. If You Distribute, or Publicly Perform the Work or any Adaptations or + Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and, (iv) consistent with Section 3(b), in the case of an Adaptation, + a credit identifying the use of the Work in the Adaptation (e.g., + "French translation of the Work by Original Author," or "Screenplay + based on original Work by Original Author"). The credit required by + this Section 4(d) may be implemented in any reasonable manner; + provided, however, that in the case of a Adaptation or Collection, at + a minimum such credit will appear, if a credit for all contributing + authors of the Adaptation or Collection appears, then as part of these + credits and in a manner at least as prominent as the credits for the + other contributing authors. For the avoidance of doubt, You may only + use the credit required by this Section for the purpose of attribution + in the manner set out above and, by exercising Your rights under this + License, You may not implicitly or explicitly assert or imply any + connection with, sponsorship or endorsement by the Original Author, + Licensor and/or Attribution Parties, as appropriate, of You or Your + use of the Work, without the separate, express prior written + permission of the Original Author, Licensor and/or Attribution + Parties. + e. For the avoidance of doubt: + + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor + reserves the exclusive right to collect such royalties for any + exercise by You of the rights granted under this License; + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor reserves + the exclusive right to collect such royalties for any exercise by + You of the rights granted under this License if Your exercise of + such rights is for a purpose or use which is otherwise than + noncommercial as permitted under Section 4(c) and otherwise waives + the right to collect royalties through any statutory or compulsory + licensing scheme; and, + iii. Voluntary License Schemes. The Licensor reserves the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise + by You of the rights granted under this License that is for a + purpose or use which is otherwise than noncommercial as permitted + under Section 4(c). + f. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + +5. Representations, Warranties and Disclaimer + +UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING AND TO THE +FULLEST EXTENT PERMITTED BY APPLICABLE LAW, LICENSOR OFFERS THE WORK AS-IS +AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE +WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT +LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, +ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT +DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED +WARRANTIES, SO THIS EXCLUSION MAY NOT APPLY TO YOU. + +6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE +LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR +ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES +ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS +BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. Termination + + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this License. + Individuals or entities who have received Adaptations or Collections + from You under this License, however, will not have their licenses + terminated provided such individuals or entities remain in full + compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will + survive any termination of this License. + b. Subject to the above terms and conditions, the license granted here is + perpetual (for the duration of the applicable copyright in the Work). + Notwithstanding the above, Licensor reserves the right to release the + Work under different license terms or to stop distributing the Work at + any time; provided, however that any such election will not serve to + withdraw this License (or any other license that has been, or is + required to be, granted under the terms of this License), and this + License will continue in full force and effect unless terminated as + stated above. + +8. Miscellaneous + + a. Each time You Distribute or Publicly Perform the Work or a Collection, + the Licensor offers to the recipient a license to the Work on the same + terms and conditions as the license granted to You under this License. + b. Each time You Distribute or Publicly Perform an Adaptation, Licensor + offers to the recipient a license to the original Work on the same + terms and conditions as the license granted to You under this License. + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + e. This License constitutes the entire agreement between the parties with + respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + f. The rights granted under, and the subject matter referenced, in this + License were drafted utilizing the terminology of the Berne Convention + for the Protection of Literary and Artistic Works (as amended on + September 28, 1979), the Rome Convention of 1961, the WIPO Copyright + Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 + and the Universal Copyright Convention (as revised on July 24, 1971). + These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. + + +Creative Commons Notice + + Creative Commons is not a party to this License, and makes no warranty + whatsoever in connection with the Work. Creative Commons will not be + liable to You or any party on any legal theory for any damages + whatsoever, including without limitation any general, special, + incidental or consequential damages arising in connection to this + license. Notwithstanding the foregoing two (2) sentences, if Creative + Commons has expressly identified itself as the Licensor hereunder, it + shall have all rights and obligations of Licensor. + + Except for the limited purpose of indicating to the public that the + Work is licensed under the CCPL, Creative Commons does not authorize + the use by either party of the trademark "Creative Commons" or any + related trademark or logo of Creative Commons without the prior + written consent of Creative Commons. Any permitted use will be in + compliance with Creative Commons' then-current trademark usage + guidelines, as may be published on its website or otherwise made + available upon request from time to time. For the avoidance of doubt, + this trademark restriction does not form part of this License. + + Creative Commons may be contacted at https://creativecommons.org/. + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-4.0.txt b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-4.0.txt new file mode 100644 index 0000000000..7cdbe0b482 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-4.0.txt @@ -0,0 +1,438 @@ +Attribution-NonCommercial-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International +Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial-ShareAlike 4.0 International Public License +("Public License"). To the extent this Public License may be +interpreted as a contract, You are granted the Licensed Rights in +consideration of Your acceptance of these terms and conditions, and the +Licensor grants You such rights in consideration of benefits the +Licensor receives from making the Licensed Material available under +these terms and conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. BY-NC-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. The License Elements of this + Public License are Attribution, NonCommercial, and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + l. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + m. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + n. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-NC-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + including for purposes of Section 3(b); and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-3.0.txt b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-3.0.txt new file mode 100644 index 0000000000..604209a804 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-3.0.txt @@ -0,0 +1,359 @@ +Creative Commons Legal Code + +Attribution-ShareAlike 3.0 Unported + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR + DAMAGES RESULTING FROM ITS USE. + +License + +THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE +COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY +COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS +AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. + +BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE +TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY +BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS +CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND +CONDITIONS. + +1. Definitions + + a. "Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + b. "Collection" means a collection of literary or artistic works, such as + encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(f) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. A work that constitutes a Collection will not be + considered an Adaptation (as defined below) for the purposes of this + License. + c. "Creative Commons Compatible License" means a license that is listed + at https://creativecommons.org/compatiblelicenses that has been + approved by Creative Commons as being essentially equivalent to this + License, including, at a minimum, because that license: (i) contains + terms that have the same purpose, meaning and effect as the License + Elements of this License; and, (ii) explicitly permits the relicensing + of adaptations of works made available under that license under this + License or a Creative Commons jurisdiction license with the same + License Elements as this License. + d. "Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + e. "License Elements" means the following high-level license attributes + as selected by Licensor and indicated in the title of this License: + Attribution, ShareAlike. + f. "Licensor" means the individual, individuals, entity or entities that + offer(s) the Work under the terms of this License. + g. "Original Author" means, in the case of a literary or artistic work, + the individual, individuals, entity or entities who created the Work + or if no individual or entity can be identified, the publisher; and in + addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + h. "Work" means the literary and/or artistic work offered under the terms + of this License including without limitation any production in the + literary, scientific and artistic domain, whatever may be the mode or + form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + i. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + j. "Publicly Perform" means to perform public recitations of the Work and + to communicate to the public those public recitations, by any means or + process, including by wire or wireless means or public digital + performances; to make available to the public Works in such a way that + members of the public may access these Works from a place and at a + place individually chosen by them; to perform the Work to the public + by any means or process and the communication to the public of the + performances of the Work, including by public digital performance; to + broadcast and rebroadcast the Work by any means including signs, + sounds or images. + k. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + +2. Fair Dealing Rights. Nothing in this License is intended to reduce, +limit, or restrict any uses free from copyright or rights arising from +limitations or exceptions that are provided for in connection with the +copyright protection under copyright law or other applicable laws. + +3. License Grant. Subject to the terms and conditions of this License, +Licensor hereby grants You a worldwide, royalty-free, non-exclusive, +perpetual (for the duration of the applicable copyright) license to +exercise the rights in the Work as stated below: + + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + b. to create and Reproduce Adaptations provided that any such Adaptation, + including any translation in any medium, takes reasonable steps to + clearly label, demarcate or otherwise identify that changes were made + to the original Work. For example, a translation could be marked "The + original work was translated from English to Spanish," or a + modification could indicate "The original work has been modified."; + c. to Distribute and Publicly Perform the Work including as incorporated + in Collections; and, + d. to Distribute and Publicly Perform Adaptations. + e. For the avoidance of doubt: + + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor + reserves the exclusive right to collect such royalties for any + exercise by You of the rights granted under this License; + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor waives the + exclusive right to collect such royalties for any exercise by You + of the rights granted under this License; and, + iii. Voluntary License Schemes. The Licensor waives the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise + by You of the rights granted under this License. + +The above rights may be exercised in all media and formats whether now +known or hereafter devised. The above rights include the right to make +such modifications as are technically necessary to exercise the rights in +other media and formats. Subject to Section 8(f), all rights not expressly +granted by Licensor are hereby reserved. + +4. Restrictions. The license granted in Section 3 above is expressly made +subject to and limited by the following restrictions: + + a. You may Distribute or Publicly Perform the Work only under the terms + of this License. You must include a copy of, or the Uniform Resource + Identifier (URI) for, this License with every copy of the Work You + Distribute or Publicly Perform. You may not offer or impose any terms + on the Work that restrict the terms of this License or the ability of + the recipient of the Work to exercise the rights granted to that + recipient under the terms of the License. You may not sublicense the + Work. You must keep intact all notices that refer to this License and + to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(c), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(c), as requested. + b. You may Distribute or Publicly Perform an Adaptation only under the + terms of: (i) this License; (ii) a later version of this License with + the same License Elements as this License; (iii) a Creative Commons + jurisdiction license (either this or a later license version) that + contains the same License Elements as this License (e.g., + Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible + License. If you license the Adaptation under one of the licenses + mentioned in (iv), you must comply with the terms of that license. If + you license the Adaptation under the terms of any of the licenses + mentioned in (i), (ii) or (iii) (the "Applicable License"), you must + comply with the terms of the Applicable License generally and the + following provisions: (I) You must include a copy of, or the URI for, + the Applicable License with every copy of each Adaptation You + Distribute or Publicly Perform; (II) You may not offer or impose any + terms on the Adaptation that restrict the terms of the Applicable + License or the ability of the recipient of the Adaptation to exercise + the rights granted to that recipient under the terms of the Applicable + License; (III) You must keep intact all notices that refer to the + Applicable License and to the disclaimer of warranties with every copy + of the Work as included in the Adaptation You Distribute or Publicly + Perform; (IV) when You Distribute or Publicly Perform the Adaptation, + You may not impose any effective technological measures on the + Adaptation that restrict the ability of a recipient of the Adaptation + from You to exercise the rights granted to that recipient under the + terms of the Applicable License. This Section 4(b) applies to the + Adaptation as incorporated in a Collection, but this does not require + the Collection apart from the Adaptation itself to be made subject to + the terms of the Applicable License. + c. If You Distribute, or Publicly Perform the Work or any Adaptations or + Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and (iv) , consistent with Ssection 3(b), in the case of an + Adaptation, a credit identifying the use of the Work in the Adaptation + (e.g., "French translation of the Work by Original Author," or + "Screenplay based on original Work by Original Author"). The credit + required by this Section 4(c) may be implemented in any reasonable + manner; provided, however, that in the case of a Adaptation or + Collection, at a minimum such credit will appear, if a credit for all + contributing authors of the Adaptation or Collection appears, then as + part of these credits and in a manner at least as prominent as the + credits for the other contributing authors. For the avoidance of + doubt, You may only use the credit required by this Section for the + purpose of attribution in the manner set out above and, by exercising + Your rights under this License, You may not implicitly or explicitly + assert or imply any connection with, sponsorship or endorsement by the + Original Author, Licensor and/or Attribution Parties, as appropriate, + of You or Your use of the Work, without the separate, express prior + written permission of the Original Author, Licensor and/or Attribution + Parties. + d. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + +5. Representations, Warranties and Disclaimer + +UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR +OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY +KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, +INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, +FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF +LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, +WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION +OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. + +6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE +LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR +ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES +ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS +BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. Termination + + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this License. + Individuals or entities who have received Adaptations or Collections + from You under this License, however, will not have their licenses + terminated provided such individuals or entities remain in full + compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will + survive any termination of this License. + b. Subject to the above terms and conditions, the license granted here is + perpetual (for the duration of the applicable copyright in the Work). + Notwithstanding the above, Licensor reserves the right to release the + Work under different license terms or to stop distributing the Work at + any time; provided, however that any such election will not serve to + withdraw this License (or any other license that has been, or is + required to be, granted under the terms of this License), and this + License will continue in full force and effect unless terminated as + stated above. + +8. Miscellaneous + + a. Each time You Distribute or Publicly Perform the Work or a Collection, + the Licensor offers to the recipient a license to the Work on the same + terms and conditions as the license granted to You under this License. + b. Each time You Distribute or Publicly Perform an Adaptation, Licensor + offers to the recipient a license to the original Work on the same + terms and conditions as the license granted to You under this License. + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + e. This License constitutes the entire agreement between the parties with + respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + f. The rights granted under, and the subject matter referenced, in this + License were drafted utilizing the terminology of the Berne Convention + for the Protection of Literary and Artistic Works (as amended on + September 28, 1979), the Rome Convention of 1961, the WIPO Copyright + Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 + and the Universal Copyright Convention (as revised on July 24, 1971). + These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. + + +Creative Commons Notice + + Creative Commons is not a party to this License, and makes no warranty + whatsoever in connection with the Work. Creative Commons will not be + liable to You or any party on any legal theory for any damages + whatsoever, including without limitation any general, special, + incidental or consequential damages arising in connection to this + license. Notwithstanding the foregoing two (2) sentences, if Creative + Commons has expressly identified itself as the Licensor hereunder, it + shall have all rights and obligations of Licensor. + + Except for the limited purpose of indicating to the public that the + Work is licensed under the CCPL, Creative Commons does not authorize + the use by either party of the trademark "Creative Commons" or any + related trademark or logo of Creative Commons without the prior + written consent of Creative Commons. Any permitted use will be in + compliance with Creative Commons' then-current trademark usage + guidelines, as may be published on its website or otherwise made + available upon request from time to time. For the avoidance of doubt, + this trademark restriction does not form part of the License. + + Creative Commons may be contacted at https://creativecommons.org/. diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-4.0.txt b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-4.0.txt new file mode 100644 index 0000000000..a73481c4be --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-4.0.txt @@ -0,0 +1,428 @@ +Attribution-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-ShareAlike 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-ShareAlike 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. BY-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. The License Elements of this + Public License are Attribution and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + l. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + m. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + + including for purposes of Section 3(b); and + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/gpl-2.0.txt b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/gpl-2.0.txt new file mode 100644 index 0000000000..d159169d10 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/common-licenses/gpl-2.0.txt @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2000-en.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2000-en.yaml similarity index 84% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2000-en.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2000-en.yaml index b9a166d589..5088641c96 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2000-en.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2000-en.yaml @@ -34,6 +34,8 @@ artifacts: train.txt.gz: url: "https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz" sha1: 9f31cf936554cebf558d07cce923dca0b7f31864 + sha512: 02a9be73f1bdd3d654ec2337cb64b358f0f6df0428b5da167aa95462d4fe06f4834ddd2dbad8cd2dd6eeb06d759379ae94f7dd2790e06f7d334afad902ec233c test.txt.gz: url: "https://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz" sha1: dc57527f1f60eeafad03da51235185141152f849 + sha512: 2668c8e4025cfc8067d78c5a6ef08ffa8d66883a351faeb995314a7832801b96c5997cac2117c6a34b0ed58c38073f00091b629e11a2341945c2e835a7410b5c diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2002-es.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-es.yaml similarity index 84% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2002-es.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-es.yaml index 403fe3c94b..3e121f012d 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2002-es.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-es.yaml @@ -23,8 +23,9 @@ description: | artifacts: data.tgz: - url: "http://www.cnts.ua.ac.be/conll2002/ner.tgz" + url: "https://web.archive.org/web/20170307123302if_/http://www.cnts.ua.ac.be/conll2002/ner.tgz" sha1: 686ef8fed3125a1d8aefe1351ff0e619fe9c34cb + sha512: 61a7423b1fb2bd3dac0f85b37e56a04b26d0aa8443d707191c93a9ea83da9990edab4eb71e689bd223bb38504208a17b750cbec94e436362c9f7c524da8b8e64 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2002-nl.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-nl.yaml similarity index 89% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2002-nl.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-nl.yaml index 94c0c1d6e8..831948c3f5 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2002-nl.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2002-nl.yaml @@ -22,6 +22,7 @@ artifacts: data.tgz: url: "http://www.cnts.ua.ac.be/conll2002/ner.tgz" sha1: 686ef8fed3125a1d8aefe1351ff0e619fe9c34cb + sha512: 61a7423b1fb2bd3dac0f85b37e56a04b26d0aa8443d707191c93a9ea83da9990edab4eb71e689bd223bb38504208a17b750cbec94e436362c9f7c524da8b8e64 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2006-pt.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2006-pt.yaml new file mode 100644 index 0000000000..2a3502935b --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2006-pt.yaml @@ -0,0 +1,51 @@ +groupId: org.dkpro.core.datasets.conll2006 +datasetId: conll2006 +# Didn't find any version information at the source, falling back to date of the corpus files +version: 20100302 +language: pt +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: CoNLL-2006 Shared Task (Portuguese) +url: http://ilk.uvt.nl/conll/ +attribution: Diana Santos, Eckhard Bick +description: | + This is the Portuguese part of the CONLL-X Shared Task. The was derived from the Floresta + Sintá(c)tica Bosque 7.3 by Sabine Buchholz. + + (This description has been partially sourced from the README file included with the corpus). + + NOTE: We dd not find license information for this dataset. One might assume the license of this + dataset is equivalent to that of the Floresta Sintá(c)tica from which it was derived. + +licenses: + - name: Floresta Sintá(c)tica License + url: http://www.linguateca.pt/floresta/info_floresta_English.html + +artifacts: + README.txt: + url: https://www.linguateca.pt/floresta/CoNLL-X/readme.conll + sha1: 10da89fed0ecb888c8fc7fad350b1a11bb9050d7 + sha512: 178e28f6d7e13728412736659e4afc2c46302f16c2bff860e39516707aeb5acc072bc3f0ab1852e35853b240051985b682e2e64f3483175bf941cf512cfc1b53 + verificationMode: TEXT + portuguese_bosque_train.conll: + url: https://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_train.conll + sha1: 29e630e207c74a42e0d2999193aa25d73f262920 + sha512: 32efcaece5c81e6b2fb31efeba09613ae50374c0b89c83969f2c43ca3b2a527e6944a0727e2983e374824264ef0f0c398c18a5a1f75c42764d415fca0755e524 + portuguese_bosque_test_blind.conll: + url: https://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_test_blind.conll + sha1: fabcfbd73a531e21786af9b8233f1a4aa78dfddb + sha512: fbd3382dfb5acd2a34d5ecd5cbe449c33e96b8618b69d1de772f70a799cfb5b684c209f193474e63683033528af4e5111c8c3c0af4168c3d121b448630bce424 + portuguese_bosque_test.conll: + url: https://www.linguateca.pt/floresta/CoNLL-X/portuguese_bosque_test.conll + sha1: e399cdc1203df1ff43816f3f934223cb9a625992 + sha512: 8d600d1158d87f446c2814f5adb74b8a7380cdffbf4a62e971d7d7775702b9574506b92fcfe347f0572976447a20b6499b8693ac412a89335fa71c00a1269730 + +roles: + training: + - portuguese_bosque_train.conll + testing: + - portuguese_bosque_test_blind.conll + development: + - portuguese_bosque_test.conll + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-ca.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ca.yaml similarity index 92% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-ca.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ca.yaml index e8eb8c49b3..53ce9b3d83 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-ca.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ca.yaml @@ -30,6 +30,7 @@ artifacts: data.zip: url: "http://ufal.mff.cuni.cz/conll2009-st/data/CoNLL2009-ST-Catalan-traindevC.zip" sha1: 500cbb81709012cce4d23bfa72d93c320b0b7e6f + sha512: de862ccb6ffca557453dc6d631d6b7b0125724aa56c357e67ebc38d792f866dc563dfd2dceca8c67050d4018759e499d966f19bca90048c303a4324c65a45d4d actions: - action: explode configuration: { strip: 1, includes: ["README.TXT", "datasets/*" ]} diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-de.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-de.yaml similarity index 92% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-de.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-de.yaml index 5d48189021..ea9e0e59aa 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-de.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-de.yaml @@ -35,7 +35,7 @@ artifacts: # URL of the artifact url: "http://ufal.mff.cuni.cz/conll2009-st/data/CoNLL2009-ST-German-traindevB.zip" # Checksum used to validate download and cache integrity - sha1: ad4c03c3c4e4668c8beb34c399e71f539e6d633d + sha512: ae037e60d1065c72fdf7aca6507d1249538c9ae7f2f3662305787da8bd60afa810c43226d7b36712c627d371ab8dd3e01dd6565c65971ac9479584dcaaedb6dd actions: - action: explode # Extract archive after downloading configuration: { strip: 1 } # Remove one leading path element while extracting diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-es.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-es.yaml similarity index 92% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-es.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-es.yaml index 8bfc6cb0aa..2a46fd28cf 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-es.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-es.yaml @@ -30,6 +30,7 @@ artifacts: data.zip: url: "http://ufal.mff.cuni.cz/conll2009-st/data/CoNLL2009-ST-Spanish-traindevB.zip" sha1: ef36c3369bd05966609b4b13d6bf78884c23ece1 + sha512: 2b20574c36c684bd2e406f6356298f9f853366e8627866d60e87e7d95c8d87f2b159df2dffc8ac6a632bd833ce36b0898dd7cde8e375a314ce5e7a546bcdb594 actions: - action: explode configuration: { strip: 1, excludes: [ "documentation", "documentation/**/*" ] } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-ja.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ja.yaml similarity index 90% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-ja.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ja.yaml index a8b58d8ead..971033a05f 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/conll2009-ja.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/conll2009-ja.yaml @@ -24,6 +24,7 @@ artifacts: data.zip: url: "http://ufal.mff.cuni.cz/conll2009-st/data/CoNLL2009-ST-Japanese-traindevA.zip" sha1: 8c96a1eda2527a9ba1bf37dd4125cc6af11e7dd4 + sha512: 135eb63b727e0a8b77da72af32bf5f5ec84a2bda3d7e44866a2e5091d7d23b7e723fd5540d839b3e0a3e60bd16696d1ecb2a66ddcdc020447ec8220420c4971b actions: - action: explode configuration: { strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml new file mode 100644 index 0000000000..34f564a5fd --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/coptictb-conll-cop-1.0.yaml @@ -0,0 +1,36 @@ +groupId: org.dkpro.core.datasets.coptictb +datasetId: coptictb +version: 1.0 +# There is no ISO 639-1 language code for Coptic. Have to use ISO 639-3 +language: cop +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: Coptic Treebank +url: http://copticscriptorium.org +attribution: Amir Zeldes +description: | + The Coptic Treebank from the Coptic SCRIPTORIUM corpora (http://copticscriptorium.org/). + +licenses: + - name: CC-BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +artifacts: + LICENSE.txt: + url: "https://github.com/CopticScriptorium/corpora/raw/3506b54ce769639c382145161da4f99fd3d6156b/coptic-treebank/LICENSE.txt" + sha1: 3015e20629818d25c34527d59808e716fd0d8ced + sha512: 6660c7fa3b570110e5ae641b169ecea50582e7ebc4214d111cdd46b783937e3d1165b92a38ef6faca2172813781b27f372117665af9097fe9c658a66ccbe87c8 + verificationMode: TEXT + coptic.treebank.conll10: + url: "https://github.com/CopticScriptorium/corpora/raw/3506b54ce769639c382145161da4f99fd3d6156b/coptic-treebank/coptic.treebank.conll10" + sha1: 8c363df27408cb14cb42f3869916c1575fe1625a + sha512: da1c89705b7ceb1922fd3c91720f57e4b9326401e60ae6bbae3a17fd6ebf884ede61cf92fdd4d150cb6c9a36fe414fad39a49e196b9880c354ee900a817988c7 + +roles: + licenses: + - LICENSE.txt + data: + - coptic.treebank.conll10 + + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml new file mode 100644 index 0000000000..23c5486b28 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/finntb-fi-3.1.yaml @@ -0,0 +1,41 @@ +groupId: org.dkpro.core.datasets.finntb +datasetId: finntb +version: 3.1 +language: nfi +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: FinnTreeBank +url: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/ +description: | + The FinnTreeBank project is creating a treebank and a parsebank for Finnish. This work is licensed + under a Creative Commons Attribution 3.0. + + The first and second version of the treebank is annotated by hand and based on 17.000 model + senctences in the Large Grammar of Finnish VISK - Iso Suomen Kielioppi. Brief samples of text + from other sources, e.g. news items and literature, are also available in the second version. A + parsebank for Finnish based on the Europarl and the JRC-Aquis will be available in June 2012. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC-BY 3.0 + url: http://creativecommons.org/licenses/by/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-3.0.txt + sha1: aaf1a43d7cf20483321212f54bff33132a070ec0 + sha512: e9362e13775f8bd2442f357e459a2eac94d989f39ffbd8972e13d3ed52857515dc5dfe8baaa7f2432efea952931b320ff9d87431e00a9b519663ad4acba6afd7 + verificationMode: TEXT + ftb3.1.conllx.gz: + url: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/sources/ftb3.1.conllx.gz + sha1: 7c58064bf9995980cea08e84035c0414adc54f06 + sha512: 62a4661d032e155b6f203493498cf761b952bb902de4907c61b5d0d704c74b9a31ee3db553402ccb45f225209e15d475dcecaf8613579fd940d2f89762548c89 + +roles: + licenses: + - LICENSE.txt + data: + - ftb3.1.conllx.gz + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/germeval2014-de.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/germeval2014-de.yaml new file mode 100644 index 0000000000..e84372912b --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/germeval2014-de.yaml @@ -0,0 +1,53 @@ +groupId: org.dkpro.core.datasets.germeval2014 +datasetId: germeval2014 +# There is no uniform version for the whole datased, using the date of the most recent artifact +version: 20200808 +language: de +mediaType: text/x.org.dkpro.germeval-2014 +encoding: UTF-8 + +name: GermEval 2014 Named Entity Recognition Shared Task +url: https://sites.google.com/site/germeval2014ner/ +attribution: | + D. Benikova, C. Biemann, M. Reznicek. NoSta-D Named Entity Annotation for German: Guidelines and + Dataset. Proceedings of LREC 2014, Reykjavik, Iceland +description: | + The GermEval 2014 NER Shared Task builds on a new dataset with German Named Entity annotation + with the following properties: + + * The data was sampled from German Wikipedia and News Corpora as a collection of citations. + * The dataset covers over 31,000 sentences corresponding to over 590,000 tokens. + * The NER annotation uses the NoSta-D guidelines, which extend the Tübingen Treebank guidelines, + using four main NER categories with sub-structure, and annotating embeddings among NEs such as + `[ORG FC Kickers [LOC Darmstadt]]`. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC-BY 4.0 + url: http://creativecommons.org/licenses/by/4.0/ + +artifacts: + LICENSE.txt: + url: "classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-4.0.txt" + sha1: 9c5bee7a22ab39ad6c19ab29ea9e94ac5874f9c6 + sha512: 222cf997169925ee3a09a26798d04332673693c578c24cb2d0cc550785a8b87849b161dccd9c48d2e4f3fa15290b6a10ac5262945f9c8cc6bdbd362d37416300 + verificationMode: TEXT + GermEval2014NER.zip: + url: "https://www.ims.uni-stuttgart.de/data/GermEval2014NER.zip" + sha1: 827edc0232f813fb1344e06924a46e9344ec2f61 + sha512: ce2e32e039fd847ea2a7acac736a9f73a0bbe73b9ce96cf2f7f93b60e2780b4ebf10217037ea9c0f033159b660677c70ebf7e069bd1feb375b1a51fad1e9d649 + actions: + - action: explode + configuration: { includes: ["*.tsv"] } + +roles: + training: + - GermEval2014NER/NER-de-train.tsv + testing: + - GermEval2014NER/NER-de-test.tsv + development: + - GermEval2014NER/NER-de-dev.tsv + licenses: + - LICENSE.txt + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml similarity index 75% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml index 0838503dbd..d7bf83e225 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/glove.6B-en-20151025.yaml @@ -22,16 +22,17 @@ licenses: url: http://www.opendatacommons.org/licenses/pddl/1.0/ artifacts: - glove.6B.zip: - url: "https://nlp.stanford.edu/data/glove.6B.zip" + data.zip: + url: "http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip" sha1: b64e54f1877d2f735bdd000c1d7d771e25c7dfdc + sha512: 8a600c0df42436554d034d23d6d82f51b7c2e4ab8a3e3554b403bac951c9c600a2ef5612d89b2ed59ce8aecaed3c4c1d53a4e9e2a696999b95e64af267a8752e actions: - action: explode roles: data: - - glove.6B.50d.txt - - glove.6B.100d.txt - - glove.6B.200d.txt - - glove.6B.300d.txt + - data/glove.6B.50d.txt + - data/glove.6B.100d.txt + - data/glove.6B.200d.txt + - data/glove.6B.300d.txt \ No newline at end of file diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-dep-stanford-en-4.1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-dep-stanford-en-4.1.0.yaml new file mode 100644 index 0000000000..d05593f366 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-dep-stanford-en-4.1.0.yaml @@ -0,0 +1,61 @@ +groupId: org.dkpro.core.datasets.gum +datasetId: gum-dep-stanford +version: 4.1.0 +language: en +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: Georgetown University Multilayer Corpus (UD) +url: https://corpling.uis.georgetown.edu/gum/ +attribution: | + Zeldes, Amir (2017) "The GUM Corpus: Creating Multilayer Resources in the Classroom". + Language Resources and Evaluation 51(3), 581–612. For the GUM annotation team, see + https://corpling.uis.georgetown.edu/gum/ +description: | + This dataset contains release versions of the Georgetown University Multilayer Corpus (GUM), a + corpus of English texts from four text types (interviews, news, travel guides, instructional + texts). The corpus is created as part of the course LING-367 (Computational Corpus Linguistics) at + Georgetown University. For more details see: http://corpling.uis.georgetown.edu/gum. + + The dep layer gives a dependency syntax analysis according to the Stanford Dependencies manual. + This layer was intially produced using the Stanford Parser and then manually corrected using the + Arborator collaborative syntax annotation software. For the annotation project we used + non-collapsed dependencies, and dependencies for punctuation tokens have been removed. + + (This description has been sourced from the dataset website). + + The CPOS column of the files contains an extended POS tagset as it is used by the English + TreeTagger models. The POS column contains the regular PTB tagset. + + Note that this dataset does not include the Reddit data as it can only be obtained by running + a Python script which comes with GUM. + +licenses: + - name: CC-BY 2.5 + url: http://creativecommons.org/licenses/by/2.5/ + comment: "Wikinews texts (Source: https://en.wikinews.org/wiki/Wikinews:Copyright)" + - name: CC-BY-SA 3.0 + url: https://creativecommons.org/licenses/by-sa/3.0/ + comment: "WikiVoyage and Biographies texts (Source: https://wikimediafoundation.org/wiki/Terms_of_Use)" + - name: CC-BY-NC-SA 3.0 + url: http://creativecommons.org/licenses/by-nc-sa/3.0/ + comment: "WikiHow and Fiction texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" + - name: CC-BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + comment: "Annotations (Source: https://corpling.uis.georgetown.edu/gum/)" + +artifacts: + gum.tar.gz: + url: "https://github.com/amir-zeldes/gum/archive/V4.1.0.tar.gz" + sha1: 91ded1ba5b6c05fe8e70e42a0a36ee0d20556888 + sha512: 4ca7a346f2f8d344db0ac798152fbafbf6fbb794047574f5dd0475050179a69ae9972312babae3c6fada9b4fcd313b1167f83e6a70bc6f292ce721bb12d2f3c6 + shared: true + actions: + - action: explode + configuration: { includes: ["dep/stanford/*", "LICENSE.txt", "README.md"], strip: 1 } + +roles: + licenses: + - gum/LICENSE.txt + data: + - "**/dep/stanford/*.conll10" \ No newline at end of file diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml similarity index 90% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml index d5f0867703..4f7c9f8dd1 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.2.0.yaml @@ -33,7 +33,7 @@ licenses: comment: "WikiVoyage texts (Source: https://wikimediafoundation.org/wiki/Terms_of_Use)" - name: CC-BY-NC-SA 3.0 url: http://creativecommons.org/licenses/by-nc-sa/3.0/ - comment: "WikiVoyage texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" + comment: "WikiHow texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" - name: CC-BY 4.0 url: https://creativecommons.org/licenses/by/4.0/ comment: "Annotations (Source: https://corpling.uis.georgetown.edu/gum/)" @@ -42,6 +42,7 @@ artifacts: gum.zip: url: "https://github.com/amir-zeldes/gum/archive/V2.2.0.zip" sha1: b17e276998ced83153be605d8157afacf1f10fdc + sha512: c9606ba69ec1152267b8c801510f251cdcff1b835a53fd5bf9416800499bb6201a731039a6fdaf1baebf4f3048b325034d267485bd3f7dc3633443f9a16e00c3 actions: - action: explode configuration: { includes: ["dep/*", "LICENSE.txt", "README.md"], strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml similarity index 90% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml index 813e951c8e..eb3bbf4246 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-2.3.2.yaml @@ -33,7 +33,7 @@ licenses: comment: "WikiVoyage texts (Source: https://wikimediafoundation.org/wiki/Terms_of_Use)" - name: CC-BY-NC-SA 3.0 url: http://creativecommons.org/licenses/by-nc-sa/3.0/ - comment: "WikiVoyage texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" + comment: "WikiHow texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" - name: CC-BY 4.0 url: https://creativecommons.org/licenses/by/4.0/ comment: "Annotations (Source: https://corpling.uis.georgetown.edu/gum/)" @@ -42,6 +42,7 @@ artifacts: gum.zip: url: "https://github.com/amir-zeldes/gum/archive/V2.3.2.zip" sha1: 471c3a35c2a0e9aee4bbff9a9cf05441fce3ef21 + sha512: 713d731714ff037ab79ccc9db34a6de7b02c3d55adc67a9aeaad2d18c5f96cb12173fb6fe7fefd3aca6ffaba606932e392623f5acd1c56045536939f7ac74ea4 actions: - action: explode configuration: { includes: ["dep/*", "LICENSE.txt", "README.md"], strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml similarity index 91% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml index a651f81ad9..5d0ace9dd2 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-en-conll-3.0.0.yaml @@ -36,7 +36,7 @@ licenses: comment: "WikiVoyage texts (Source: https://wikimediafoundation.org/wiki/Terms_of_Use)" - name: CC-BY-NC-SA 3.0 url: http://creativecommons.org/licenses/by-nc-sa/3.0/ - comment: "WikiVoyage texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" + comment: "WikiHow texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons)" - name: CC-BY 4.0 url: https://creativecommons.org/licenses/by/4.0/ comment: "Annotations (Source: https://corpling.uis.georgetown.edu/gum/)" @@ -45,6 +45,7 @@ artifacts: gum.zip: url: "https://github.com/amir-zeldes/gum/archive/V3.0.0.zip" sha1: b590dbe3f4ae198ca500618a53491f75c221e98b + sha512: 540240d6e9827cb316b5dedc3667f9245f1effd9525da1e8b14a0700ceed7da683bb358bda5ee4c0e3457fe20260574d0485e4574ab357fed7bf598e4efe46de actions: - action: explode configuration: { includes: ["dep/*", "LICENSE.txt", "README.md"], strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml new file mode 100644 index 0000000000..e939a96acd --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/gum-ud-en-conll-5.0.0.yaml @@ -0,0 +1,57 @@ +groupId: org.dkpro.core.datasets.gum +datasetId: gum-dep-ud +version: 5.0.0 +language: en +mediaType: text/x.org.dkpro.conll-u +encoding: UTF-8 + +name: Georgetown University Multilayer Corpus +url: https://corpling.uis.georgetown.edu/gum/ +attribution: | + Zeldes, Amir (2017) "The GUM Corpus: Creating Multilayer Resources in the Classroom". + Language Resources and Evaluation 51(3), 581–612. + For Gum annotation team, see https://corpling.uis.georgetown.edu/gum/ +description: | + GUM is an open source multilayer corpus of richly annotated web texts from eight text types. + The corpus is collected and expanded by students as part of the curriculum in LING-367 + Computational Corpus Linguistics at Georgetown University. The selection of text types is meant + to represent different communicative purposes, while coming from sources that are readily and + openly available (mostly Creative Commons licenses), so that new texts can be annotated and + published with ease. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC-BY 2.5 + url: http://creativecommons.org/licenses/by/2.5/ + comment: "Wikinews/interviews texts (Source: https://en.wikinews.org/wiki/Wikinews:Copyright)" + - name: CC-BY-SA 3.0 + url: https://creativecommons.org/licenses/by-sa/3.0/ + comment: | + WikiVoyage texts (Source: https://wikimediafoundation.org/wiki/Terms_of_Use); + Wikipedia biographies (Source: https://en.wikipedia.org/wiki/Wikipedia:Copyrights) + - name: CC-BY-NC-SA 3.0 + url: http://creativecommons.org/licenses/by-nc-sa/3.0/ + comment: | + WikiHow texts (Source: http://www.wikihow.com/wikiHow:Creative-Commons); + Fiction texts (Source: http://smallbeerpress.com/creative-commons/) + - name: CC-BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + comment: | + Annotations (Source: https://corpling.uis.georgetown.edu/gum/); + Academic texts (various sources, see LICENSE.txt file) + +artifacts: + gum.zip: + url: "https://github.com/amir-zeldes/gum/archive/V5.0.0.zip" + sha512: fbf57b1c5400cad2185337bb8735391ca728583f9d49d40e95fd3e1449ef8160eb36efd400b901f9e33649b4133f9caff0c3f45de41be35adc33257c4e5192a7 + shared: true + actions: + - action: explode + configuration: { includes: ["dep/ud/*", "LICENSE.txt", "README.md"], strip: 1 } + +roles: + licenses: + - gum/LICENSE.txt + data: + - "**/dep/ud/*.conllu" \ No newline at end of file diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml similarity index 76% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml index b60d29e65c..b3410c2822 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/hdt-de-conll-1.0.1.yaml @@ -21,8 +21,10 @@ licenses: artifacts: LICENSE-CC-BY-SA.txt: - url: https://creativecommons.org/licenses/by-sa/4.0/legalcode.txt - sha1: 8f551a766d1f4556d1a2596365c0fc2191366efa + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-4.0.txt + sha1: 7f893542ae74df4c277b98278ad9e3ad6c09e690 + sha512: 492cfa38f596c70aed7006ed695da45d15ae674d3e750e0791912f0f19c8814fab947535e19a8f9bf7ec20167a62554d5a1845b6612fc22970697eb39f0ca5f6 + verificationMode: TEXT LICENSE-HZSK-ACA.txt: text: | HZSK-ACA language resources can be accessed only for research purposes by ACAdemic @@ -32,6 +34,7 @@ artifacts: hamburgDepTreebank.tar.xz: url: "https://corpora.uni-hamburg.de:8443/fedora/objects/file:hdt_hdt-conll/datastreams/hdt-conll-tar-xz/content?asOfDateTime=2016-02-17T15:38:47.643Z&download=true" sha1: 6594e5cd48966db7dac04f2b5ff948eb2bcadf37 + sha512: 50c38068e63487845dfc98e3414bddfae3e6e463b8cdb97a91f30d64c37637893342ac5bc8af584749397039c00287c19eaa14262b7abe62b2ca7bd53b14bcd0 actions: - action: explode configuration: { strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/iulatb-es-1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/iulatb-es-1.yaml similarity index 75% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/iulatb-es-1.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/iulatb-es-1.yaml index d3c1eb5535..a6a3abb69b 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/iulatb-es-1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/iulatb-es-1.yaml @@ -30,11 +30,14 @@ licenses: artifacts: LICENSE.txt: - url: http://creativecommons.org/licenses/by/3.0/legalcode.txt - sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709 + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-3.0.txt + sha1: aaf1a43d7cf20483321212f54bff33132a070ec0 + sha512: e9362e13775f8bd2442f357e459a2eac94d989f39ffbd8972e13d3ed52857515dc5dfe8baaa7f2432efea952931b320ff9d87431e00a9b519663ad4acba6afd7 + verificationMode: TEXT data.rar: - url: http://repositori.upf.edu/bitstream/handle/10230/20048/IULA_Spanish_LSP_Treebank.rar?sequence=1 + url: https://repositori.upf.edu/bitstream/handle/10230/20048/IULA_Spanish_LSP_Treebank.rar?sequence=1 sha1: 67e2ce3327501605b7c9f0844cc4982070612222 + sha512: a2d6786fb41701699b9dad1fe6ac2de93d212aac28492a2cc99e3116764e2236684926c4e9b1bacde937b6083c58381f08434505b21fdb739ff774b8e84d9f23 actions: - action: explode - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml new file mode 100644 index 0000000000..cb9fc018b4 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/jos100k-conll-sl-2.0.yaml @@ -0,0 +1,46 @@ +groupId: org.dkpro.core.datasets.jos +datasetId: jos100k-conll +version: 2.0 +language: sl +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: JOS - jos100k +url: http://nl.ijs.si/jos/jos100k-en.html +attribution: | + Tomaž Erjavec, Darja Fišer, Simon Krek, Nina Ledinek: The JOS Linguistically Tagged Corpus of + Slovene. Proceedings of the Seventh International Conference on Language Resources and Evaluation + (LREC'10), Malta, 2010. + (link:http://www.lrec-conf.org/proceedings/lrec2010/summaries/139.html[PDF]) +description: | + The jos100k corpus contains 100,000 words of sampled paragraphs from the FidaPLUS corpus. It is + meant to serve as a reference annotated corpus of Slovene: its manually-validated annotations + cover three level of linguistic description. + + (This description has been sourced from the corpus website). + +licenses: + - name: CC-BY-NC 3.0 + url: http://creativecommons.org/licenses/by-nc/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-3.0.txt + sha1: 23e82cd9f77862b5a26bf268aba9822784a9ab6a + sha512: 2ee0155e3dabdfaf004d40cef761e6ebdd7d6c15f33aa7ce969a0ba75905d56bb806597b816d55ef519aba40d380929f6d26e2d47cbb72dd80d9cc7dcee21d6a + verificationMode: TEXT + data.zip: + url: http://nl.ijs.si/jos/download/jos100kv2_0.zip + sha1: 9f330ffd102cc5d5734fdaecbbf67751c84a1339 + sha512: 3358d37ef31ee7ac6b5dbd846de6a2c56396cb4856efa00d7731011a603894720a3d922a108d14c62dd504b5b4909d5ce0e0d7699f350c7007a3e08409ee4ce2 + actions: + - action: explode + configuration: { strip: 1, includes: [ "00README.txt", "jos100kv2_0-sl.conll" ] } + +roles: + licenses: + - LICENSE.txt + - data/00README.txt + data: + - data/jos100kv2_0-sl.conll + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml similarity index 87% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml index 29d09db853..c19faa46bf 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/masc-conll-en-20080522.yaml @@ -21,6 +21,7 @@ artifacts: data.zip: url: "http://www.anc.org/MASC/download/masc-conll.zip" sha1: d9f53a05c659204a3223e901c450fe8ffa5fa9fa + sha512: 67d9e67f8003153e9782a151d9c5ea8646d0c8604de13ae54c90416ed682171f6dfc745dc2a5ff1677e7f0517c94c6067cb6372a0b86fddb6e410b89d9af28cc actions: - action: explode configuration: { strip: 1 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml new file mode 100644 index 0000000000..ea155a9224 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nb-1.01.yaml @@ -0,0 +1,43 @@ +groupId: org.dkpro.core.datasets.ndt +datasetId: ndt +version: 1.01 +language: nb +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: Norwegian Dependency Treebank (Norwegian Bokmål) +url: http://www.nb.no/sprakbanken/show?serial=sbr-10 +attribution: CLARINO NB – Språkbanken +description: | + The Norwegian Dependency Treebank (NDT) consists of text which is manually annotated with + morphological features, syntactic functions and hierarchical structure. The formalism used for the + syntactic annotation is dependency grammar. With a few exceptions, the syntactic analysis follows + Norsk referensegrammatikk ‘Norwegian Reference Grammar'. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC0 1.0 + url: http://creativecommons.org/publicdomain/zero/1.0/ + +artifacts: + LICENSE_NDT.txt: + url: https://www.nb.no/sbfil/dok/LICENSE_NDT.txt + sha1: ae02a3ca7e000d6cc98f07d3a8aa017f38900499 + sha512: bcd16abbb9b8604640488871432092825cd535bcf2561ada5f3807014e0d5433cdff8fd6f913d39d723497490bdb8da4329ad3da59beb2ba0634898965535942 + verificationMode: TEXT + 20140328_NDT_1-01.tar.gz: + url: https://www.nb.no/sbfil/tekst/20140328_NDT_1-01.tar.gz + sha1: 97935c225f98119aa94d53f37aa64762cba332f3 + sha512: ace37828398cb00677adf38ba2f4046a4bf21934c4abc326ba027251d599c595871a6488b6692b3ac968a0967bc9d727ef9aab71ef34abee87e805abb43bc2ab + shared: true + actions: + - action: explode + configuration: { strip: 1, includes: "nob/conll/*.conll" } + +roles: + licenses: + - LICENSE_NDT.txt + data: + - "20140328_NDT_1-01/nob/conll/*.conll" + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml new file mode 100644 index 0000000000..04ec1df6f6 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ndt-nn-1.01.yaml @@ -0,0 +1,43 @@ +groupId: org.dkpro.core.datasets.ndt +datasetId: ndt +version: 1.01 +language: nn +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: Norwegian Dependency Treebank (Norwegian Nynorsk) +url: http://www.nb.no/sprakbanken/show?serial=sbr-10 +attribution: CLARINO NB – Språkbanken +description: | + The Norwegian Dependency Treebank (NDT) consists of text which is manually annotated with + morphological features, syntactic functions and hierarchical structure. The formalism used for the + syntactic annotation is dependency grammar. With a few exceptions, the syntactic analysis follows + Norsk referensegrammatikk ‘Norwegian Reference Grammar'. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC0 1.0 + url: http://creativecommons.org/publicdomain/zero/1.0/ + +artifacts: + LICENSE_NDT.txt: + url: https://www.nb.no/sbfil/dok/LICENSE_NDT.txt + sha1: ae02a3ca7e000d6cc98f07d3a8aa017f38900499 + sha512: bcd16abbb9b8604640488871432092825cd535bcf2561ada5f3807014e0d5433cdff8fd6f913d39d723497490bdb8da4329ad3da59beb2ba0634898965535942 + verificationMode: TEXT + 20140328_NDT_1-01.tar.gz: + url: https://www.nb.no/sbfil/tekst/20140328_NDT_1-01.tar.gz + sha1: 97935c225f98119aa94d53f37aa64762cba332f3 + sha512: ace37828398cb00677adf38ba2f4046a4bf21934c4abc326ba027251d599c595871a6488b6692b3ac968a0967bc9d727ef9aab71ef34abee87e805abb43bc2ab + shared: true + actions: + - action: explode + configuration: { strip: 1, includes: "nno/conll/*.conll" } + +roles: + licenses: + - LICENSE_NDT.txt + data: + - "20140328_NDT_1-01/nno/conll/*.conll" + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml new file mode 100644 index 0000000000..fd3180d15c --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/nemgp-de-0.1.yaml @@ -0,0 +1,38 @@ +groupId: org.dkpro.core.datasets.nemgp +datasetId: nemgp +version: 0.1 +language: de +# mediaType: text/x.org.dkpro.opennlp-ne +encoding: UTF-8 + +name: Named Entity Model for German, Politics (NEMGP) +url: http://www.thomas-zastrow.de/nlp/ +attribution: Thomas Zastrow +description: | + The Named Entity Model for German, Politics (NEMGP) is a collection of texts from Wikipedia and + WikiNews, manually annotated with named entity information. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC-BY-SA 3.0 + url: https://creativecommons.org/licenses/by-sa/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-3.0.txt + sha1: fb6f31be27fed5efbcd4c2e1e64c50de470364b1 + sha512: ba59a7187a93fd7e0d4bcbf4f18076a341f8d4091d0ebc5d2b6f3ee7e8e3c79cd6c485640880def013e9116cba55c7ddc08890ff9859d0403f075393df45ea9f + verificationMode: TEXT + data.zip: + url: "https://www.thomas-zastrow.de/nlp/nemgp_trainingdata_01.txt.zip" + sha1: f2a1fd54df9232741a3a1892d1ffb0a4d7205991 + sha512: 128a2abc5c07b7483e626e65d05db9c4c80bb782e2bd7770b59e6748d6847ab3734ee97e00d1fe72e4346bc6aef0e489bd6efd3ca4e3b7e4824aef4e49704587 + actions: + - action: explode + +roles: + licenses: + - LICENSE.txt + training: + - data/nemgp_trainingdata_01.txt diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml new file mode 100644 index 0000000000..6cc927f974 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-el-2.1.yaml @@ -0,0 +1,41 @@ +groupId: org.dkpro.core.datasets.agdt +datasetId: agdt +version: 2.1 +language: el +# mediaType: unknown +encoding: UTF-8 + +name: Ancient Greek and Latin Dependency Treebank (Greek) +url: https://perseusdl.github.io/treebank_data/ +attribution: Giuseppe G. A. Celano, Gregory Crane, Bridget Almas et al. +description: | + The Ancient Greek and Latin Dependency Treebank (AGLDT) is the earliest treebank for Ancient Greek + and Latin. The project started at Tufts University in 2006 and is currently under development and + maintenance at Leipzig University-Tufts University. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC-BY-SA 3.0 + url: https://creativecommons.org/licenses/by-sa/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-3.0.txt + sha1: fb6f31be27fed5efbcd4c2e1e64c50de470364b1 + sha512: ba59a7187a93fd7e0d4bcbf4f18076a341f8d4091d0ebc5d2b6f3ee7e8e3c79cd6c485640880def013e9116cba55c7ddc08890ff9859d0403f075393df45ea9f + verificationMode: TEXT + perseus.zip: + url: "https://github.com/PerseusDL/treebank_data/archive/f56a35f65ef15ac454f6fbd2cfc6ea97bf2ca9b8.zip" + sha1: 140eee6d2e3e83745f95d3d5274d9e965d898980 + sha512: b8fe14202b5dbe6d7c7b387f38a80036d62d3ecc860fa0fc1ee698ed10a8121b144c2c36b09b45fd6b4fb17a025f88e4669be66524b8a5b550c57032f789ceb4 + shared: true + actions: + - action: explode + configuration: { strip: 1, includes: [ "README.md", "v2.1/Greek/**/*" ] } + +roles: + licenses: + - LICENSE.txt + data: + - "perseus/v2.1/Greek/texts/*.xml" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml new file mode 100644 index 0000000000..78507979f4 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/perseus-la-2.1.yaml @@ -0,0 +1,42 @@ +groupId: org.dkpro.core.datasets.agdt +datasetId: agdt +version: 2.1 +language: la +# mediaType: unknown +encoding: ISO-8859-1 + +name: Ancient Greek and Latin Dependency Treebank (Latin) +url: https://perseusdl.github.io/treebank_data/ +attribution: Giuseppe G. A. Celano, Gregory Crane, Bridget Almas et al. +description: | + The Ancient Greek and Latin Dependency Treebank (AGLDT) is the earliest treebank for Ancient Greek + and Latin. The project started at Tufts University in 2006 and is currently under development and + maintenance at Leipzig University-Tufts University. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC-BY-SA 3.0 + url: https://creativecommons.org/licenses/by-sa/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-3.0.txt + sha1: fb6f31be27fed5efbcd4c2e1e64c50de470364b1 + sha512: ba59a7187a93fd7e0d4bcbf4f18076a341f8d4091d0ebc5d2b6f3ee7e8e3c79cd6c485640880def013e9116cba55c7ddc08890ff9859d0403f075393df45ea9f + verificationMode: TEXT + perseus.zip: + url: "https://github.com/PerseusDL/treebank_data/archive/f56a35f65ef15ac454f6fbd2cfc6ea97bf2ca9b8.zip" + sha1: 140eee6d2e3e83745f95d3d5274d9e965d898980 + sha512: b8fe14202b5dbe6d7c7b387f38a80036d62d3ecc860fa0fc1ee698ed10a8121b144c2c36b09b45fd6b4fb17a025f88e4669be66524b8a5b550c57032f789ceb4 + shared: true + actions: + - action: explode + configuration: { strip: 1, includes: [ "README.md", "v2.1/Latin/**/*" ] } + +roles: + licenses: + - LICENSE.txt + data: + - "perseus/v2.1/Latin/texts/*.xml" + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml new file mode 100644 index 0000000000..bb9e9b0714 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poldb-pl-0.5.yaml @@ -0,0 +1,35 @@ +groupId: org.dkpro.core.datasets.poldb +datasetId: poldb +version: 0.5 +language: pl +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: Polish Dependency Bank +url: http://zil.ipipan.waw.pl/Składnica +description: | + The dependency treebank (Składnica zależnościowa), version 0.5, is a result of an automatic + conversion of manually disambiguated constituency trees into dependency structures. + + (This description has been sourced from the corpus website). + +licenses: + - name: GPL 3.0 + url: https://www.gnu.org/licenses/gpl-3.0.html + +artifacts: + LICENSE.txt: + url: https://www.gnu.org/licenses/gpl-3.0.txt + sha1: 8b0cb355ed76e07cc7c876fec58341c2940cfee7 + sha512: b311f68b1c2dbf8f079f381d9854c185e1f6b64cb375ca96a5f67e25cb375d9f106875523e6ef7adbd8f1156ec572eb9f4ae8f04e6da6e6de35dd7938db354df + verificationMode: TEXT + poldb-0.5.conll.gz: + url: "http://zil.ipipan.waw.pl/Składnica?action=AttachFile&do=get&target=Składnica-zależnościowa-0.5.conll.gz" + sha1: 187424608e91b271957dabcf140a7274f1c88d63 + sha512: d08dc44330d5084fa06409a6b76b99b90a201d8564c7dd2bd6435ee196898cc1787dfc93820842d38921086375c452ec057e5a95dfbd7a4ce48eacee8948df37 + +roles: + licenses: + - LICENSE.txt + data: + - poldb-0.5.conll.gz diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml new file mode 100644 index 0000000000..1dd7223a5c --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/poltb-pl-0.5.yaml @@ -0,0 +1,35 @@ +groupId: org.dkpro.core.datasets.poltb +datasetId: poltb +version: 0.5 +language: pl +mediaType: application/x.org.dkpro.tiger+xml +encoding: UTF-8 + +name: Polish Constituency Treebank +url: http://zil.ipipan.waw.pl/Składnica +description: | + The Polish constituency treebank (Składnica frazowa), version 0.5. Trees in the Tiger XML format + containing only parse trees selected by dendrologists (one interpretation per sentence). + + (This description has been sourced from the corpus website). + +licenses: + - name: GPL 3.0 + url: https://www.gnu.org/licenses/gpl-3.0.html + +artifacts: + LICENSE.txt: + url: https://www.gnu.org/licenses/gpl-3.0.txt + sha1: 8b0cb355ed76e07cc7c876fec58341c2940cfee7 + sha512: b311f68b1c2dbf8f079f381d9854c185e1f6b64cb375ca96a5f67e25cb375d9f106875523e6ef7adbd8f1156ec572eb9f4ae8f04e6da6e6de35dd7938db354df + verificationMode: TEXT + poltb-0.5-tiger.xml.gz: + url: "http://zil.ipipan.waw.pl/Składnica?action=AttachFile&do=get&target=Składnica-frazowa-0.5-TigerXML.xml.gz" + sha1: c8977d436d218b726d657224305bced178071dcf + sha512: 3da399b090dde90297a66cda7c5a6334bfee8bf16c9b6fb6d2af135f049ddbc57ca19cae4e382d26deba308c82f4bb970dc4a59e202a798665a6cbb49a23ee5d + +roles: + licenses: + - LICENSE.txt + data: + - poltb-0.5-tiger.xml.gz diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml similarity index 92% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml index 3a203cfa96..5b8c02f636 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.1.yaml @@ -30,6 +30,7 @@ artifacts: data.zip: url: http://nl.ijs.si/sdt/data/SDT-2006-01-06-CoNLL-X/data.zip sha1: 2bd85ad77c35d0c305a6afb7ee092676d5d22a35 + sha512: 022d4ffc2dbbe54b660fc6bedb2fe92b8a1b610749e6973dbc3798792ce82f875313acf6420d499111a2b17b9b7180cda48ffa81a665ea876e72a12aa473a73b actions: - action: explode configuration: { strip: 3 } diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml similarity index 79% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml index 295b63328d..501e613aa2 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sdt-conll-sl-0.4.yaml @@ -30,10 +30,13 @@ licenses: artifacts: README.txt: url: http://nl.ijs.si/sdt/data/SDT-2006-05-17/00README.txt - sha1: d2ac8d9f8b45ceae34ce77f57b354662292bd609 + sha1: 9d047377eb96aa896461544cd1117b11b812809f + sha512: 2c11bf1b3c5960394dff57330b31020a51dae18bb0d91a7eac65ecfb4f6bb5eccb241b34036089bc58954b2f3752c9f0eab8672e8109b059c77a1c376679956b + verificationMode: TEXT sdt-conll.tbl: url: http://nl.ijs.si/sdt/data/SDT-2006-05-17/CONLL/sdt-conll.tbl sha1: 16cfa8a20ebf8ed0e4f13c0119c7aa76a2498b1f + sha512: 83359227235370ab16fe830d437bf4ca710d2b9a1e2885a8d0d44ec935aab2c187956a4ad0c5323eb7989c06c5f7326c843b1694a6de148de5c25cc8b0bd3958 roles: licenses: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml similarity index 84% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml index 3b15e34fff..098c413da5 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sequoia-surf-conll-fr-7.0.yaml @@ -28,8 +28,9 @@ artifacts: (Lesser General Public License For Linguistic Resources) cf. http://deep-sequoia.inria.fr/lgpl-lr/ sequoia.tgz: - url: "http://talc2.loria.fr/deep-sequoia/sequoia-7.0.tgz" + url: "http://deep-sequoia.inria.fr/download/sequoia-7.0.tgz" sha1: 9f53475f809ef1032a92adedf262226da1615051 + sha512: d6a90a7404caaf4c25ca48098b76fa2abcdbe88c45d1954548d76362b16d988cbbc4025e7cd7810fc7fec2141be8dda11ebc29eca15708d0e1e3e149ccc4d951 actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethr-hr-1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethr-hr-1.yaml new file mode 100644 index 0000000000..1fb603863e --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethr-hr-1.yaml @@ -0,0 +1,34 @@ +groupId: org.dkpro.core.datasets.sethr +datasetId: sethr +version: 1 +language: hr +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: SETimes.HR dependency treebank +url: http://nlp.ffzg.hr/resources/corpora/setimes-hr/ +description: | + The corpus is based on the Croatian part of the SETimes parallel corpus. + + (This description has been sourced from the corpus website). + +licenses: + - name: CC-BY-SA 3.0 + url: http://creativecommons.org/licenses/by-sa/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-SA-3.0.txt + sha1: fb6f31be27fed5efbcd4c2e1e64c50de470364b1 + sha512: ba59a7187a93fd7e0d4bcbf4f18076a341f8d4091d0ebc5d2b6f3ee7e8e3c79cd6c485640880def013e9116cba55c7ddc08890ff9859d0403f075393df45ea9f + verificationMode: TEXT + setimes.hr.v1.conllx.gz: + url: http://nlp.ffzg.hr/data/corpora/setimes.hr.v1.conllx.gz + sha1: 0faebfe55136692f83dcddd4cf659a8b59655d62 + sha512: 81f4389172e6d340d7a8cf6581c86bc6213927ac4f25d0dd104e32c6ffb414f08ce2c3e6cdbde0bf6233acd7c3e8b9d475862d89dad762ba84d82d80b80d574f + +roles: + licenses: + - LICENSE.txt + data: + - setimes.hr.v1.conllx.gz diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml new file mode 100644 index 0000000000..daf34408db --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/sethrplus-hr-20160613.yaml @@ -0,0 +1,54 @@ +groupId: org.dkpro.core.datasets.sethr +datasetId: sethrplus +version: 20160613 +language: hr +mediaType: text/x.org.dkpro.conll-u +encoding: UTF-8 + +name: SETimes.HR+ Croatian dependency treebank +url: https://github.com/ffnlp/sethr +attribution: | + Agić and Ljubešić (2014) + (link:http://www.lrec-conf.org/proceedings/lrec2014/pdf/690_Paper.pdf[PDF]) + (link:http://aclweb.org/anthology/L/L14/L14-1542.bib[bib]) +description: | + The treebank is a result of an effort in providing free-culture language resources for Croatian by + the NLP group at FF Zagreb. + + (This description has been sourced from the corpus website). + +licenses: + - name: CC-BY 4.0 + url: http://creativecommons.org/licenses/by/4.0/ + comment: SETimes.HR dataset (set.hr.conll) + - name: CC-BY-NC-SA 4.0 + url: https://creativecommons.org/licenses/by-nc-sa/4.0/ + comment: web.hr.conll and news.hr.conll datasets + +artifacts: + LICENSE-CC-BY.txt: + url: "classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-4.0.txt" + sha1: 9c5bee7a22ab39ad6c19ab29ea9e94ac5874f9c6 + sha512: 222cf997169925ee3a09a26798d04332673693c578c24cb2d0cc550785a8b87849b161dccd9c48d2e4f3fa15290b6a10ac5262945f9c8cc6bdbd362d37416300 + verificationMode: TEXT + LICENSE-CC-BY-NC-SA.txt: + url: "classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-4.0.txt" + sha1: 54cc324681563e5ede8088f020f0b21e35d37fb9 + sha512: 84b09f6057afa41c8e495697b67da30d6be0d00f04c4d7c244012f8003088d29f43f474905be1c9262d14f6e199130bbad64371818e32f60aa0311faa271e1ca + verificationMode: TEXT + data.zip: + url: https://github.com/ffnlp/sethr/archive/c50697a81ee588b70328952dd56175da4c298c7c.zip + sha1: a52d13cfa91589c0d93fe0a90333a4f0e997b7cf + sha512: 394e06eee8a804fa7bfed2d0ccca152cbe1bf13478459c19212c3fd0bf33ed68ee292bf2528154581110c4fe49a2824661298e4caa19fe8e6b3ba6128427e40f + actions: + - action: explode + configuration: { strip: 1, includes: [ "LICENSE.md", "README.md", "*.hr*.conll" ] } + +roles: + licenses: + - LICENSE-CC-BY.txt + - LICENSE-CC-BY-NC-SA.txt + training: + - "data/*.hr.conll" + testing: + - "data/*.hr.test.conll" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml similarity index 84% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml index 777c770757..d175541150 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/stanford-egw4-reut-512-clusters-20130608.yaml @@ -19,6 +19,7 @@ artifacts: egw4-reut.512.clusters: url: https://nlp.stanford.edu/software/egw4-reut.512.clusters sha1: 3f1352641a46e985c07d0023c0ada7e5be97e527 + sha512: 9feeb7de9dc49a278a7ec6c8fd02e582d0c154077ee656ae00eaa293a5366d6e74c2724223c38b9030eb30305ce3dd07ac4767f890814c4fa41f71c8c3b8c7f2 roles: data: diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml similarity index 87% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml index b2dac6bf86..82e4849e7e 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dep-sv-1.1.yaml @@ -29,8 +29,9 @@ licenses: artifacts: data.tar.gz: - url: http://stp.lingfil.uu.se/%7Enivre/research/Talbanken05_1.1.tar.gz + url: https://cl.lingfil.uu.se/~nivre/research/Talbanken05_1.1.tar.gz sha1: bc836ab364ba37522e2989481104bad2eb96a92e + sha512: 4a93609c6c674edd01f1254d4879f6c908865b48f640c1b813ac537a8a4e160f11fc75857f18d72d88de9b70d3cf8dd831acc97489b0f7c0c1d8c9058209b25c shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml similarity index 87% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml index 67fe42740f..998c5aa736 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-dps-sv-1.1.yaml @@ -29,8 +29,9 @@ licenses: artifacts: data.tar.gz: - url: http://stp.lingfil.uu.se/%7Enivre/research/Talbanken05_1.1.tar.gz + url: https://cl.lingfil.uu.se/~nivre/research/Talbanken05_1.1.tar.gz sha1: bc836ab364ba37522e2989481104bad2eb96a92e + sha512: 4a93609c6c674edd01f1254d4879f6c908865b48f640c1b813ac537a8a4e160f11fc75857f18d72d88de9b70d3cf8dd831acc97489b0f7c0c1d8c9058209b25c shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml similarity index 87% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml index bc0c8caba6..617cd0d322 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/talkbanken05-fps-sv-1.1.yaml @@ -29,8 +29,9 @@ licenses: artifacts: data.tar.gz: - url: http://stp.lingfil.uu.se/%7Enivre/research/Talbanken05_1.1.tar.gz + url: https://cl.lingfil.uu.se/~nivre/research/Talbanken05_1.1.tar.gz sha1: bc836ab364ba37522e2989481104bad2eb96a92e + sha512: 4a93609c6c674edd01f1254d4879f6c908865b48f640c1b813ac537a8a4e160f11fc75857f18d72d88de9b70d3cf8dd831acc97489b0f7c0c1d8c9058209b25c shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml new file mode 100644 index 0000000000..a3c02cd81e --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tedtreebank-conll-en-1.0.yaml @@ -0,0 +1,50 @@ +groupId: org.dkpro.core.datasets.tedtreebank +datasetId: tedtreebank-conll +version: 1.0 +language: en +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: NAIST/NTT TED Treebank +url: http://ahclab.naist.jp/resource/tedtreebank/ +attribution: | + Graham Neubig, Katsuhito Sudoh, Yusuke Oda, Kevin Duh, Hajime Tsukada, Masaaki Nagata. + The NAIST-NTT Ted Talk Treebank. In proceedings of International Workshop on Spoken Language + Translation (IWSLT). Lake Tahoe, USA. December 2014. + (link:http://www.phontron.com/paper/neubig14iwslt.pdf[PDF]) + (link:http://phontron.com/bibtex.php?n=222[bib]) + +licenses: + - name: CC-BY-NC-SA 3.0 (?) + url: https://creativecommons.org/licenses/by-nc-sa/3.0/ + +description: | + The NAIST-NTT Ted Talk Treebank is a manually annotated treebank of TED talks that was created + through a joint research project of NAIST and the NTT CS Lab. All treebank annotation follows the + Penn Treebank standard. + + (This description has been sourced from the corpus website/README file in the corpus). + + NOTE: The website does not state which version of the CC-BY-SA-NC applies. One might consider + it is the version 3.0 which is also used for the TED talks themselves. + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-3.0.txt + sha1: 90490d92475de1dc68502b6cdb317187c4336b36 + sha512: ee6b27c709b76d32255ea3c1dd110b6238f5a6360d18cb59d8e7ce433cf22046fc9e13acf055682eacf832fc308e2fb3f842bf4a41ff5debf6e2c69775173cdb + verificationMode: TEXT + data.tar.gz: + url: https://ahcweb01.naist.jp/resource/tedtreebank/naist-ntt-ted-treebank-v1.tar.gz + sha1: 89c6495bd64c4b3e699b4c478b47a0c827ea46ea + sha512: a433d0dd1de9a04280f0115491e8d5414e6f5303fb271441cdd59f440eb5dd5c0f8cbbfd130f68dceb24323538ce2b3e0d8f0f77c61054338bf5816a7fd08b4b + actions: + - action: explode + configuration: { strip: 1, includes: [ "README.md", "en-dep/*.dep" ] } + +roles: + licenses: + - LICENSE.txt + - data/README.md + data: + - "data/en-dep/*.dep" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml new file mode 100644 index 0000000000..3c4b658fe1 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/tut-conll-it-20101122.yaml @@ -0,0 +1,56 @@ +groupId: org.dkpro.core.datasets.tut +datasetId: tut +version: 20101122 +language: it +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: Turin University Treebank +url: http://www.di.unito.it/~tutreeb/treebanks.html +attribution: | + Cristina Bosco, Leonardo Lesmo, Vincenzo Lombardo, Alessandro Mazzei, Livio Robaldo +description: | + TUT is a morpho-syntactically annotated collection of Italian sentences, which includes texts + from different text genres and domains, released in several annotation formats. + + (This description has been sourced from the corpus website). + +licenses: + - name: CC-BY-NC-SA 2.5 + url: http://creativecommons.org/licenses/by-nc-sa/2.5/it/ + +artifacts: + NEWS.zip: + url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/NEWS-22nov2010.conl.zip + sha1: 3d9b22d8ebf533aa1d6d39d417316c30900b9a0e + sha512: 2922115acd622f290518a5863edad57cd8e57030660c83c9f79d76045b3da318ff7fcdbf40404be78f7548aaf977c2355be34e12ff0c85ea98096a298f2c8fd8 + actions: + - action: explode + VEDCH.zip: + url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/VEDCH-22nov2010.conl.zip + sha1: 2278e6e770ddc4a8eea5e045c4a77a5df2ae0977 + sha512: 8aac15c988d266719df467fa1e07bf1771773f1e6b93ba8d44d991c9ad8fc0019ecfcaa2507bcfdf47c4f5f2b6e5137f59d025a4234fe06e5232af50abf1c18f + actions: + - action: explode + CODICECIVILE.zip: + url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/CODICECIVILE-22nov2010.conl.zip + sha1: 9cf9c0a9c652b3df6564d1fa0ca97c2d7905faa3 + sha512: dd7507383c9f940df7d11975738198f4f5dd0174ddd25aa4ed2bba592e35c7758443b934c384063193d20e55fab7d58e155f6b48bfe23407fab4bce89a22a77e + actions: + - action: explode + EUDIR.zip: + url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/EUDIR-22nov2010.conl.zip + sha1: 72a6e55627481ff99930b59714cfc0909ccf60e1 + sha512: b4502d9a0f4749e0a27b0e1bb4b8e0c51b8d71d55bceaa46124048d136957a1d99579add80c482009fa176ff89ec74f1db58d7b356920863c14adc0b47a47023 + actions: + - action: explode + WIKI.zip: + url: http://www.di.unito.it/~tutreeb/corpora/tutINconll/WIKI-22nov2010.conl.zip + sha1: a421f488859324e3e12687b9a3067652248eb8df + sha512: 5282e893b39a1f03c0b6a9a3afc31ecedee4f2e014aceb7bcf01b01dfdf14fbd586008747df3c68443c4843195c565a420912d9d65b910facf0063f2c6f26f87 + actions: + - action: explode + +roles: + data: + - "**/*.conl" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml similarity index 92% rename from dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml rename to dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml index 1e6e9ea353..5bebd70fb8 100644 --- a/dkpro-core-api-datasets-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/ud-en-conllu-1.4.yaml @@ -29,6 +29,7 @@ artifacts: data.tgz: url: "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1827/ud-treebanks-v1.4.tgz?sequence=4&isAllowed=y" sha1: 1c41c28b000935ffa6c63b9ff17c48e892c56597 + sha512: b41e297dc6befb8e7dfe1fe3281e796e4ee2fceff87187f1a7db2f75eb232705605e4ee2c282345db28b4f7970ac767fa6c572f1b5486c263ea94a814d360b38 shared: true actions: - action: explode diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml new file mode 100644 index 0000000000..25e3159c9e --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/updt-fa-1.3.yaml @@ -0,0 +1,44 @@ +groupId: org.dkpro.core.datasets.sequoia +datasetId: sequoia +version: 1.3 +language: fa +mediaType: text/x.org.dkpro.conll-2006 +encoding: UTF-8 + +name: Uppsala Persian Dependency Treebank +url: http://stp.lingfil.uu.se/%7Emojgan/UPDT.html +attribution: Mojgan Seraji, under the supervision of Joakim Nivre and Carina Jahani. +description: | + Uppsala Persian Dependency Treebank (UPDT) (Seraji, 2015, Chapter 5, pp. 97-146) is a + dependency-based syntactically annotated corpus. + + (This description has been sourced from the dataset website). + +licenses: + - name: CC-BY 3.0 + url: http://creativecommons.org/licenses/by/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-3.0.txt + sha1: aaf1a43d7cf20483321212f54bff33132a070ec0 + sha512: e9362e13775f8bd2442f357e459a2eac94d989f39ffbd8972e13d3ed52857515dc5dfe8baaa7f2432efea952931b320ff9d87431e00a9b519663ad4acba6afd7 + verificationMode: TEXT + data.tar: + url: "https://sites.google.com/site/mojganserajicom/home/updt/updt-1-3/UPDT.1.3.tar?attredirects=0&d=1" + sha1: 336ba453635ff079ab2ae9a5349247efa11acdf8 + sha512: 2870038e8ee75191897dc398194baded1a33885d024435d06cab63153df51d86a05679dd35588df70205f94d0d6df0aa7bf8e8c5a8b71b03c8e70ecd28e209da + actions: + - action: explode + configuration: { strip: 1 } + +roles: + licenses: + - LICENSE.txt + training: + - data/train.conll + testing: + - data/test.conll + development: + - data/dev.conll + diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml new file mode 100644 index 0000000000..18fc14cf58 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-de-1.00.yaml @@ -0,0 +1,44 @@ +groupId: org.dkpro.core.datasets.wasr +datasetId: wasr-l-en +version: 1.00 +language: en +mediaType: text/x.org.dkpro.conll-2009 +encoding: UTF-8 + +name: English Word Sense and Semantic Role Datasets (WaSR) +url: https://www.informatik.tu-darmstadt.de/ukp/research_6/data/semantic_role_resources/knowledge_based_semantic_role_labeling/index.en.jsp +attribution: | + Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic + Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the + Association for Computational Linguistics, vol. 4, no. 1, p. (to appear), 2016. + (link:https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/publikationen/2016/717-cameraready.pdf[PDF]) +description: | + German Frame and Role Annotations. + + (This description has been sourced from the README file included with the corpus). + +licenses: + - name: CC-BY-NC-ND 3.0 + url: https://creativecommons.org/licenses/by-nc-nd/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-3.0.txt + sha1: 90490d92475de1dc68502b6cdb317187c4336b36 + sha512: ee6b27c709b76d32255ea3c1dd110b6238f5a6360d18cb59d8e7ce433cf22046fc9e13acf055682eacf832fc308e2fb3f842bf4a41ff5debf6e2c69775173cdb + verificationMode: TEXT + data.tar.bz2: + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-de_v1.tar.bz2" + sha1: b706711ae6fffc94409f80b635595bd45d8c2ece + sha512: ff2bc3becad49146873dc54644f299d1362f106258e0ed939e1c14058b02429372aa39d4027bf040845af6db20073c80a0813452cf103ffd0adf3d55eaea1704 + actions: + - action: explode + configuration: { strip: 1 } + - action: explode + configuration: { file: "data/WaSR-de_all.tar.bz2" } + +roles: + licenses: + - LICENSE.txt + data: + - "WaSR-de_all/WaSR-de_all.tsv" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml new file mode 100644 index 0000000000..7d629dc9c2 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-l-en-1.00.yaml @@ -0,0 +1,45 @@ +groupId: org.dkpro.core.datasets.wasr +datasetId: wasr-l-en +version: 1.00 +language: en +mediaType: text/x.org.dkpro.conll-2009 +encoding: UTF-8 + +name: English Word Sense and Semantic Role Datasets (WaSR) +url: https://www.informatik.tu-darmstadt.de/ukp/research_6/data/semantic_role_resources/knowledge_based_semantic_role_labeling/index.en.jsp +attribution: | + Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic + Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the + Association for Computational Linguistics, vol. 4, no. 1, p. (to appear), 2016. + (link:https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/publikationen/2016/717-cameraready.pdf[PDF]) +description: | + English Frame and Role Annotations. + + (This description has been sourced from the README file included with the corpus). + +licenses: + - name: CC-BY-NC-ND 3.0 + url: https://creativecommons.org/licenses/by-nc-nd/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-3.0.txt + sha1: 90490d92475de1dc68502b6cdb317187c4336b36 + sha512: ee6b27c709b76d32255ea3c1dd110b6238f5a6360d18cb59d8e7ce433cf22046fc9e13acf055682eacf832fc308e2fb3f842bf4a41ff5debf6e2c69775173cdb + verificationMode: TEXT + part1.tar.bz2: + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-en_v1_part1.tar.bz2" + sha1: ef7ccf5cb23da63003bdb19d99b15b0ea2821e55 + sha512: bc6ba46503596aae4005b32934b23be9bf12399222cb13569f77af3ce262bd84f9a3e86c8b74897a17493969361464a6ff9cd22620f37322241e24741415b480 + shared: true + actions: + - action: explode + configuration: { strip: 1, includes: [ "README", "WaSR_L_all.7z" ] } + - action: explode + configuration: { file: "part1/WaSR_L_all.7z" } + +roles: + licenses: + - LICENSE.txt + data: + - "WaSR_L_all/WaSR_L_all.tsv" diff --git a/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml new file mode 100644 index 0000000000..382efee8ae --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/main/resources/org/dkpro/core/api/datasets/lib/wasr-xl-en-1.00.yaml @@ -0,0 +1,68 @@ +groupId: org.dkpro.core.datasets.wasr +datasetId: wasr-xl-en +version: 1.00 +language: en +mediaType: text/x.org.dkpro.conll-2009 +encoding: UTF-8 + +name: English Word Sense and Semantic Role Datasets (WaSR) +url: https://www.informatik.tu-darmstadt.de/ukp/research_6/data/semantic_role_resources/knowledge_based_semantic_role_labeling/index.en.jsp +attribution: | + Silvana Hartmann, Judith Eckle-Kohler, and Iryna Gurevych. Generating Training Data for Semantic + Role Labeling based on Label Transfer from Linked Lexical Resources. In: Transactions of the + Association for Computational Linguistics, vol. 4, no. 1, p. (to appear), 2016. + (link:https://www.ukp.tu-darmstadt.de/fileadmin/user_upload/Group_UKP/publikationen/2016/717-cameraready.pdf[PDF]) +description: | + English Frame and Role Annotations. + + (This description has been sourced from the README file included with the corpus). + +licenses: + - name: CC-BY-NC-ND 3.0 + url: https://creativecommons.org/licenses/by-nc-nd/3.0/ + +artifacts: + LICENSE.txt: + url: classpath:/org/dkpro/core/api/datasets/lib/common-licenses/CC-BY-NC-SA-3.0.txt + sha1: 90490d92475de1dc68502b6cdb317187c4336b36 + sha512: ee6b27c709b76d32255ea3c1dd110b6238f5a6360d18cb59d8e7ce433cf22046fc9e13acf055682eacf832fc308e2fb3f842bf4a41ff5debf6e2c69775173cdb + verificationMode: TEXT + part1.tar.bz2: + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-en_v1_part1.tar.bz2" + sha1: ef7ccf5cb23da63003bdb19d99b15b0ea2821e55 + sha512: bc6ba46503596aae4005b32934b23be9bf12399222cb13569f77af3ce262bd84f9a3e86c8b74897a17493969361464a6ff9cd22620f37322241e24741415b480 + shared: true + actions: + - action: explode + configuration: { strip: 1, includes: [ "README", "WaSR_XL_part1_3.7z" ] } + - action: explode + configuration: { file: "part1/WaSR_XL_part1_3.7z" } + part2.tar.bz2: + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-en_v1_part2.tar.bz2" + sha1: 0a9c98cbf1fe02841edf52e963444a7e38986577 + sha512: 4df84de5414322dad68ef23bca5e75336ff09c22c059a2f82320e8c5aca51fd93bb9b5f12d78f1127f0a518650d03898e504bd05209c1cf7da8b8403f1aa13d0 + shared: true + actions: + - action: explode + configuration: { strip: 1 } + - action: explode + configuration: { file: "part2/WaSR_XL_part2_3.7z" } + part3.tar.bz2: + url: "https://fileserver.ukp.informatik.tu-darmstadt.de/UKP_Webpage/DATA/WaSR-en_v1_part3.tar.bz2" + sha1: 9c0cc79ecab9140f82683d39ed6acb51b148f9f7 + sha512: f5c229a13e02fd602f0fadf68c1a6d70ccfa9f29db1ee79a485ab0707a6ee70ed4a5e5b78bbe30e9890565e94a83fecb1b716ed9e5d8635fe0b6428a13c1c33f + shared: true + actions: + - action: explode + configuration: { strip: 1 } + - action: explode + configuration: { file: "part3/WaSR_XL_part3_3.7z" } + +roles: + licenses: + - LICENSE.txt + data: + - "WaSR_XL_part1_3/WaSR_XL_part1_3.tsv" + - "WaSR_XL_part2_3/WaSR_XL_part2_3.tsv" + - "WaSR_XL_part3_3/WaSR_XL_part3_3.tsv" + diff --git a/dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetFactoryTest.java b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java similarity index 88% rename from dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetFactoryTest.java rename to dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java index 5d4f9c81d4..39495ad385 100644 --- a/dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetFactoryTest.java +++ b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetFactoryTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; @@ -24,16 +24,13 @@ import java.io.File; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class DatasetFactoryTest { @Ignore("Used at times for offline testing / development") @@ -41,17 +38,14 @@ public class DatasetFactoryTest public void testOne() throws Exception { - Path cache = testContext.getTestOutputFolder().toPath(); + //Path cache = testContext.getTestOutputFolder().toPath(); + Path cache = Paths.get("target/test-output/testLoadOne"); DatasetFactory df = new DatasetFactory(cache); { - Dataset ds = df.load("wasr-en-xl-1.00"); + Dataset ds = df.load("updt-fa-1.3"); assertDatasetOk(ds); } -// { -// Dataset ds = df.load("ndt-nb-1.01"); -// assertDatasetOk(ds); -// } } @Ignore("Used at times for offline testing / development") @@ -59,7 +53,7 @@ public void testOne() public void testLoadAll() throws Exception { - Path cache = testContext.getTestOutputFolder().toPath(); + Path cache = Paths.get("target/test-output/testLoadAll"); DatasetFactory df = new DatasetFactory(cache); for (String id : df.listIds()) { diff --git a/dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoaderTest.java b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetLoaderTest.java similarity index 87% rename from dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoaderTest.java rename to dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetLoaderTest.java index 33cff35bcc..eaa68392c9 100644 --- a/dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/DatasetLoaderTest.java +++ b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/DatasetLoaderTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; @@ -23,15 +23,14 @@ import java.io.File; import java.util.List; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetLoader; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetLoader; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - @Ignore("Normally we do not run this") public class DatasetLoaderTest { diff --git a/dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/FindEncoding.java b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/FindEncoding.java similarity index 76% rename from dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/FindEncoding.java rename to dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/FindEncoding.java index 66c6bdede4..12c5f5e04e 100644 --- a/dkpro-core-api-datasets-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/datasets/FindEncoding.java +++ b/dkpro-core-api-datasets-asl/src/test/java/org/dkpro/core/api/datasets/FindEncoding.java @@ -15,18 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.datasets; +package org.dkpro.core.api.datasets; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Path; -import java.util.Collection; -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.filefilter.DirectoryFileFilter; -import org.apache.commons.io.filefilter.RegexFileFilter; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; @@ -34,25 +33,19 @@ import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - @Ignore("Normally we do not run this") public class FindEncoding { - @Ignore("Used at times for offline testing / development") + @Ignore("Used at times for offline testing / development") @Test - public void getEncoding() - throws IOException + public void getEncoding() throws IOException { String dsName = "ndt-nn-1.01"; findEncoding(dsName); } @Ignore("Used at times for offline testing / development") - public void findEncoding(String eName) - throws IOException + public void findEncoding(String eName) throws IOException { Path cache = testContext.getCacheFolder().toPath(); DatasetFactory df = new DatasetFactory(cache); @@ -70,10 +63,8 @@ public void findEncoding(String eName) System.out.println(e.getMessage()); } } - } @Rule public DkproTestContext testContext = new DkproTestContext(); - } diff --git a/dkpro-core-api-datasets-asl/src/test/resources/log4j.properties b/dkpro-core-api-datasets-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-api-datasets-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-api-datasets-asl/src/test/resources/log4j2.xml b/dkpro-core-api-datasets-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-api-datasets-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-api-discourse-asl/pom.xml b/dkpro-core-api-discourse-asl/pom.xml index 0f50c725dc..4a7e641cdf 100644 --- a/dkpro-core-api-discourse-asl/pom.xml +++ b/dkpro-core-api-discourse-asl/pom.xml @@ -18,22 +18,23 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.api.discourse-asl + dkpro-core-api-discourse-asl jar DKPro Core ASL - Discourse API + https://dkpro.github.io/dkpro-core/ org.apache.uima uimaj-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl @@ -65,7 +66,7 @@ - Maven doesn't detect the parameters module to be used because we - only use the XML type descriptors from it, not any actual Java code. --> - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core:dkpro-core-api-segmentation-asl - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl - jar - DKPro Core ASL - Metadata API - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimaj-document-annotation - - - org.apache.uima - uimafit-core - - - junit - junit - test - - - - - - false - src/main/resources - - desc/type/**/* - - - - true - src/main/resources - - desc/type/**/* - - - - + 4.0.0 + + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-api-metadata-asl + jar + DKPro Core ASL - Metadata API + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimaj-document-annotation + + + org.apache.uima + uimafit-core + + + junit + junit + test + + + + + + false + src/main/resources + + desc/type/**/* + + + + true + src/main/resources + + desc/type/**/* + + + + diff --git a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/DocumentMetaData.java b/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/DocumentMetaData.java index 0ddfae0bbf..047447293b 100644 --- a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/DocumentMetaData.java +++ b/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/DocumentMetaData.java @@ -15,272 +15,277 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -/* First created by JCasGen Mon Nov 08 23:55:50 CET 2010 */ +/* Apache UIMA v3 - First created by JCasGen Fri Sep 08 09:51:28 EEST 2017 */ + package de.tudarmstadt.ukp.dkpro.core.api.metadata.type; +import java.lang.invoke.CallSite; +import java.lang.invoke.MethodHandle; + import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.TypeImpl; +import org.apache.uima.cas.impl.TypeSystemImpl; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.jcas.cas.TOP_Type; import org.apache.uima.jcas.tcas.DocumentAnnotation; -/** - * Updated by JCasGen Sun Nov 21 13:28:48 CET 2010 - * XML source: /Users/bluefire/UKP/Workspaces/dkpro-primary/de.tudarmstadt.ukp.dkpro.core-asl/de.tudarmstadt.ukp.dkpro.core.api.metadata/src/main/resources/desc/type/DocumentMetaData.xml - * @generated */ -public class DocumentMetaData - extends DocumentAnnotation -{ - /** - * @generated - * @ordered - */ - public final static int typeIndexID = JCasRegistry.register(DocumentMetaData.class); - /** - * @generated - * @ordered - */ - public final static int type = typeIndexID; - /** @generated */ - @Override - public int getTypeIndexID() {return typeIndexID;} - /** - * Never called. Disable default constructor - * - * @generated - */ - protected DocumentMetaData() {} - /** - * Internal - constructor used by generator - * - * @generated - * @param addr - * low level Feature Structure reference - * @param type - * the type of this Feature Structure - */ - public DocumentMetaData(int addr, TOP_Type type) { - super(addr, type); - readObject(); - } +/**

The DocumentMetaData annotation stores information about a single processed +document. There can only be one of these annotations per CAS. The annotation is +created by readers and contains information to uniquely identify the document from +which a CAS was created. Writer components use this information when determining +under which filename a CAS is stored.

- - /** - * @generated - * @param jcas - * JCas to which this Feature Structure belongs - */ - public DocumentMetaData(JCas jcas) { - super(jcas); +

There are two principle ways of identifying a document:

+ +

    +
  • collection id / document id: this simple system identifies a document + within a collection. The ID of the collection and the document are each + simple strings without any further semantics such as e.g. a hierarchy. For + this reason, this identification scheme is not well suited to preserve + information about directory structures.
  • + +
  • document base URI / document URI: this system identifies a document using + a URI. The base URI is used to derive the relative path of the document with + respect to the base location from where it has been read. E.g. if the base + URI is file:/texts and the document URI is file:/texts/english/text1.txt, then the relativ + path of the document is english/text1.txt. This + information is used by writers to recreate the directory structure found + under the base location in the target location.
  • +
+ +

It is possible and indeed common for a writer to initialize both systems of +identification. If both systems are present, most writers default to using the +URI-based systems. However, most writers also allow forcing the use of the ID-based +systems.

+ +

In addition to the features given here, there is a language feature inherited from UIMA's DocumentAnnotation. DKPro Core components expect a two letter ISO +639-1 language code there.

+ * Updated by JCasGen Fri Sep 08 09:51:28 EEST 2017 + * XML source: /Users/bluefire/git/dkpro-core/dkpro-core-api-metadata-asl/target/jcasgen/typesystem.xml + * @generated */ +public class DocumentMetaData extends DocumentAnnotation { + + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static String _TypeName = "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"; + + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static int typeIndexID = JCasRegistry.register(DocumentMetaData.class); + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static int type = typeIndexID; + /** @generated + * @return index of the type + */ + @Override + public int getTypeIndexID() {return typeIndexID;} + + + /* ******************* + * Feature Offsets * + * *******************/ + + public final static String _FeatName_documentTitle = "documentTitle"; + public final static String _FeatName_documentId = "documentId"; + public final static String _FeatName_documentUri = "documentUri"; + public final static String _FeatName_collectionId = "collectionId"; + public final static String _FeatName_documentBaseUri = "documentBaseUri"; + public final static String _FeatName_isLastSegment = "isLastSegment"; + + + /* Feature Adjusted Offsets */ + private final static CallSite _FC_documentTitle = TypeSystemImpl.createCallSite(DocumentMetaData.class, "documentTitle"); + private final static MethodHandle _FH_documentTitle = _FC_documentTitle.dynamicInvoker(); + private final static CallSite _FC_documentId = TypeSystemImpl.createCallSite(DocumentMetaData.class, "documentId"); + private final static MethodHandle _FH_documentId = _FC_documentId.dynamicInvoker(); + private final static CallSite _FC_documentUri = TypeSystemImpl.createCallSite(DocumentMetaData.class, "documentUri"); + private final static MethodHandle _FH_documentUri = _FC_documentUri.dynamicInvoker(); + private final static CallSite _FC_collectionId = TypeSystemImpl.createCallSite(DocumentMetaData.class, "collectionId"); + private final static MethodHandle _FH_collectionId = _FC_collectionId.dynamicInvoker(); + private final static CallSite _FC_documentBaseUri = TypeSystemImpl.createCallSite(DocumentMetaData.class, "documentBaseUri"); + private final static MethodHandle _FH_documentBaseUri = _FC_documentBaseUri.dynamicInvoker(); + private final static CallSite _FC_isLastSegment = TypeSystemImpl.createCallSite(DocumentMetaData.class, "isLastSegment"); + private final static MethodHandle _FH_isLastSegment = _FC_isLastSegment.dynamicInvoker(); + + + /** Never called. Disable default constructor + * @generated */ + protected DocumentMetaData() {/* intentionally empty block */} + + /** Internal - constructor used by generator + * @generated + * @param casImpl the CAS this Feature Structure belongs to + * @param type the type of this Feature Structure + */ + public DocumentMetaData(TypeImpl type, CASImpl casImpl) { + super(type, casImpl); readObject(); } + + /** @generated + * @param jcas JCas to which this Feature Structure belongs + */ + public DocumentMetaData(JCas jcas) { + super(jcas); + readObject(); + } - /** - * @generated - * @param jcas - * JCas to which this Feature Structure belongs - * @param begin - * offset to the begin spot in the SofA - * @param end - * offset to the end spot in the SofA - */ - public DocumentMetaData(JCas jcas, int begin, int end) { + + /** @generated + * @param jcas JCas to which this Feature Structure belongs + * @param begin offset to the begin spot in the SofA + * @param end offset to the end spot in the SofA + */ + public DocumentMetaData(JCas jcas, int begin, int end) { super(jcas); setBegin(begin); setEnd(end); readObject(); - } - - /** - * Write your own initialization here - * - * @generated modifiable - */ - private void readObject() - { - } - - // *--------------* - // * Feature: documentTitle - - /** - * getter for documentTitle - gets The human readable title of the document. - * - * @generated - * @return value of the feature - */ - public String getDocumentTitle() { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_documentTitle == null) { - jcasType.jcas.throwFeatMissing("documentTitle", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - return jcasType.ll_cas.ll_getStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_documentTitle);} - - /** - * setter for documentTitle - sets The human readable title of the document. - * - * @generated - * @param v - * value to set into the feature - */ - public void setDocumentTitle(String v) { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_documentTitle == null) { - jcasType.jcas.throwFeatMissing("documentTitle", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - jcasType.ll_cas.ll_setStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_documentTitle, v);} - - - // *--------------* - // * Feature: documentId - - /** - * getter for documentId - gets The id of the document. - * - * @generated - * @return value of the feature - */ - public String getDocumentId() { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_documentId == null) { - jcasType.jcas.throwFeatMissing("documentId", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - return jcasType.ll_cas.ll_getStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_documentId);} - - /** - * setter for documentId - sets The id of the document. - * - * @generated - * @param v - * value to set into the feature - */ - public void setDocumentId(String v) { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_documentId == null) { - jcasType.jcas.throwFeatMissing("documentId", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - jcasType.ll_cas.ll_setStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_documentId, v);} - - - // *--------------* - // * Feature: documentUri - - /** - * getter for documentUri - gets The URI of the document. - * - * @generated - * @return value of the feature - */ - public String getDocumentUri() { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_documentUri == null) { - jcasType.jcas.throwFeatMissing("documentUri", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - return jcasType.ll_cas.ll_getStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_documentUri);} - - /** - * setter for documentUri - sets The URI of the document. - * - * @generated - * @param v - * value to set into the feature - */ - public void setDocumentUri(String v) { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_documentUri == null) { - jcasType.jcas.throwFeatMissing("documentUri", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - jcasType.ll_cas.ll_setStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_documentUri, v);} - - - // *--------------* - // * Feature: collectionId - - /** - * getter for collectionId - gets The ID of the whole document collection. - * - * @generated - * @return value of the feature - */ - public String getCollectionId() { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_collectionId == null) { - jcasType.jcas.throwFeatMissing("collectionId", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - return jcasType.ll_cas.ll_getStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_collectionId);} - - /** - * setter for collectionId - sets The ID of the whole document collection. + } + + /** + * + * Write your own initialization here + * + * + * @generated modifiable + */ + private void readObject() {/*default - does nothing empty block */} + + + + //*--------------* + //* Feature: documentTitle + + /** getter for documentTitle - gets The human readable title of the document. + * @generated + * @return value of the feature + */ + public String getDocumentTitle() { return _getStringValueNc(wrapGetIntCatchException(_FH_documentTitle));} + + /** setter for documentTitle - sets The human readable title of the document. * @generated * @param v value to set into the feature - */ - public void setCollectionId(String v) { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_collectionId == null) { - jcasType.jcas.throwFeatMissing("collectionId", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - jcasType.ll_cas.ll_setStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_collectionId, v);} - - - // *--------------* - // * Feature: documentBaseUri - - /** - * getter for documentBaseUri - gets Base URI of the document. - * - * @generated - * @return value of the feature - */ - public String getDocumentBaseUri() { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_documentBaseUri == null) { - jcasType.jcas.throwFeatMissing("documentBaseUri", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - return jcasType.ll_cas.ll_getStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_documentBaseUri);} - - /** - * setter for documentBaseUri - sets Base URI of the document. - * - * @generated - * @param v - * value to set into the feature - */ - public void setDocumentBaseUri(String v) { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_documentBaseUri == null) { - jcasType.jcas.throwFeatMissing("documentBaseUri", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - jcasType.ll_cas.ll_setStringValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_documentBaseUri, v);} - - - // *--------------* - // * Feature: isLastSegment - - /** - * getter for isLastSegment - gets CAS de-multipliers need to know whether a CAS is the last - * multiplied segment. Thus CAS multipliers should set this field to true for the last CAS they - * produce. - * - * @generated - * @return value of the feature - */ - public boolean getIsLastSegment() { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_isLastSegment == null) { - jcasType.jcas.throwFeatMissing("isLastSegment", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - return jcasType.ll_cas.ll_getBooleanValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_isLastSegment);} - - /** - * setter for isLastSegment - sets CAS de-multipliers need to know whether a CAS is the last - * multiplied segment. Thus CAS multipliers should set this field to true for the last CAS they - * produce. - * - * @generated - * @param v - * value to set into the feature - */ - public void setIsLastSegment(boolean v) { - if (DocumentMetaData_Type.featOkTst && ((DocumentMetaData_Type)jcasType).casFeat_isLastSegment == null) { - jcasType.jcas.throwFeatMissing("isLastSegment", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - } - jcasType.ll_cas.ll_setBooleanValue(addr, ((DocumentMetaData_Type)jcasType).casFeatCode_isLastSegment, v);} + */ + public void setDocumentTitle(String v) { + _setStringValueNfc(wrapGetIntCatchException(_FH_documentTitle), v); + } + + + + //*--------------* + //* Feature: documentId + + /** getter for documentId - gets The id of the document. + * @generated + * @return value of the feature + */ + public String getDocumentId() { return _getStringValueNc(wrapGetIntCatchException(_FH_documentId));} + + /** setter for documentId - sets The id of the document. + * @generated + * @param v value to set into the feature + */ + public void setDocumentId(String v) { + _setStringValueNfc(wrapGetIntCatchException(_FH_documentId), v); + } + + + + //*--------------* + //* Feature: documentUri + + /** getter for documentUri - gets The URI of the document. + * @generated + * @return value of the feature + */ + public String getDocumentUri() { return _getStringValueNc(wrapGetIntCatchException(_FH_documentUri));} + + /** setter for documentUri - sets The URI of the document. + * @generated + * @param v value to set into the feature + */ + public void setDocumentUri(String v) { + _setStringValueNfc(wrapGetIntCatchException(_FH_documentUri), v); + } + + + + //*--------------* + //* Feature: collectionId + + /** getter for collectionId - gets The ID of the whole document collection. + * @generated + * @return value of the feature + */ + public String getCollectionId() { return _getStringValueNc(wrapGetIntCatchException(_FH_collectionId));} + + /** setter for collectionId - sets The ID of the whole document collection. + * @generated + * @param v value to set into the feature + */ + public void setCollectionId(String v) { + _setStringValueNfc(wrapGetIntCatchException(_FH_collectionId), v); + } + + + + //*--------------* + //* Feature: documentBaseUri + + /** getter for documentBaseUri - gets Base URI of the document. + * @generated + * @return value of the feature + */ + public String getDocumentBaseUri() { return _getStringValueNc(wrapGetIntCatchException(_FH_documentBaseUri));} + + /** setter for documentBaseUri - sets Base URI of the document. + * @generated + * @param v value to set into the feature + */ + public void setDocumentBaseUri(String v) { + _setStringValueNfc(wrapGetIntCatchException(_FH_documentBaseUri), v); + } + + + + //*--------------* + //* Feature: isLastSegment + + /** getter for isLastSegment - gets CAS de-multipliers need to know whether a CAS is the + last multiplied segment. + Thus CAS multipliers should set this field to true for the last CAS + they produce. + * @generated + * @return value of the feature + */ + public boolean getIsLastSegment() { return _getBooleanValueNc(wrapGetIntCatchException(_FH_isLastSegment));} + + /** setter for isLastSegment - sets CAS de-multipliers need to know whether a CAS is the + last multiplied segment. + Thus CAS multipliers should set this field to true for the last CAS + they produce. + * @generated + * @param v value to set into the feature + */ + public void setIsLastSegment(boolean v) { + _setBooleanValueNfc(wrapGetIntCatchException(_FH_isLastSegment), v); + } /** * Create a new {@link DocumentMetaData} annotation in the given CAS. The meta data fields can diff --git a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/DocumentMetaData_Type.java b/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/DocumentMetaData_Type.java deleted file mode 100644 index a4c7c2e391..0000000000 --- a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/DocumentMetaData_Type.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* First created by JCasGen Mon Nov 08 23:55:50 CET 2010 */ -package de.tudarmstadt.ukp.dkpro.core.api.metadata.type; - -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.cas.impl.CASImpl; -import org.apache.uima.cas.impl.FSGenerator; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.cas.impl.TypeImpl; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.impl.FeatureImpl; -import org.apache.uima.cas.Feature; -import org.apache.uima.jcas.tcas.DocumentAnnotation_Type; - -/** - * Updated by JCasGen Sun Nov 21 13:28:49 CET 2010 - * @generated */ -public class DocumentMetaData_Type extends DocumentAnnotation_Type { - /** @generated */ - @Override -protected FSGenerator getFSGenerator() {return fsGenerator;} - /** @generated */ - private final FSGenerator fsGenerator = - new FSGenerator() { - @Override - public FeatureStructure createFS(int addr, CASImpl cas) { - if (DocumentMetaData_Type.this.useExistingInstance) { - // Return eq fs instance if already created - FeatureStructure fs = DocumentMetaData_Type.this.jcas.getJfsFromCaddr(addr); - if (null == fs) { - fs = new DocumentMetaData(addr, DocumentMetaData_Type.this); - DocumentMetaData_Type.this.jcas.putJfsFromCaddr(addr, fs); - return fs; - } - return fs; - } else return new DocumentMetaData(addr, DocumentMetaData_Type.this); - } - }; - /** @generated */ - public final static int typeIndexID = DocumentMetaData.typeIndexID; - /** @generated - @modifiable */ - public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - - /** @generated */ - final Feature casFeat_documentTitle; - /** @generated */ - final int casFeatCode_documentTitle; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getDocumentTitle(int addr) { - if (featOkTst && casFeat_documentTitle == null) - jcas.throwFeatMissing("documentTitle", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - return ll_cas.ll_getStringValue(addr, casFeatCode_documentTitle); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setDocumentTitle(int addr, String v) { - if (featOkTst && casFeat_documentTitle == null) - jcas.throwFeatMissing("documentTitle", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - ll_cas.ll_setStringValue(addr, casFeatCode_documentTitle, v);} - - - - /** @generated */ - final Feature casFeat_documentId; - /** @generated */ - final int casFeatCode_documentId; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getDocumentId(int addr) { - if (featOkTst && casFeat_documentId == null) - jcas.throwFeatMissing("documentId", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - return ll_cas.ll_getStringValue(addr, casFeatCode_documentId); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setDocumentId(int addr, String v) { - if (featOkTst && casFeat_documentId == null) - jcas.throwFeatMissing("documentId", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - ll_cas.ll_setStringValue(addr, casFeatCode_documentId, v);} - - - - /** @generated */ - final Feature casFeat_documentUri; - /** @generated */ - final int casFeatCode_documentUri; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getDocumentUri(int addr) { - if (featOkTst && casFeat_documentUri == null) - jcas.throwFeatMissing("documentUri", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - return ll_cas.ll_getStringValue(addr, casFeatCode_documentUri); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setDocumentUri(int addr, String v) { - if (featOkTst && casFeat_documentUri == null) - jcas.throwFeatMissing("documentUri", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - ll_cas.ll_setStringValue(addr, casFeatCode_documentUri, v);} - - - - /** @generated */ - final Feature casFeat_collectionId; - /** @generated */ - final int casFeatCode_collectionId; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getCollectionId(int addr) { - if (featOkTst && casFeat_collectionId == null) - jcas.throwFeatMissing("collectionId", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - return ll_cas.ll_getStringValue(addr, casFeatCode_collectionId); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setCollectionId(int addr, String v) { - if (featOkTst && casFeat_collectionId == null) - jcas.throwFeatMissing("collectionId", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - ll_cas.ll_setStringValue(addr, casFeatCode_collectionId, v);} - - - - /** @generated */ - final Feature casFeat_documentBaseUri; - /** @generated */ - final int casFeatCode_documentBaseUri; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getDocumentBaseUri(int addr) { - if (featOkTst && casFeat_documentBaseUri == null) - jcas.throwFeatMissing("documentBaseUri", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - return ll_cas.ll_getStringValue(addr, casFeatCode_documentBaseUri); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setDocumentBaseUri(int addr, String v) { - if (featOkTst && casFeat_documentBaseUri == null) - jcas.throwFeatMissing("documentBaseUri", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - ll_cas.ll_setStringValue(addr, casFeatCode_documentBaseUri, v);} - - - - /** @generated */ - final Feature casFeat_isLastSegment; - /** @generated */ - final int casFeatCode_isLastSegment; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public boolean getIsLastSegment(int addr) { - if (featOkTst && casFeat_isLastSegment == null) - jcas.throwFeatMissing("isLastSegment", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - return ll_cas.ll_getBooleanValue(addr, casFeatCode_isLastSegment); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setIsLastSegment(int addr, boolean v) { - if (featOkTst && casFeat_isLastSegment == null) - jcas.throwFeatMissing("isLastSegment", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"); - ll_cas.ll_setBooleanValue(addr, casFeatCode_isLastSegment, v);} - - - - - - /** initialize variables to correspond with Cas Type and Features - * @generated - * @param jcas JCas - * @param casType Type - */ - public DocumentMetaData_Type(JCas jcas, Type casType) { - super(jcas, casType); - casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); - - - casFeat_documentTitle = jcas.getRequiredFeatureDE(casType, "documentTitle", "uima.cas.String", featOkTst); - casFeatCode_documentTitle = (null == casFeat_documentTitle) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_documentTitle).getCode(); - - - casFeat_documentId = jcas.getRequiredFeatureDE(casType, "documentId", "uima.cas.String", featOkTst); - casFeatCode_documentId = (null == casFeat_documentId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_documentId).getCode(); - - - casFeat_documentUri = jcas.getRequiredFeatureDE(casType, "documentUri", "uima.cas.String", featOkTst); - casFeatCode_documentUri = (null == casFeat_documentUri) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_documentUri).getCode(); - - - casFeat_collectionId = jcas.getRequiredFeatureDE(casType, "collectionId", "uima.cas.String", featOkTst); - casFeatCode_collectionId = (null == casFeat_collectionId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_collectionId).getCode(); - - - casFeat_documentBaseUri = jcas.getRequiredFeatureDE(casType, "documentBaseUri", "uima.cas.String", featOkTst); - casFeatCode_documentBaseUri = (null == casFeat_documentBaseUri) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_documentBaseUri).getCode(); - - - casFeat_isLastSegment = jcas.getRequiredFeatureDE(casType, "isLastSegment", "uima.cas.Boolean", featOkTst); - casFeatCode_isLastSegment = (null == casFeat_isLastSegment) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_isLastSegment).getCode(); - - } -} - - - - diff --git a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/AggregateTagset.java b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/AggregateTagset.java similarity index 97% rename from dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/AggregateTagset.java rename to dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/AggregateTagset.java index 9f6d6d7ca3..a58eba1119 100644 --- a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/AggregateTagset.java +++ b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/AggregateTagset.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.metadata; +package org.dkpro.core.api.metadata; import java.util.ArrayList; import java.util.HashMap; diff --git a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/SingletonTagset.java b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/SingletonTagset.java similarity index 97% rename from dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/SingletonTagset.java rename to dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/SingletonTagset.java index f9372e50db..864a1d90cf 100644 --- a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/SingletonTagset.java +++ b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/SingletonTagset.java @@ -15,14 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.metadata; +package org.dkpro.core.api.metadata; import static java.util.Collections.singletonMap; import java.util.Collection; import java.util.Map; import java.util.Map.Entry; - import java.util.Set; import java.util.TreeSet; diff --git a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/Tagset.java b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/Tagset.java similarity index 95% rename from dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/Tagset.java rename to dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/Tagset.java index 19ec560ef6..0a63f370d1 100644 --- a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/Tagset.java +++ b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/Tagset.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.metadata; +package org.dkpro.core.api.metadata; import java.util.Map; import java.util.Set; diff --git a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/TagsetBase.java b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/TagsetBase.java similarity index 97% rename from dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/TagsetBase.java rename to dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/TagsetBase.java index 81e21495dd..40347b17c5 100644 --- a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/TagsetBase.java +++ b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/TagsetBase.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.metadata; +package org.dkpro.core.api.metadata; import java.util.Map; import java.util.Map.Entry; diff --git a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/TagsetMetaData.java b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/TagsetMetaData.java similarity index 97% rename from dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/TagsetMetaData.java rename to dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/TagsetMetaData.java index 6d8a779a4c..d5d4647d90 100644 --- a/dkpro-core-api-metadata-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/TagsetMetaData.java +++ b/dkpro-core-api-metadata-asl/src/main/java/org/dkpro/core/api/metadata/TagsetMetaData.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.metadata; +package org.dkpro.core.api.metadata; public class TagsetMetaData { diff --git a/dkpro-core-api-metadata-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map b/dkpro-core-api-metadata-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map new file mode 100644 index 0000000000..b803567b5d --- /dev/null +++ b/dkpro-core-api-metadata-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map @@ -0,0 +1,4 @@ +# de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData +# de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField +# de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription +# de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription diff --git a/dkpro-core-api-metadata-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/MetaDataStringFieldTest.java b/dkpro-core-api-metadata-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/MetaDataStringFieldTest.java index 41b4993ad8..72f4890658 100644 --- a/dkpro-core-api-metadata-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/MetaDataStringFieldTest.java +++ b/dkpro-core-api-metadata-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/metadata/type/MetaDataStringFieldTest.java @@ -19,7 +19,8 @@ package de.tudarmstadt.ukp.dkpro.core.api.metadata.type; import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.util.Collection; @@ -30,7 +31,6 @@ public class MetaDataStringFieldTest { - @Test public void testSimple() throws UIMAException diff --git a/dkpro-core-api-metadata-asl/suppressions.xml b/dkpro-core-api-metadata-asl/suppressions.xml new file mode 100644 index 0000000000..0d4d2fd12c --- /dev/null +++ b/dkpro-core-api-metadata-asl/suppressions.xml @@ -0,0 +1,10 @@ + + + + + + + + diff --git a/dkpro-core-api-ner-asl/pom.xml b/dkpro-core-api-ner-asl/pom.xml index 034d91ba89..ffe0558608 100644 --- a/dkpro-core-api-ner-asl/pom.xml +++ b/dkpro-core-api-ner-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl jar - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + dkpro-core-api-ner-asl DKPro Core ASL - Named Entity Recognition API + https://dkpro.github.io/dkpro-core/ org.apache.uima diff --git a/dkpro-core-api-ner-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map b/dkpro-core-api-ner-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map new file mode 100644 index 0000000000..0dbeba033d --- /dev/null +++ b/dkpro-core-api-ner-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map @@ -0,0 +1 @@ +de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity=http://w3id.org/meta-share/omtd-share/NamedEntity diff --git a/dkpro-core-api-ner-asl/suppressions.xml b/dkpro-core-api-ner-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-ner-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-parameter-asl/pom.xml b/dkpro-core-api-parameter-asl/pom.xml index 6e8bc6e591..26e47b43d9 100644 --- a/dkpro-core-api-parameter-asl/pom.xml +++ b/dkpro-core-api-parameter-asl/pom.xml @@ -1,39 +1,40 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - jar - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl - DKPro Core ASL - Parameter API - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimafit-core - - + 4.0.0 + + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + jar + dkpro-core-api-parameter-asl + DKPro Core ASL - Parameter API + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + \ No newline at end of file diff --git a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/AnnotationChecker.java b/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/AnnotationChecker.java index 27b9ceec60..0d0b8c2fe4 100644 --- a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/AnnotationChecker.java +++ b/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/AnnotationChecker.java @@ -21,49 +21,63 @@ import org.apache.uima.analysis_component.AnalysisComponent; import org.apache.uima.cas.CAS; -import org.apache.uima.fit.internal.ExtendedLogger; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.TOP; import org.apache.uima.util.Level; +import org.apache.uima.util.Logger; -public class AnnotationChecker { +public class AnnotationChecker +{ + private static WeakHashMap instanceMapExists = new WeakHashMap<>(); + private static WeakHashMap instanceMapNotExists = + new WeakHashMap<>(); + + public static void requireExists(AnalysisComponent callingInstance, JCas jcas, Logger logger, + Class... types) + { + requireExists(callingInstance, jcas.getCas(), logger, types); + } + + public static void requireExists(AnalysisComponent callingInstance, CAS cas, Logger logger, + Class... types) + { + // we only want to check the first CAS + if (!instanceMapExists.containsKey(callingInstance)) { + instanceMapExists.put(callingInstance, true); + + for (Class type : types) { + if (CasUtil.select(cas, CasUtil.getType(cas, type.getName())).size() == 0) { + logger.log(Level.WARNING, + callingInstance.getClass().getName() + + " called but no annotation of type '" + type.getName() + + "' found in CAS."); + } + } + } + } + + public static void requireNotExists(AnalysisComponent callingInstance, JCas jcas, Logger logger, + Class... types) + { + requireNotExists(callingInstance, jcas.getCas(), logger, types); + } + + public static void requireNotExists(AnalysisComponent callingInstance, CAS cas, Logger logger, + Class... types) + { + // we only want to check the first CAS + if (!instanceMapNotExists.containsKey(callingInstance)) { + instanceMapNotExists.put(callingInstance, true); + + for (Class type : types) { + if (CasUtil.select(cas, CasUtil.getType(cas, type.getName())).size() > 0) { + logger.log(Level.WARNING, callingInstance.getClass().getName() + + " called, but annotations of type '" + type.getName() + + "' already present in CAS. This might lead to unintended side-effects."); + } + } + } + } - private static WeakHashMap instanceMapExists = new WeakHashMap<>(); - private static WeakHashMap instanceMapNotExists = new WeakHashMap<>(); - - public static void requireExists(AnalysisComponent callingInstance, JCas jcas, ExtendedLogger logger, Class ... types) { - requireExists(callingInstance, jcas.getCas(), logger, types); - } - - public static void requireExists(AnalysisComponent callingInstance, CAS cas, ExtendedLogger logger, Class ... types) { - // we only want to check the first CAS - if (!instanceMapExists.containsKey(callingInstance)) { - instanceMapExists.put(callingInstance, true); - - for (Class type : types) { - if (CasUtil.select(cas, CasUtil.getType(cas, type.getName())).size() == 0) { - logger.log(Level.WARNING, callingInstance.getClass().getName() + " called but no annotation of type '" + type.getName() + "' found in CAS."); - } - } - } - } - - public static void requireNotExists(AnalysisComponent callingInstance, JCas jcas, ExtendedLogger logger, Class ... types) { - requireNotExists(callingInstance, jcas.getCas(), logger, types); - } - - public static void requireNotExists(AnalysisComponent callingInstance, CAS cas, ExtendedLogger logger, Class ... types) { - // we only want to check the first CAS - if (!instanceMapNotExists.containsKey(callingInstance)) { - instanceMapNotExists.put(callingInstance, true); - - for (Class type : types) { - if (CasUtil.select(cas, CasUtil.getType(cas, type.getName())).size() > 0) { - logger.log(Level.WARNING, callingInstance.getClass().getName() + " called, but annotations of type '" + type.getName() + "' already present in CAS. This might lead to unintended side-effects."); - } - } - } - } - } diff --git a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/ComponentParameters.java b/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/ComponentParameters.java deleted file mode 100644 index a6b958aa48..0000000000 --- a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/ComponentParameters.java +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.parameter; - - -public final class ComponentParameters -{ - public static final String DEFAULT_ENCODING = "UTF-8"; - - - /** - * For analysis engines: Use this language instead of the document language to resolve the model - * and tag set mapping. - * - * For readers: Set this as the language of the produced documents. - */ - public static final String PARAM_LANGUAGE = "language"; - - /** - * Variant of the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_PATTERNS = "patterns"; - - /** - * Variant of a model the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_VARIANT = "modelVariant"; - - /** - * Regex to filter tags, e. g. named entities. Used by, e. g, named entity, trainers to train a model for just one - * model variant. - */ - public static final String PARAM_ACCEPTED_TAGS_REGEX = "acceptedTagsRegex"; - - /** - * Location from which the model is read. - */ - public static final String PARAM_MODEL_LOCATION = "modelLocation"; - - /** - * Location from which the segmentation model is read. - */ - public static final String PARAM_SEGMENTATION_MODEL_LOCATION = "segmentationModelLocation"; - - /** - * Location from which the tokenization model is read. - */ - public static final String PARAM_TOKENIZATION_MODEL_LOCATION = "tokenizationModelLocation"; - - /** - * The character encoding used by the model. - */ - public static final String PARAM_MODEL_ENCODING = "modelEncoding"; - - /** - * Location from which the input is read. - */ - public static final String PARAM_SOURCE_LOCATION = "sourceLocation"; - - /** - * Character encoding of the input data. - */ - public static final String PARAM_SOURCE_ENCODING = "sourceEncoding"; - - /** - * Location to which the output is written. - */ - public static final String PARAM_TARGET_LOCATION = "targetLocation"; - - /** - * Character encoding of the output data. - */ - public static final String PARAM_TARGET_ENCODING = "targetEncoding"; - - /** - * Use this filename extension. - */ - public static final String PARAM_FILENAME_EXTENSION = "filenameExtension"; - - /** - * Remove the original extension. - */ - public static final String PARAM_STRIP_EXTENSION = "stripExtension"; - - /** - * Log the tag set(s) when a model is loaded. - */ - public static final String PARAM_PRINT_TAGSET = "printTagSet"; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spamming the heap with thousands of strings representing only a few different tags. - */ - public static final String PARAM_INTERN_TAGS = "internTags"; - - /** - * When splitting an annotation into multiple parts, e.g. when splitting a token that is a - * compound word into multiple tokens, each representing a part of the word, this parameter - * controls if the original annotation is kept or removed. - */ - public static final String PARAM_DELETE_COVER = "deleteCover"; - - /** - * Maximal sentence length in tokens that is still being processed. - */ - public static final String PARAM_MAX_SENTENCE_LENGTH = "maxSentenceLength"; - - /** - * The number of threads to use for components that implement multi-threading - */ - public static final String PARAM_NUM_THREADS = "numThreads"; - /** - * Use smart number of threads if PARAM_NUM_THREADS is set to this value - */ - public static final String AUTO_NUM_THREADS = "0"; - - /** - * Compute the number of threads to use for components that can make use of multi-threading. - *
    - *
  • for positive values: use the given number of threads, with the number of available CPUs maximum.
  • - *
  • for negative value: use the number of available CPUs minus the given value, minimum 1.
  • - *
  • for {@link #AUTO_NUM_THREADS} (0): use the number of available CPUs minus one.
  • - *
- * - * @param value the user-proposed number of threads (positive, negative, or 0) - * @return the actual number of threads to use. - */ - public static int computeNumThreads(int value) - { - int cpus = Runtime.getRuntime().availableProcessors(); - - if (value > 0) { - return Math.min(cpus, value); - } - else if (value < 0) { - return Math.max(1, cpus + value); - } - else { - return Math.max(1, cpus - 1); - } - } - - // ============================================================================================= - // Annotation types - // ============================================================================================= - - private static final String PARAGRAPH = "Paragraph"; - - private static final String SENTENCE = "Sentence"; - - private static final String FORM = "Form"; - - private static final String TOKEN = "Token"; - - private static final String LEMMA = "Lemma"; - - private static final String POS = "POS"; - - private static final String CPOS = "CPOS"; - - private static final String MORPH = "Morph"; - - private static final String CHUNK = "Chunk"; - - private static final String CONSTITUENT = "Constituent"; - - private static final String COREFERENCE = "Coreference"; - - private static final String PENN_TREE = "PennTree"; - - private static final String DEPENDENCY = "Dependency"; - - private static final String NAMED_ENTITY = "NamedEntity"; - - // ============================================================================================= - // Verbs for parameters - // ============================================================================================= - - private static final String READ = "read"; - - private static final String WRITE = "write"; - - // ============================================================================================= - // Nouns for parameters - // ============================================================================================= - - private static final String TAG_SET = "TagSet"; - - private static final String MAPPING_LOCATION = "MappingLocation"; - - // ============================================================================================= - // Enable / disable reading or writing of particular annotation types. - // ============================================================================================= - - public static final String PARAM_READ_PARAGRAPH = READ + PARAGRAPH; - - public static final String PARAM_READ_SENTENCE = READ + SENTENCE; - - public static final String PARAM_READ_TOKEN = READ + TOKEN; - - public static final String PARAM_READ_FORM = READ + FORM; - - public static final String PARAM_READ_LEMMA = READ + LEMMA; - - public static final String PARAM_READ_POS = READ + POS; - - public static final String PARAM_READ_CPOS = READ + CPOS; - - public static final String PARAM_READ_CHUNK = READ + CHUNK; - - public static final String PARAM_READ_MORPH = READ + MORPH; - - public static final String PARAM_READ_CONSTITUENT = READ + CONSTITUENT; - - public static final String PARAM_READ_COREFERENCE = READ + COREFERENCE; - - public static final String PARAM_READ_PENN_TREE = READ + PENN_TREE; - - public static final String PARAM_READ_DEPENDENCY = READ + DEPENDENCY; - - public static final String PARAM_READ_NAMED_ENTITY = READ + NAMED_ENTITY; - - public static final String PARAM_WRITE_PARAGRAPH = WRITE + PARAGRAPH; - - public static final String PARAM_WRITE_SENTENCE = WRITE + SENTENCE; - - public static final String PARAM_WRITE_TOKEN = WRITE + TOKEN; - - public static final String PARAM_WRITE_FORM = WRITE + FORM; - - public static final String PARAM_WRITE_LEMMA = WRITE + LEMMA; - - public static final String PARAM_WRITE_POS = WRITE + POS; - - public static final String PARAM_WRITE_CPOS = WRITE + CPOS; - - public static final String PARAM_WRITE_CHUNK = WRITE + CHUNK; - - public static final String PARAM_WRITE_MORPH = WRITE + MORPH; - - public static final String PARAM_WRITE_CONSTITUENT = WRITE + CONSTITUENT; - - public static final String PARAM_WRITE_COREFERENCE = WRITE + COREFERENCE; - - public static final String PARAM_WRITE_PENN_TREE = WRITE + PENN_TREE; - - public static final String PARAM_WRITE_DEPENDENCY = WRITE + DEPENDENCY; - - public static final String PARAM_WRITE_NAMED_ENTITY = WRITE + NAMED_ENTITY; - - // ============================================================================================= - // Configure tag sets for different kinds of annotations. - // - // Not using the type constants here because they are capitalized for use with verbs - // ============================================================================================= - - /** - * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. This can be useful if a custom model is - * specified which does not have such meta data, or it can be used in readers. - */ - public static final String PARAM_POS_TAG_SET = POS + TAG_SET; - - /** - * Use this chunk tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. This can be useful if a custom model is - * specified which does not have such meta data, or it can be used in readers. - */ - public static final String PARAM_CHUNK_TAG_SET = CHUNK + TAG_SET; - - public static final String PARAM_CONSTITUENT_TAG_SET = CONSTITUENT + TAG_SET; - - public static final String PARAM_MORPH_TAG_SET = MORPH + TAG_SET; - - // ============================================================================================= - // Configure mapping of tags to annotation types for different kinds of annotations. - // - // Not using the type constants here because they are capitalized for use with verbs - // ============================================================================================= - - /** - * Location of the mapping file for part-of-speech tags to UIMA types. - */ - public static final String PARAM_POS_MAPPING_LOCATION = POS + MAPPING_LOCATION; - - /** - * Location of the mapping file for constituent tags to UIMA types. - */ - public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = CONSTITUENT + MAPPING_LOCATION; - - /** - * Location of the mapping file for chunk tags to UIMA types. - */ - public static final String PARAM_CHUNK_MAPPING_LOCATION = CHUNK + MAPPING_LOCATION; - - /** - * Location of the mapping file for named entity tags to UIMA types. - */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = NAMED_ENTITY + MAPPING_LOCATION; - - /** - * Location of the mapping file for morphological analysis strings to features. - */ - public static final String PARAM_MORPH_MAPPING_LOCATION = MORPH + MAPPING_LOCATION; - - /** - * Location of the mapping file for dependency tags to UIMA types. - */ - public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = DEPENDENCY + MAPPING_LOCATION; - - private ComponentParameters() - { - // No instances of this class - } -} diff --git a/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/AnnotationChecker.java b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/AnnotationChecker.java new file mode 100644 index 0000000000..3408f7f09c --- /dev/null +++ b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/AnnotationChecker.java @@ -0,0 +1,83 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.parameter; + +import java.util.Map; +import java.util.WeakHashMap; + +import org.apache.uima.analysis_component.AnalysisComponent; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.TOP; +import org.apache.uima.util.Level; +import org.apache.uima.util.Logger; + +public class AnnotationChecker +{ + private static Map instanceMapExists = new WeakHashMap<>(); + private static Map instanceMapNotExists = new WeakHashMap<>(); + + public static void requireExists(AnalysisComponent callingInstance, JCas jcas, + Logger logger, Class... types) + { + requireExists(callingInstance, jcas.getCas(), logger, types); + } + + public static void requireExists(AnalysisComponent callingInstance, CAS cas, + Logger logger, Class... types) + { + // we only want to check the first CAS + if (!instanceMapExists.containsKey(callingInstance)) { + instanceMapExists.put(callingInstance, true); + + for (Class type : types) { + if (CasUtil.select(cas, CasUtil.getType(cas, type.getName())).size() == 0) { + logger.log(Level.WARNING, + callingInstance.getClass().getName() + + " called but no annotation of type '" + type.getName() + + "' found in CAS."); + } + } + } + } + + public static void requireNotExists(AnalysisComponent callingInstance, JCas jcas, + Logger logger, Class... types) + { + requireNotExists(callingInstance, jcas.getCas(), logger, types); + } + + public static void requireNotExists(AnalysisComponent callingInstance, CAS cas, + Logger logger, Class... types) + { + // we only want to check the first CAS + if (!instanceMapNotExists.containsKey(callingInstance)) { + instanceMapNotExists.put(callingInstance, true); + + for (Class type : types) { + if (CasUtil.select(cas, CasUtil.getType(cas, type.getName())).size() > 0) { + logger.log(Level.WARNING, callingInstance.getClass().getName() + + " called, but annotations of type '" + type.getName() + + "' already present in CAS. This might lead to unintended side-effects."); + } + } + } + } + +} diff --git a/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/ComponentParameters.java b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/ComponentParameters.java new file mode 100644 index 0000000000..001059f24d --- /dev/null +++ b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/ComponentParameters.java @@ -0,0 +1,357 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.parameter; + +public final class ComponentParameters +{ + public static final String DEFAULT_ENCODING = "UTF-8"; + + /** + * For analysis engines: Use this language instead of the document language to resolve the model + * and tag set mapping. + * + * For readers: Set this as the language of the produced documents. + */ + public static final String PARAM_LANGUAGE = "language"; + + /** + * Variant of the model. Used to address a specific model if here are multiple models for one + * language. + */ + public static final String PARAM_PATTERNS = "patterns"; + + /** + * Variant of a model the model. Used to address a specific model if here are multiple models + * for one language. + */ + public static final String PARAM_VARIANT = "modelVariant"; + + /** + * Regex to filter tags, e. g. named entities. Used by, e. g, named entity, trainers to train a + * model for just one model variant. + */ + public static final String PARAM_ACCEPTED_TAGS_REGEX = "acceptedTagsRegex"; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = "modelArtifactUri"; + + /** + * Location from which the model is read. This is either a local path or a classpath location. + * In the latter case, the model artifact (if any) is searched as well. + */ + public static final String PARAM_MODEL_LOCATION = "modelLocation"; + + /** + * Location from which the segmentation model is read. + */ + public static final String PARAM_SEGMENTATION_MODEL_LOCATION = "segmentationModelLocation"; + + /** + * Location from which the tokenization model is read. + */ + public static final String PARAM_TOKENIZATION_MODEL_LOCATION = "tokenizationModelLocation"; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = "modelEncoding"; + + /** + * Location from which the input is read. + */ + public static final String PARAM_SOURCE_LOCATION = "sourceLocation"; + + /** + * Character encoding of the input data. + */ + public static final String PARAM_SOURCE_ENCODING = "sourceEncoding"; + + /** + * Location to which the output is written. + */ + public static final String PARAM_TARGET_LOCATION = "targetLocation"; + + /** + * Character encoding of the output data. + */ + public static final String PARAM_TARGET_ENCODING = "targetEncoding"; + + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = "filenameExtension"; + + /** + * Remove the original extension. + */ + public static final String PARAM_STRIP_EXTENSION = "stripExtension"; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = "printTagSet"; + + /** + * When splitting an annotation into multiple parts, e.g. when splitting a token that is a + * compound word into multiple tokens, each representing a part of the word, this parameter + * controls if the original annotation is kept or removed. + */ + public static final String PARAM_DELETE_COVER = "deleteCover"; + + /** + * Maximal sentence length in tokens that is still being processed. + */ + public static final String PARAM_MAX_SENTENCE_LENGTH = "maxSentenceLength"; + + /** + * The number of threads to use for components that implement multi-threading + */ + public static final String PARAM_NUM_THREADS = "numThreads"; + /** + * Use smart number of threads if PARAM_NUM_THREADS is set to this value + */ + public static final String AUTO_NUM_THREADS = "0"; + + /** + * Compute the number of threads to use for components that can make use of multi-threading. + *
    + *
  • for positive values: use the given number of threads, with the number of available CPUs + * maximum.
  • + *
  • for negative value: use the number of available CPUs minus the given value, minimum + * 1.
  • + *
  • for {@link #AUTO_NUM_THREADS} (0): use the number of available CPUs minus one.
  • + *
+ * + * @param value + * the user-proposed number of threads (positive, negative, or 0) + * @return the actual number of threads to use. + */ + public static int computeNumThreads(int value) + { + int cpus = Runtime.getRuntime().availableProcessors(); + + if (value > 0) { + return Math.min(cpus, value); + } + else if (value < 0) { + return Math.max(1, cpus + value); + } + else { + return Math.max(1, cpus - 1); + } + } + + // ============================================================================================= + // Annotation types + // ============================================================================================= + + private static final String PARAGRAPH = "Paragraph"; + + private static final String SENTENCE = "Sentence"; + + private static final String FORM = "Form"; + + private static final String TOKEN = "Token"; + + private static final String LEMMA = "Lemma"; + + private static final String POS = "POS"; + + private static final String CPOS = "CPOS"; + + private static final String MORPH = "Morph"; + + private static final String CHUNK = "Chunk"; + + private static final String CONSTITUENT = "Constituent"; + + private static final String COREFERENCE = "Coreference"; + + private static final String PENN_TREE = "PennTree"; + + private static final String DEPENDENCY = "Dependency"; + + private static final String NAMED_ENTITY = "NamedEntity"; + + private static final String SEMANTIC_PREDICATE = "SemPred"; + + // ============================================================================================= + // Verbs for parameters + // ============================================================================================= + + private static final String READ = "read"; + + private static final String WRITE = "write"; + + // ============================================================================================= + // Nouns for parameters + // ============================================================================================= + + private static final String TAG_SET = "TagSet"; + + private static final String MAPPING_LOCATION = "MappingLocation"; + + private static final String COVERED_TEXT = "CoveredText"; + + // ============================================================================================= + // Enable / disable reading or writing of particular annotation types. + // ============================================================================================= + + public static final String PARAM_READ_PARAGRAPH = READ + PARAGRAPH; + + public static final String PARAM_READ_SENTENCE = READ + SENTENCE; + + public static final String PARAM_READ_TOKEN = READ + TOKEN; + + public static final String PARAM_READ_FORM = READ + FORM; + + public static final String PARAM_READ_LEMMA = READ + LEMMA; + + public static final String PARAM_READ_POS = READ + POS; + + public static final String PARAM_READ_CPOS = READ + CPOS; + + public static final String PARAM_READ_CHUNK = READ + CHUNK; + + public static final String PARAM_READ_MORPH = READ + MORPH; + + public static final String PARAM_READ_CONSTITUENT = READ + CONSTITUENT; + + public static final String PARAM_READ_COREFERENCE = READ + COREFERENCE; + + public static final String PARAM_READ_PENN_TREE = READ + PENN_TREE; + + public static final String PARAM_READ_DEPENDENCY = READ + DEPENDENCY; + + public static final String PARAM_READ_NAMED_ENTITY = READ + NAMED_ENTITY; + + public static final String PARAM_READ_SEMANTIC_PREDICATE = READ + SEMANTIC_PREDICATE; + + public static final String PARAM_WRITE_PARAGRAPH = WRITE + PARAGRAPH; + + public static final String PARAM_WRITE_SENTENCE = WRITE + SENTENCE; + + public static final String PARAM_WRITE_TOKEN = WRITE + TOKEN; + + public static final String PARAM_WRITE_FORM = WRITE + FORM; + + public static final String PARAM_WRITE_LEMMA = WRITE + LEMMA; + + public static final String PARAM_WRITE_POS = WRITE + POS; + + public static final String PARAM_WRITE_CPOS = WRITE + CPOS; + + public static final String PARAM_WRITE_CHUNK = WRITE + CHUNK; + + public static final String PARAM_WRITE_MORPH = WRITE + MORPH; + + public static final String PARAM_WRITE_CONSTITUENT = WRITE + CONSTITUENT; + + public static final String PARAM_WRITE_COREFERENCE = WRITE + COREFERENCE; + + public static final String PARAM_WRITE_PENN_TREE = WRITE + PENN_TREE; + + public static final String PARAM_WRITE_DEPENDENCY = WRITE + DEPENDENCY; + + public static final String PARAM_WRITE_NAMED_ENTITY = WRITE + NAMED_ENTITY; + + public static final String PARAM_WRITE_COVERED_TEXT = WRITE + COVERED_TEXT; + + public static final String PARAM_WRITE_SEMANTIC_PREDICATE = WRITE + SEMANTIC_PREDICATE; + + // ============================================================================================= + // Configure tag sets for different kinds of annotations. + // + // Not using the type constants here because they are capitalized for use with verbs + // ============================================================================================= + + /** + * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the + * tag set defined as part of the model meta data. This can be useful if a custom model is + * specified which does not have such meta data, or it can be used in readers. + */ + public static final String PARAM_POS_TAG_SET = POS + TAG_SET; + + /** + * Use this chunk tag set to use to resolve the tag set mapping instead of using the tag set + * defined as part of the model meta data. This can be useful if a custom model is specified + * which does not have such meta data, or it can be used in readers. + */ + public static final String PARAM_CHUNK_TAG_SET = CHUNK + TAG_SET; + + public static final String PARAM_CONSTITUENT_TAG_SET = CONSTITUENT + TAG_SET; + + public static final String PARAM_MORPH_TAG_SET = MORPH + TAG_SET; + + // ============================================================================================= + // Configure mapping of tags to annotation types for different kinds of annotations. + // + // Not using the type constants here because they are capitalized for use with verbs + // ============================================================================================= + + public static final String DEFAULT_MAPPING_ENABLED = "true"; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = "mappingEnabled"; + + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = POS + MAPPING_LOCATION; + + /** + * Location of the mapping file for constituent tags to UIMA types. + */ + public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = CONSTITUENT + MAPPING_LOCATION; + + /** + * Location of the mapping file for chunk tags to UIMA types. + */ + public static final String PARAM_CHUNK_MAPPING_LOCATION = CHUNK + MAPPING_LOCATION; + + /** + * Location of the mapping file for named entity tags to UIMA types. + */ + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = NAMED_ENTITY + + MAPPING_LOCATION; + + /** + * Location of the mapping file for morphological analysis strings to features. + */ + public static final String PARAM_MORPH_MAPPING_LOCATION = MORPH + MAPPING_LOCATION; + + /** + * Location of the mapping file for dependency tags to UIMA types. + */ + public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = DEPENDENCY + MAPPING_LOCATION; + + private ComponentParameters() + { + // No instances of this class + } +} diff --git a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/Messages.java b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/Messages.java similarity index 94% rename from dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/Messages.java rename to dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/Messages.java index aef165cf58..a1c097bc0b 100644 --- a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/Messages.java +++ b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/Messages.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.parameter; +package org.dkpro.core.api.parameter; public final class Messages { diff --git a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/MimeTypes.java b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/MimeTypes.java similarity index 85% rename from dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/MimeTypes.java rename to dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/MimeTypes.java index 11e0789ff0..a1cb9ffbea 100644 --- a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/MimeTypes.java +++ b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/MimeTypes.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.parameter; +package org.dkpro.core.api.parameter; public final class MimeTypes { @@ -28,8 +28,10 @@ public final class MimeTypes public final static String APPLICATION_VND_XMI_XML = "application/vnd.xmi+xml"; - // DKPro application types + // DKPro Core application types public final static String APPLICATION_X_ANCORA_XML = "application/x.org.dkpro.ancora+xml"; + public final static String APPLICATION_X_BNC = "application/x.org.dkpro.bnc+xml"; + public final static String APPLICATION_X_BRAT = "application/x.org.dkpro.brat"; public final static String APPLICATION_X_DITOP = "application/x.org.dkpro.ditop"; public final static String APPLICATION_X_FANGORN = "application/x.org.dkpro.fangorn"; public final static String APPLICATION_X_GATE_XML = "application/x.org.dkpro.gate+xml"; @@ -39,9 +41,12 @@ public final class MimeTypes public final static String APPLICATION_X_UIMA_BINARY = "application/x.org.dkpro.uima+binary"; public final static String APPLICATION_X_LIF_JSON = "application/x.org.dkpro.lif+json"; public final static String APPLICATION_X_LXF_JSON = "application/x.org.dkpro.lxf+json"; + public final static String APPLICATION_X_PUB_ANNOTATION_JSON = "application/x.org.dkpro.pubannotation+json"; + public final static String APPLICATION_X_PERSEUS_XML = "application/x.org.dkpro.perseus+xml"; public final static String APPLICATION_X_NEGRA3 = "application/x.org.dkpro.negra3"; public final static String APPLICATION_X_NEGRA4 = "application/x.org.dkpro.negra4"; public final static String APPLICATION_X_NIF_TURTLE = "application/x.org.dkpro.nif+turtle"; + public final static String APPLICATION_X_NITF_XML = "application/x.org.dkpro.nitf+xml"; public final static String APPLICATION_X_UIMA_RDF = "application/x.org.dkpro.uima+rdf"; public final static String APPLICATION_X_REUTERS21578_SGML = "application/x.org.dkpro.reuters21578+sgml"; public final static String APPLICATION_X_TGREP2 = "application/x.org.dkpro.tgrep2"; @@ -49,6 +54,8 @@ public final class MimeTypes public final static String APPLICATION_X_SEMEVAL_2010_XML = "application/x.org.dkpro.semeval-2010+xml"; public final static String APPLICATION_X_TUEPP_XML = "application/x.org.dkpro.tuepp+xml"; public final static String APPLICATION_X_TUEBADZ_CHUNK = "application/x.org.dkpro.tuebadz-chunk"; + public final static String APPLICATION_X_XCES = "application/x.org.dkpro.xces+xml"; + public final static String APPLICATION_X_XCES_BASIC = "application/x.org.dkpro.xces-basic+xml"; // Standard text types (http://www.iana.org/assignments/media-types/media-types.xhtml) public final static String TEXT_CSV = "text/csv"; @@ -61,7 +68,7 @@ public final class MimeTypes // Non-standard text types public final static String TEXT_TCF = "text/tcf+xml"; - // DKPro text types + // DKPro Core text types public final static String TEXT_X_CONLL_2000 = "text/x.org.dkpro.conll-2000"; public final static String TEXT_X_CONLL_2002 = "text/x.org.dkpro.conll-2002"; public final static String TEXT_X_CONLL_2003 = "text/x.org.dkpro.conll-2003"; @@ -69,6 +76,7 @@ public final class MimeTypes public final static String TEXT_X_CONLL_2008 = "text/x.org.dkpro.conll-2008"; public final static String TEXT_X_CONLL_2009 = "text/x.org.dkpro.conll-2009"; public final static String TEXT_X_CONLL_2012 = "text/x.org.dkpro.conll-2012"; + public final static String TEXT_X_CONLL_CORENLP = "text/x.org.dkpro.conll-corenpl"; public final static String TEXT_X_CONLL_U = "text/x.org.dkpro.conll-u"; public final static String TEXT_X_IMSCWB = "text/x.org.dkpro.imscwb"; public final static String TEXT_X_GERMEVAL_2014 = "text/x.org.dkpro.germeval-2014"; @@ -77,6 +85,7 @@ public final class MimeTypes public final static String TEXT_X_PTB_CHUNKED = "text/x.org.dkpro.ptb-chunked"; public final static String TEXT_X_PTB_COMBINED = "text/x.org.dkpro.ptb-combined"; public final static String TEXT_X_REUTERS21578 = "text/x.org.dkpro.reuters21578"; + public final static String TEXT_X_WEBANNO_TSV3 = "text/x.org.dkpro.webanno-tsv3"; // OpenNLP model types public final static String APPLICATION_X_OPENNLP_CHUNK = "application/x.org.dkpro.core.opennlp.chunk"; @@ -94,6 +103,9 @@ public final class MimeTypes // LingPipe model types public final static String APPLICATION_X_LINGPIPE_NER = "application/x.org.dkpro.core.lingpipe.ner"; + // ArkTweet model types + public final static String APPLICATION_X_ARKTWEET_TAGGER = "application/x.org.dkpro.core.arktweet.tagger"; + private MimeTypes() { // No instances diff --git a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/ResourceParameter.java b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/ResourceParameter.java similarity index 95% rename from dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/ResourceParameter.java rename to dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/ResourceParameter.java index aadc3d3b4a..e7a7e7f6b6 100644 --- a/dkpro-core-api-parameter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/parameter/ResourceParameter.java +++ b/dkpro-core-api-parameter-asl/src/main/java/org/dkpro/core/api/parameter/ResourceParameter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.parameter; +package org.dkpro.core.api.parameter; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/dkpro-core-api-parameter-asl/src/main/resources/META-INF/eu.openminted.share/mimeTypeMapping.map b/dkpro-core-api-parameter-asl/src/main/resources/META-INF/eu.openminted.share/mimeTypeMapping.map new file mode 100644 index 0000000000..c98a230fb4 --- /dev/null +++ b/dkpro-core-api-parameter-asl/src/main/resources/META-INF/eu.openminted.share/mimeTypeMapping.map @@ -0,0 +1,55 @@ +application/pdf=http://w3id.org/meta-share/omtd-share/Pdf +application/rtf=http://w3id.org/meta-share/omtd-share/Rtf +application/tei+xml=http://w3id.org/meta-share/omtd-share/Tei +application/vnd.xmi+xml=http://w3id.org/meta-share/omtd-share/Xmi +#application/x.org.dkpro.ancora+xml +application/x.org.dkpro.bnc+xml=http://w3id.org/meta-share/omtd-share/BncFormat +application/x.org.dkpro.brat=http://w3id.org/meta-share/omtd-share/Brat +#application/x.org.dkpro.core.lingpipe.ner +#application/x.org.dkpro.core.opennlp.chunk +#application/x.org.dkpro.core.opennlp.lemma +#application/x.org.dkpro.core.opennlp.ner +#application/x.org.dkpro.core.opennlp.sent +#application/x.org.dkpro.core.opennlp.tagger= +#application/x.org.dkpro.core.opennlp.token= +#application/x.org.dkpro.core.stanfordnlp.ner= +#application/x.org.dkpro.core.stanfordnlp.tagger= +#application/x.org.dkpro.ditop= +application/x.org.dkpro.gate+xml=http://w3id.org/meta-share/omtd-share/GateXml +#application/x.org.dkpro.lif+json= +#application/x.org.dkpro.lxf+json= +application/x.org.dkpro.negra3=http://w3id.org/meta-share/omtd-share/NegraExport +application/x.org.dkpro.negra4=http://w3id.org/meta-share/omtd-share/NegraExport +application/x.org.dkpro.nif+turtle=http://w3id.org/meta-share/omtd-share/Nif +application/x.org.dkpro.reuters21578+sgml=http://w3id.org/meta-share/omtd-share/Reuters21578Sgml +#application/x.org.dkpro.semeval-2010+xml= +application/x.org.dkpro.tgrep2=http://w3id.org/meta-share/omtd-share/Tgrep2 +application/x.org.dkpro.tiger+xml=http://w3id.org/meta-share/omtd-share/TigerXml +#application/x.org.dkpro.tuebadz-chunk= +application/x.org.dkpro.tuepp+xml=http://w3id.org/meta-share/omtd-share/Tuepp +application/x.org.dkpro.uima+binary=http://w3id.org/meta-share/omtd-share/BinaryCas +application/x.org.dkpro.uima+json=http://w3id.org/meta-share/omtd-share/Uima_json +application/x.org.dkpro.uima+xmi=http://w3id.org/meta-share/omtd-share/UimaCasFormat +application/x.org.dkpro.xces+xml=http://w3id.org/meta-share/omtd-share/Xces +application/x.org.dkpro.xces-basic+xml=http://w3id.org/meta-share/omtd-share/XcesIlspVariant +application/xhtml+xml=http://w3id.org/meta-share/omtd-share/Xhtml +#application/xml= +text/html=http://w3id.org/meta-share/omtd-share/Html +text/plain=http://w3id.org/meta-share/omtd-share/Text +text/rtf=http://w3id.org/meta-share/omtd-share/Rtf +text/tcf+xml=http://w3id.org/meta-share/omtd-share/Tcf +text/x.org.dkpro.conll-2000=http://w3id.org/meta-share/omtd-share/Conll2000 +text/x.org.dkpro.conll-2002=http://w3id.org/meta-share/omtd-share/Conll2002 +text/x.org.dkpro.conll-2003=http://w3id.org/meta-share/omtd-share/Conll2003 +text/x.org.dkpro.conll-2006=http://w3id.org/meta-share/omtd-share/Conll2006 +text/x.org.dkpro.conll-2008=http://w3id.org/meta-share/omtd-share/Conll2008 +text/x.org.dkpro.conll-2009=http://w3id.org/meta-share/omtd-share/Conll2009 +text/x.org.dkpro.conll-2012=http://w3id.org/meta-share/omtd-share/Conll2012 +text/x.org.dkpro.conll-u=http://w3id.org/meta-share/omtd-share/ConllU +#text/x.org.dkpro.germeval-2014= +text/x.org.dkpro.imscwb=http://w3id.org/meta-share/omtd-share/Imscwb +#text/x.org.dkpro.ngram= +text/x.org.dkpro.ptb-chunked=http://w3id.org/meta-share/omtd-share/PtbChunked +text/x.org.dkpro.ptb-combined=http://w3id.org/meta-share/omtd-share/PtbCombined +text/x.org.dkpro.reuters21578=http://w3id.org/meta-share/omtd-share/Reuters21578Txt +# text/xml= diff --git a/dkpro-core-api-parameter-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/parameter/Messages.properties b/dkpro-core-api-parameter-asl/src/main/resources/org/dkpro/core/api/parameter/Messages.properties similarity index 100% rename from dkpro-core-api-parameter-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/parameter/Messages.properties rename to dkpro-core-api-parameter-asl/src/main/resources/org/dkpro/core/api/parameter/Messages.properties diff --git a/dkpro-core-api-phonetics-asl/pom.xml b/dkpro-core-api-phonetics-asl/pom.xml index bda92372ff..e9dba5f2e4 100644 --- a/dkpro-core-api-phonetics-asl/pom.xml +++ b/dkpro-core-api-phonetics-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.api.phonetics-asl + dkpro-core-api-phonetics-asl jar DKPro Core ASL - Phonetics API + https://dkpro.github.io/dkpro-core/ org.apache.uima diff --git a/dkpro-core-api-phonetics-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/package-info.java b/dkpro-core-api-phonetics-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/package-info.java deleted file mode 100644 index 56d191d77b..0000000000 --- a/dkpro-core-api-phonetics-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Types for the phonetic annotation level. - * - * @since 1.5.0 - */ -package de.tudarmstadt.ukp.dkpro.core.api.phonetics; \ No newline at end of file diff --git a/dkpro-core-api-phonetics-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/util/SoundUtils.java b/dkpro-core-api-phonetics-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/util/SoundUtils.java deleted file mode 100644 index 8b199aa93f..0000000000 --- a/dkpro-core-api-phonetics-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/util/SoundUtils.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.api.phonetics.util; - -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -public class SoundUtils -{ - public static int differenceEncoded(String es1, String es2) { - - if (es1 == null || es2 == null) { - return 0; - } - int lengthToMatch = Math.min(es1.length(), es2.length()); - int diff = 0; - for (int i = 0; i < lengthToMatch; i++) { - if (es1.charAt(i) == es2.charAt(i)) { - diff++; - } - } - return diff; - } - - /** - * Converts an Arpabet phonemic transcription to an IPA phonemic - * transcription. Note that, somewhat unusually, the stress symbol will - * precede the vowel rather than the syllable. This is because Arpabet does - * not mark syllable boundaries. - * - * @param s - * The Darpabet phonemic transcription to convert. - * @return The IPA equivalent of s. - * @throws IllegalArgumentException if a phoneme is unknown. - */ - public static String arpabetToIPA(String s) throws IllegalArgumentException { - String[] arpaPhonemes = s.trim().split("[ \\t]+"); - StringBuffer ipaPhonemes = new StringBuffer(s.length()); - - for (String arpaPhoneme : arpaPhonemes) { - char stressChar = arpaPhoneme.charAt(arpaPhoneme.length() - 1); - if (stressChar == '0' || stressChar == '1' || stressChar == '2') { - arpaPhoneme = arpaPhoneme.substring(0, arpaPhoneme.length() - 1); - ipaPhonemes.append(arpabetMap.get(Character.toString(stressChar))); - } - - String ipaPhoneme = arpabetMap.get(arpaPhoneme); - if (ipaPhoneme == null) { - throw new IllegalArgumentException(); - } - ipaPhonemes.append(ipaPhoneme); - } - - return ipaPhonemes.toString(); - } - - private static final Map arpabetMap; - static { - Map aMap = new HashMap(); - aMap.put("0", ""); - aMap.put("1", "ˈ"); - aMap.put("2", "ˌ"); - aMap.put("AA", "ɑ"); - aMap.put("AE", "æ"); - aMap.put("AH", "ʌ"); - aMap.put("AO", "ɔ"); - aMap.put("AW", "aʊ"); - aMap.put("AX", "ə"); - aMap.put("AY", "aɪ"); - aMap.put("B", "b"); - aMap.put("CH", "tʃ"); - aMap.put("D", "d"); - aMap.put("DH", "ð"); - aMap.put("DX", "?"); - aMap.put("EH", "ɛ"); - aMap.put("ER", "ɚ"); - aMap.put("EY", "eɪ"); - aMap.put("F", "f"); - aMap.put("G", "?"); - aMap.put("HH", "h"); - aMap.put("IH", "ɪ"); - aMap.put("IY", "i"); - aMap.put("JH", "dʒ"); - aMap.put("K", "k"); - aMap.put("L", "l"); - aMap.put("M", "m"); - aMap.put("NG", "ŋ"); - aMap.put("N", "n"); - aMap.put("OW", "oʊ"); - aMap.put("OY", "ɔɪ"); - aMap.put("P", "p"); - aMap.put("R", "ɹ"); - aMap.put("SH", "ʃ"); - aMap.put("S", "s"); - aMap.put("TH", "θ"); - aMap.put("T", "t"); - aMap.put("UH", "ʊ"); - aMap.put("UW", "u"); - aMap.put("V", "v"); - aMap.put("W", "w"); - aMap.put("Y", "j"); - aMap.put("ZH", "ʒ"); - aMap.put("Z", "z"); - arpabetMap = Collections.unmodifiableMap(aMap); - } -} \ No newline at end of file diff --git a/dkpro-core-api-phonetics-asl/src/main/java/org/dkpro/core/api/phonetics/package-info.java b/dkpro-core-api-phonetics-asl/src/main/java/org/dkpro/core/api/phonetics/package-info.java new file mode 100644 index 0000000000..2ea3d85830 --- /dev/null +++ b/dkpro-core-api-phonetics-asl/src/main/java/org/dkpro/core/api/phonetics/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Types for the phonetic annotation level. + * + * @since 1.5.0 + */ +package org.dkpro.core.api.phonetics; diff --git a/dkpro-core-api-phonetics-asl/src/main/java/org/dkpro/core/api/phonetics/util/SoundUtils.java b/dkpro-core-api-phonetics-asl/src/main/java/org/dkpro/core/api/phonetics/util/SoundUtils.java new file mode 100644 index 0000000000..d0f8db47bd --- /dev/null +++ b/dkpro-core-api-phonetics-asl/src/main/java/org/dkpro/core/api/phonetics/util/SoundUtils.java @@ -0,0 +1,123 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.api.phonetics.util; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +public class SoundUtils +{ + public static int differenceEncoded(String es1, String es2) + { + if (es1 == null || es2 == null) { + return 0; + } + int lengthToMatch = Math.min(es1.length(), es2.length()); + int diff = 0; + for (int i = 0; i < lengthToMatch; i++) { + if (es1.charAt(i) == es2.charAt(i)) { + diff++; + } + } + return diff; + } + + /** + * Converts an Arpabet phonemic transcription to an IPA phonemic transcription. Note that, + * somewhat unusually, the stress symbol will precede the vowel rather than the syllable. This + * is because Arpabet does not mark syllable boundaries. + * + * @param s + * The Darpabet phonemic transcription to convert. + * @return The IPA equivalent of s. + * @throws IllegalArgumentException + * if a phoneme is unknown. + */ + public static String arpabetToIPA(String s) throws IllegalArgumentException + { + String[] arpaPhonemes = s.trim().split("[ \\t]+"); + StringBuffer ipaPhonemes = new StringBuffer(s.length()); + + for (String arpaPhoneme : arpaPhonemes) { + char stressChar = arpaPhoneme.charAt(arpaPhoneme.length() - 1); + if (stressChar == '0' || stressChar == '1' || stressChar == '2') { + arpaPhoneme = arpaPhoneme.substring(0, arpaPhoneme.length() - 1); + ipaPhonemes.append(arpabetMap.get(Character.toString(stressChar))); + } + + String ipaPhoneme = arpabetMap.get(arpaPhoneme); + if (ipaPhoneme == null) { + throw new IllegalArgumentException(); + } + ipaPhonemes.append(ipaPhoneme); + } + + return ipaPhonemes.toString(); + } + + private static final Map arpabetMap; + static { + Map aMap = new HashMap(); + aMap.put("0", ""); + aMap.put("1", "ˈ"); + aMap.put("2", "ˌ"); + aMap.put("AA", "ɑ"); + aMap.put("AE", "æ"); + aMap.put("AH", "ʌ"); + aMap.put("AO", "ɔ"); + aMap.put("AW", "aʊ"); + aMap.put("AX", "ə"); + aMap.put("AY", "aɪ"); + aMap.put("B", "b"); + aMap.put("CH", "tʃ"); + aMap.put("D", "d"); + aMap.put("DH", "ð"); + aMap.put("DX", "?"); + aMap.put("EH", "ɛ"); + aMap.put("ER", "ɚ"); + aMap.put("EY", "eɪ"); + aMap.put("F", "f"); + aMap.put("G", "?"); + aMap.put("HH", "h"); + aMap.put("IH", "ɪ"); + aMap.put("IY", "i"); + aMap.put("JH", "dʒ"); + aMap.put("K", "k"); + aMap.put("L", "l"); + aMap.put("M", "m"); + aMap.put("NG", "ŋ"); + aMap.put("N", "n"); + aMap.put("OW", "oʊ"); + aMap.put("OY", "ɔɪ"); + aMap.put("P", "p"); + aMap.put("R", "ɹ"); + aMap.put("SH", "ʃ"); + aMap.put("S", "s"); + aMap.put("TH", "θ"); + aMap.put("T", "t"); + aMap.put("UH", "ʊ"); + aMap.put("UW", "u"); + aMap.put("V", "v"); + aMap.put("W", "w"); + aMap.put("Y", "j"); + aMap.put("ZH", "ʒ"); + aMap.put("Z", "z"); + arpabetMap = Collections.unmodifiableMap(aMap); + } +} diff --git a/dkpro-core-api-phonetics-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/util/SoundUtilsTest.java b/dkpro-core-api-phonetics-asl/src/test/java/org/dkpro/core/api/phonetics/util/SoundUtilsTest.java similarity index 90% rename from dkpro-core-api-phonetics-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/util/SoundUtilsTest.java rename to dkpro-core-api-phonetics-asl/src/test/java/org/dkpro/core/api/phonetics/util/SoundUtilsTest.java index b16665ef8c..bc595e8fbc 100644 --- a/dkpro-core-api-phonetics-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/phonetics/util/SoundUtilsTest.java +++ b/dkpro-core-api-phonetics-asl/src/test/java/org/dkpro/core/api/phonetics/util/SoundUtilsTest.java @@ -1,32 +1,32 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.phonetics.util; - -import static org.junit.Assert.assertEquals; - -import org.junit.Test; - - -public class SoundUtilsTest -{ - @Test - public void arpabetToIPATest() throws Exception - { - assertEquals("ˈɛndˌɪnɚkwˈoʊt", SoundUtils.arpabetToIPA(" EH1 N D IH2 N ER0 K W OW1 T ")); - } -} +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.phonetics.util; + +import static org.junit.Assert.assertEquals; + +import org.dkpro.core.api.phonetics.util.SoundUtils; +import org.junit.Test; + +public class SoundUtilsTest +{ + @Test + public void arpabetToIPATest() throws Exception + { + assertEquals("ˈɛndˌɪnɚkwˈoʊt", SoundUtils.arpabetToIPA(" EH1 N D IH2 N ER0 K W OW1 T ")); + } +} diff --git a/dkpro-core-api-phonetics-asl/suppressions.xml b/dkpro-core-api-phonetics-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-phonetics-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-resources-asl/pom.xml b/dkpro-core-api-resources-asl/pom.xml index 62f483f66c..531862ce85 100644 --- a/dkpro-core-api-resources-asl/pom.xml +++ b/dkpro-core-api-resources-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + dkpro-core-api-resources-asl jar DKPro Core ASL - Resources API + https://dkpro.github.io/dkpro-core/ 0.9.0.M2 3.1.0 @@ -66,16 +67,16 @@ 3.0.5 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl org.apache.ivy ivy - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl org.codehaus.plexus @@ -93,8 +94,8 @@ test - org.hamcrest - hamcrest-core + org.assertj + assertj-core test diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CasConfigurableStreamProviderBase.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CasConfigurableStreamProviderBase.java deleted file mode 100644 index 52b342591c..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CasConfigurableStreamProviderBase.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; - -/** - * CAS-configurable provider produces a resource from a stream instead of an URL. The provider - * implementation does not have to care about opening/closing the stream. - * - * - * @param the type of resource to produce. - */ -public abstract class CasConfigurableStreamProviderBase - extends CasConfigurableProviderBase -{ - @Override - protected M produceResource(URL aUrl) - throws IOException - { - InputStream is = null; - try { - is = aUrl.openStream(); - return produceResource(is); - } - catch (IOException e) { - throw e; - } - catch (Exception e) { - throw new IOException(e); - } - finally { - closeQuietly(is); - } - } - - protected abstract M produceResource(InputStream aStream) throws Exception; - -} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionMethod.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionMethod.java deleted file mode 100644 index f8807b6edf..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionMethod.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -/** - * Compression methods. - * - */ -public enum CompressionMethod -{ - NONE(""), - GZIP(".gz"), - BZIP2(".bz2"), - XZ(".xz"); - - private String extension; - - private CompressionMethod(String aExtension) - { - extension = aExtension; - } - - public String getExtension() - { - return extension; - } -} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionUtils.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionUtils.java deleted file mode 100644 index dd7b600be8..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionUtils.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -import static org.apache.commons.io.FileUtils.forceMkdir; -import static de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod.*; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; -import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; -import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; - -/** - * Utility methods for dealing with compressed data. - * - */ -public class CompressionUtils -{ - public static String stripCompressionExtension(String aLocation) - { - String lcLocation = aLocation.toLowerCase(); - if (lcLocation.endsWith(GZIP.getExtension())) { - return aLocation.substring(0, GZIP.getExtension().length()); - } - else if (lcLocation.endsWith(BZIP2.getExtension()) || lcLocation.endsWith(".bzip2")) { - return aLocation.substring(0, BZIP2.getExtension().length()); - } - else if (lcLocation.endsWith(".bzip2")) { - return aLocation.substring(0, ".bzip2".length()); - } - else if (lcLocation.endsWith(XZ.getExtension())) { - return aLocation.substring(0, XZ.getExtension().length()); - } - else { - return aLocation; - } - } - - /** - * Get an uncompressed input stream for a given input stream created for a particular location. - * - * @param aLocation - * a resource location (e.g. a path, URL, etc.) - * @param aStream - * a raw stream of potentially compressed data. - * @return stream wrapped with a decompressing stream. - * @throws IOException if an I/O error has occurred, - */ - public static InputStream getInputStream(String aLocation, InputStream aStream) - throws IOException - { - String lcLocation = aLocation.toLowerCase(); - if (lcLocation.endsWith(GZIP.getExtension())) { - return new GZIPInputStream(aStream); - } - else if (lcLocation.endsWith(BZIP2.getExtension()) || lcLocation.endsWith(".bzip2")) { - return new BZip2CompressorInputStream(aStream); - } - else if (lcLocation.endsWith(XZ.getExtension())) { - return new XZCompressorInputStream(aStream); - } - else { - return aStream; - } - } - - /** - * Make sure the target directory exists and get a stream writing to the specified file within. - * If the file name ends with a typical extension for compressed files, the stream will be - * compressed. - * - * @param aFile - * the target file. - * @return a stream to write to. - * @throws IOException if an I/O error has occurred, - * @see CompressionMethod - */ - public static OutputStream getOutputStream(File aFile) - throws IOException - { - // Create parent folders for output file and set up stream - if (aFile.getParentFile() != null) { - forceMkdir(aFile.getParentFile()); - } - - String lcFilename = aFile.getName().toLowerCase(); - - OutputStream os = new FileOutputStream(aFile); - if (lcFilename.endsWith(GZIP.getExtension())) { - os = new GZIPOutputStream(os); - } - else if (lcFilename.endsWith(BZIP2.getExtension()) || lcFilename.endsWith(".bzip2")) { - os = new BZip2CompressorOutputStream(os); - } - else if (lcFilename.endsWith(XZ.getExtension())) { - os = new XZCompressorOutputStream(os); - } - return os; - } -} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/DkproContext.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/DkproContext.java deleted file mode 100644 index ceb117b78c..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/DkproContext.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -import java.io.File; -import java.io.IOException; - -/** - * Runtime context for DKPro. - * - */ -public -class DkproContext -{ - public static final String ENV_DKPRO_HOME = "DKPRO_HOME"; - public static final String DEFAULT_ENCODING = "UTF-8"; - - private static DkproContext context; - - /** - * The the current context. - * - * @return the context. - */ - public static synchronized - DkproContext getContext() - { - if (context == null) { - context = new DkproContext(); - } - return context; - } - - /** - * Get the workspace directory. - * - * @return the workspace directory. - * @throws IOException if the workspace cannot be obtained - */ - public - File getWorkspace() - throws IOException - { - if (System.getenv(ENV_DKPRO_HOME) != null) { - File f = new File(System.getenv(ENV_DKPRO_HOME)); - f.mkdirs(); - return f; - } - - throw new IOException("Environment variable ["+ENV_DKPRO_HOME+"] not set"); - } - - /** - * Get the workspace directory for a particular class. - * - * @param aClass a class. - * @return the class workspace. - * @throws IOException if the workspace cannot be obtained. - */ - public - File getWorkspace( - final Class aClass) - throws IOException - { - return getWorkspace(aClass.getName()); - } - - /** - * Get the workspace directory for a particular topic. - * - * @param aTopic the topic ID. - * @return the topic workspace. - * @throws IOException if the workspace cannot be obtained. - */ - public - File getWorkspace( - final String aTopic) - throws IOException - { - File f = new File(getWorkspace(), aTopic); - f.mkdirs(); - return f; - } -} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProvider.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProvider.java deleted file mode 100644 index d7ce3449f0..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProvider.java +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -import static de.tudarmstadt.ukp.dkpro.core.api.resources.MappingUtils.META_OVERRIDE; -import static de.tudarmstadt.ukp.dkpro.core.api.resources.MappingUtils.META_REDIRECT; -import static de.tudarmstadt.ukp.dkpro.core.api.resources.MappingUtils.META_TYPE_BASE; -import static de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils.resolveLocation; - -import java.io.IOException; -import java.net.URL; -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; -import java.util.Set; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.springframework.core.io.UrlResource; -import org.springframework.core.io.support.PropertiesLoaderUtils; - -public class MappingProvider extends CasConfigurableProviderBase> -{ - // private final Log log = LogFactory.getLog(getClass()); - - public static final String BASE_TYPE = "baseType"; - - private TypeSystem typeSystem; - private boolean notFound = false; - - private Map tagMappings; - - private Map tagMappingImports = new HashMap<>(); - - @Override - public void configure(CAS aCas) throws AnalysisEngineProcessException - { - typeSystem = aCas.getTypeSystem(); - - // Tag mappings can exist independently from the type mappings because tag mappings - // are configured in the model metadata - tagMappings = new HashMap<>(); - for (Entry imp : tagMappingImports.entrySet()) { - String prefix = imp.getKey() + ".tag.map."; - Properties props = imp.getValue().getResourceMetaData(); - for (String key : props.stringPropertyNames()) { - if (key.startsWith(prefix)) { - String originalTag = key.substring(prefix.length()); - String mappedTag = props.getProperty(key); - tagMappings.put(originalTag, mappedTag); - } - } - } - - // Try loading the type mappings - try { - notFound = false; - super.configure(aCas); - } - catch (AnalysisEngineProcessException e) { - if(getOverride(LOCATION)!=null){ - throw e; - } - notFound = true; - } - } - - public String getTag(String aTag) - { - String tag = aTag; - - // Apply tag mapping if configured - if (tagMappings != null) { - String t = tagMappings.get(aTag); - if (t != null) { - tag = t; - } - } - - return tag; - } - - /** - * Get the type for the given tag. - * - * @param aTag a tag. - * @return the type - * @throws IllegalStateException if the type could not be located - */ - public Type getTagType(String aTag) - { - String type = getTagTypeName(aTag); - - Type uimaType = typeSystem.getType(type); - - if (uimaType == null) { - throw new IllegalStateException("Type [" + type + "] mapped to tag [" + aTag - + "] is not defined in type system"); - } - - return uimaType; - } - - /** - * Get the type for the given tag. - * - * @param aTag a tag. - * @return the type - * @throws IllegalStateException if the type could not be located - */ - public String getTagTypeName(String aTag) - { - String type; - if (notFound) { - type = getDefault(BASE_TYPE); - if (type == null) { - throw new IllegalStateException("No base type defined!"); - } - } - else { - String tag = getTag(aTag); - - type = getResource().get(tag); - if (type == null) { - type = getResource().get("*"); - } - if (type == null) { - throw new IllegalStateException("No fallback (*) mapping defined!"); - } - - String basePackage = getResource().get(META_TYPE_BASE); - if (basePackage != null) { - type = basePackage + type; - } - } - - return type; - } - - public Set getTags() - { - return MappingUtils.stripMetadata(getResource().keySet()); - } - - @Override - protected Map produceResource(URL aUrl) throws IOException - { - if (aUrl != null) { - Map mapping = new HashMap(); - Properties props = PropertiesLoaderUtils.loadProperties(new UrlResource(aUrl)); - for (String key : props.stringPropertyNames()) { - mapping.put(key.trim(), props.getProperty(key).trim()); - } - return mapping; - } - else { - return null; - } - } - - @Override - protected URL followRedirects(URL aUrl) throws IOException - { - URL url = aUrl; - while (true) { - Properties tmpResourceMetaData = PropertiesLoaderUtils.loadProperties(new UrlResource( - url)); - - // Values in the redirecting properties override values in the redirected-to - // properties - except META_REDIRECT - getResourceMetaData().remove(META_REDIRECT); - - Properties overrides = new Properties(); - for (String key : tmpResourceMetaData.stringPropertyNames()) { - if (key.startsWith(META_OVERRIDE)) { - overrides.put(key.substring(META_OVERRIDE.length()+1), - tmpResourceMetaData.getProperty(key)); - } - } - - mergeProperties(getResourceMetaData(), overrides); - - String redirect = tmpResourceMetaData.getProperty(META_REDIRECT); - if (redirect == null) { - return url; - } - else { - url = resolveLocation(redirect, getClassLoader(), null); - } - } - } - - public void addTagMappingImport(String aLayerPrefix, HasResourceMetadata aSource) - { - tagMappingImports.put(aLayerPrefix, aSource); - } -} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProviderFactory.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProviderFactory.java deleted file mode 100644 index 70eed16432..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProviderFactory.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -public class MappingProviderFactory -{ - private static final String CONSTITUENT_TAGSET = "constituent.tagset"; - private static final String DEPENDENCY_TAGSET = "dependency.tagset"; - private static final String CHUNK_TAGSET = "chunk.tagset"; - private static final String POS_TAGSET = "pos.tagset"; - - public static MappingProvider createPosMappingProvider(String aMappingLocation, - String aLanguage, HasResourceMetadata aSource) - { - MappingProvider p = createPosMappingProvider(aMappingLocation, null, aLanguage); - p.addImport(POS_TAGSET, aSource); - return p; - } - - public static MappingProvider createPosMappingProvider(String aMappingLocation, String aTagset, - String aLanguage) - { - MappingProvider p = new MappingProvider(); - p.setDefault(MappingProvider.LOCATION, - "classpath:/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/" - + "${language}-${pos.tagset}-pos.map"); - p.setDefault(MappingProvider.BASE_TYPE, - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"); - p.setDefault(POS_TAGSET, "default"); - p.setOverride(MappingProvider.LOCATION, aMappingLocation); - p.setOverride(MappingProvider.LANGUAGE, aLanguage); - p.setOverride(POS_TAGSET, aTagset); - return p; - } - - public static MappingProvider createChunkMappingProvider(String aMappingLocation, - String aLanguage, HasResourceMetadata aSource) - { - MappingProvider p = createChunkMappingProvider(aMappingLocation, null, aLanguage); - p.addImport(CHUNK_TAGSET, aSource); - return p; - } - - public static MappingProvider createChunkMappingProvider(String aMappingLocation, String aTagset, - String aLanguage) - { - MappingProvider chunkMappingProvider = new MappingProvider(); - chunkMappingProvider = new MappingProvider(); - chunkMappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/" - + "dkpro/core/api/syntax/tagset/${language}-${chunk.tagset}-chunk.map"); - chunkMappingProvider.setDefault(MappingProvider.BASE_TYPE, - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk"); - chunkMappingProvider.setDefault(CHUNK_TAGSET, "default"); - chunkMappingProvider.setOverride(MappingProvider.LOCATION, aMappingLocation); - chunkMappingProvider.setOverride(MappingProvider.LANGUAGE, aLanguage); - chunkMappingProvider.setOverride(CHUNK_TAGSET, aTagset); - return chunkMappingProvider; - } - - public static MappingProvider createConstituentMappingProvider(String aMappingLocation, - String aLanguage, HasResourceMetadata aSource) - { - MappingProvider p = createConstituentMappingProvider(aMappingLocation, null, aLanguage); - p.addImport(CONSTITUENT_TAGSET, aSource); - p.addTagMappingImport("constituent", aSource); - return p; - } - - public static MappingProvider createConstituentMappingProvider(String aMappingLocation, - String aTagset, String aLanguage) - { - MappingProvider p = new MappingProvider(); - p.setDefault(MappingProvider.LOCATION, - "classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/" - + "${language}-${constituent.tagset}-constituency.map"); - p.setDefault(MappingProvider.BASE_TYPE, - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent"); - p.setDefault(CONSTITUENT_TAGSET, "default"); - p.setOverride(MappingProvider.LOCATION, aMappingLocation); - p.setOverride(MappingProvider.LANGUAGE, aLanguage); - p.setOverride(CONSTITUENT_TAGSET, aTagset); - return p; - } - - public static MappingProvider createDependencyMappingProvider(String aMappingLocation, - String aLanguage, HasResourceMetadata aSource) - { - MappingProvider p = createDependencyMappingProvider(aMappingLocation, null, aLanguage); - p.addImport(DEPENDENCY_TAGSET, aSource); - return p; - } - - public static MappingProvider createDependencyMappingProvider(String aMappingLocation, - String aTagset, String aLanguage) - { - MappingProvider p = new MappingProvider(); - p.setDefault(MappingProvider.LOCATION, - "classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/" - + "${language}-${dependency.tagset}-dependency.map"); - p.setDefault(MappingProvider.BASE_TYPE, - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"); - p.setDefault(DEPENDENCY_TAGSET, "default"); - p.setOverride(MappingProvider.LOCATION, aMappingLocation); - p.setOverride(MappingProvider.LANGUAGE, aLanguage); - p.setOverride(DEPENDENCY_TAGSET, aTagset); - return p; - } -} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/PlatformDetector.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/PlatformDetector.java deleted file mode 100644 index c5eb5b41dd..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/PlatformDetector.java +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright 2009-2010 - * Richard Eckart de Castilho - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -import java.nio.ByteOrder; - -/** - * Detect platform information and normalize it. - * - */ -public -class PlatformDetector -{ - public static String OS_WINDOWS = "windows"; - public static String OS_OSX = "osx"; - public static String OS_SOLARIS = "solaris"; - public static String OS_LINUX = "linux"; - - public static String ARCH_PPC = "ppc"; - public static String ARCH_X86_32 = "x86_32"; - public static String ARCH_X86_64 = "x86_64"; - public static String ARCH_SPARC = "sparc"; - - private String _arch = ""; - private String _os = ""; - private String _executableSuffix = ""; - private ByteOrder _byteOrder = ByteOrder.nativeOrder(); - private String[] _chmodCmd; - - { - updatePlatform( - System.getProperties().getProperty("os.name"), - System.getProperties().getProperty("os.arch"), - ByteOrder.nativeOrder()); - } - - /** - * Override the operating system name. - * This should only be used in test cases. - * - * @param aOs an OS name as could be found in the os.name system - * property. - */ - public - void setOs( - final String aOs) - { - updatePlatform(aOs, _arch, _byteOrder); - } - - /** - * Get the operating system. - * - * @return {@literal "windows"}, {@literal "osx"}, {@literal "linux"} or - * {@literal "solaris"}. - */ - public - String getOs() - { - return _os; - } - - /** - * Override the architecture. - * This should only be used in test cases. - * - * @param aArch {@literal "big-endian"} for PowerPC or Sparc systems or - * {@literal "little-endian"} for x86 systems. - */ - public - void setArch( - final String aArch) - { - updatePlatform(_os, aArch, _byteOrder); - } - - /** - * Get the platform architecture. - * - * @return {@literal "ppc"}, {@literal "x86_32"}, {@literal "x86_64"} or - * {"amd64"} - */ - public - String getArch() - { - return _arch; - } - - /** - * Set the byte order. TreeTagger models are sensitive to the byte order. - * This should only be used in test cases. - * - * @param aByteOrder the byte order. - */ - public - void setByteOrder( - final ByteOrder aByteOrder) - { - updatePlatform(_os, _arch, aByteOrder); - } - - /** - * Get the file suffix used for executable files on the currently configured - * platform. - * - * @return the file suffix used for executable files. - */ - public - String getExecutableSuffix() - { - return _executableSuffix; - } - - /** - * Get the byte order. - * - * @return the byte order. - */ - public - String getByteOrder() - { - return _byteOrder.toString().replace("_", "-").toLowerCase(); - } - - /** - * Get the platform ID which is {@link #getOs()} and {@link #getArch()} - * separated by a {@literal "-"} (dash). - * - * @return the platform ID. - */ - public - String getPlatformId() - { - return _os+"-"+_arch; - } - - /** - * Updates the platform-specific settings and normalizes them. - * - * @param aOs the operating system string. - * @param aArch the architecture string. - * @param aByteOrder the byte-order string. - */ - public - void updatePlatform( - final String aOs, - final String aArch, - final ByteOrder aByteOrder) - { - _os = aOs.toLowerCase(); - _arch = aArch.toLowerCase(); - String[] chmod = { "chmod", "755", null }; - - // Resolve arch "synonyms" - if ( - _arch.equals("x86") || - _arch.equals("i386") || - _arch.equals("i486") || - _arch.equals("i586") || - _arch.equals("i686") - ) { - _arch = ARCH_X86_32; - } - if ( - _arch.equals("amd64") - ) { - _arch = ARCH_X86_64; - } - if (_arch.equals("powerpc")) { - _arch = ARCH_PPC; - } - - // Resolve name "synonyms" - if (_os.startsWith("windows")) { - _os = OS_WINDOWS; - _executableSuffix = ".exe"; - chmod = null; - } - if (_os.startsWith("mac")) { - _os = OS_OSX; - } - if (_os.startsWith("linux")) { - _os = OS_LINUX; - } - if (_os.startsWith("sun")) { - _os = OS_SOLARIS; - } - - _chmodCmd = chmod; - - _byteOrder = aByteOrder; - } - - /** - * Get the {@literal chmod} (change permissions) command for the current - * platform (if one is necessary). - * - * @return the name of the {@literal chmod} command. - */ - public - String[] getChmodCmd() - { - return _chmodCmd; - } -} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/RuntimeProvider.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/RuntimeProvider.java deleted file mode 100644 index 62f1a2ff97..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/RuntimeProvider.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -import static de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils.resolveLocation; -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.URL; -import java.util.Properties; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.springframework.core.io.UrlResource; -import org.springframework.core.io.support.PropertiesLoaderUtils; - -/** - * Provides temporary installation of binaries from the classpath to the local file system. - */ -public class RuntimeProvider -{ - private Log log = LogFactory.getLog(getClass()); - - public static final String MODE_EXECUTABLE = "executable"; - - private boolean installed; - private File workspace; - - private String baseLocation; - private PlatformDetector platformDetector; - private String platformId; - private Properties manifest; - - public RuntimeProvider(String aBaseLocation) - { - setBaseLocation(aBaseLocation); - platformDetector = new PlatformDetector(); - } - - public void setBaseLocation(String aBaseLocation) - { - baseLocation = aBaseLocation; - } - - public Properties getManifest() throws IOException - { - if (manifest == null) { - String mfl = baseLocation; - if (!mfl.endsWith("/")) { - mfl += "/"; - } - - boolean fallbackTo32Tried = false; - URL manifestUrl = null; - try { - manifestUrl = resolveLocation( - baseLocation + platformDetector.getPlatformId() + "/manifest.properties", - this, null); - platformId = platformDetector.getPlatformId(); - } - catch (FileNotFoundException e) { - // Ok, maybe we try a 32-bit fallback - } - - if (manifestUrl == null - && PlatformDetector.ARCH_X86_64.equals(platformDetector.getArch())) { - fallbackTo32Tried = true; - try { - manifestUrl = resolveLocation(baseLocation + platformDetector.getOs() + "-" - + PlatformDetector.ARCH_X86_32 + "/manifest.properties", this, null); - platformId = platformDetector.getOs() + "-" + PlatformDetector.ARCH_X86_32; - } - catch (FileNotFoundException e) { - // Ok, well, then we will generate an error next. - } - } - - if (manifestUrl == null) { - StringBuilder sb = new StringBuilder(); - sb.append("No files found for [").append(platformDetector.getPlatformId()) - .append("]"); - if (fallbackTo32Tried) { - sb.append(" Also no files for 32bit."); - } - throw new FileNotFoundException(sb.toString()); - } - else if (fallbackTo32Tried && log.isWarnEnabled()) { - log.warn("No binaries found for [" + platformDetector.getPlatformId() + "], using [" - + platformId + "] instead"); - } - - manifest = PropertiesLoaderUtils.loadProperties(new UrlResource(manifestUrl)); - } - return manifest; - } - - public boolean isInstalled() - { - return installed; - } - - public File getFile(String aFilename) throws IOException - { - install(); - File file = new File(getWorkspace(), aFilename); - if (!file.exists()) { - throw new FileNotFoundException("File not found in workspace: ["+aFilename+"]"); - } - return file; - } - - public File getWorkspace() throws IOException - { - if (workspace == null) { - workspace = File.createTempFile("dkpro", "runtime"); - FileUtils.forceDelete(workspace); - FileUtils.forceMkdir(workspace); - workspace.deleteOnExit(); - } - return workspace; - } - - public void install() throws IOException - { - if (installed) { - return; - } - - Properties manifest = getManifest(); - for (String filename : manifest.stringPropertyNames()) { - URL source = resolveLocation(baseLocation + platformId + "/" + filename, this, null); - File target = new File(getWorkspace(), filename); - InputStream is = null; - OutputStream os = null; - try { - is = source.openStream(); - os = new FileOutputStream(target); - IOUtils.copyLarge(is, os); - } - finally { - closeQuietly(is); - closeQuietly(os); - } - - if (MODE_EXECUTABLE.equals(manifest.getProperty(filename))) { - target.setExecutable(true); - } - - target.deleteOnExit(); - } - - installed = true; - } - - public void uninstall() - { - if (workspace != null) { - FileUtils.deleteQuietly(workspace); - workspace = null; - installed = false; - } - } - - @Override - protected void finalize() - throws Throwable - { - uninstall(); - } -} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/package-info.java b/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/package-info.java deleted file mode 100644 index 27e071d365..0000000000 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * API for accessing resources. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CasConfigurableProviderBase.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CasConfigurableProviderBase.java similarity index 96% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CasConfigurableProviderBase.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CasConfigurableProviderBase.java index 2cfc95991e..85b80ca064 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CasConfigurableProviderBase.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CasConfigurableProviderBase.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; import java.io.IOException; import java.util.Properties; diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CasConfigurableStreamProviderBase.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CasConfigurableStreamProviderBase.java new file mode 100644 index 0000000000..282715962a --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CasConfigurableStreamProviderBase.java @@ -0,0 +1,49 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +/** + * CAS-configurable provider produces a resource from a stream instead of an URL. The provider + * implementation does not have to care about opening/closing the stream. + * + * @param + * the type of resource to produce. + */ +public abstract class CasConfigurableStreamProviderBase + extends CasConfigurableProviderBase +{ + @Override + protected M produceResource(URL aUrl) throws IOException + { + try (InputStream is = aUrl.openStream()) { + return produceResource(is); + } + catch (IOException e) { + throw e; + } + catch (Exception e) { + throw new IOException(e); + } + } + + protected abstract M produceResource(InputStream aStream) throws Exception; +} diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CompressionMethod.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CompressionMethod.java new file mode 100644 index 0000000000..e1d6ef9b56 --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CompressionMethod.java @@ -0,0 +1,38 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +/** + * Compression methods. + */ +public enum CompressionMethod +{ + NONE(""), GZIP(".gz"), BZIP2(".bz2"), XZ(".xz"); + + private String extension; + + private CompressionMethod(String aExtension) + { + extension = aExtension; + } + + public String getExtension() + { + return extension; + } +} diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CompressionUtils.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CompressionUtils.java new file mode 100644 index 0000000000..ead8431978 --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/CompressionUtils.java @@ -0,0 +1,126 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +import static org.apache.commons.io.FileUtils.forceMkdir; +import static org.dkpro.core.api.resources.CompressionMethod.BZIP2; +import static org.dkpro.core.api.resources.CompressionMethod.GZIP; +import static org.dkpro.core.api.resources.CompressionMethod.XZ; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; +import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; + +/** + * Utility methods for dealing with compressed data. + * + */ +public class CompressionUtils +{ + public static String stripCompressionExtension(String aLocation) + { + String lcLocation = aLocation.toLowerCase(); + if (lcLocation.endsWith(GZIP.getExtension())) { + return aLocation.substring(0, GZIP.getExtension().length()); + } + else if (lcLocation.endsWith(BZIP2.getExtension()) || lcLocation.endsWith(".bzip2")) { + return aLocation.substring(0, BZIP2.getExtension().length()); + } + else if (lcLocation.endsWith(".bzip2")) { + return aLocation.substring(0, ".bzip2".length()); + } + else if (lcLocation.endsWith(XZ.getExtension())) { + return aLocation.substring(0, XZ.getExtension().length()); + } + else { + return aLocation; + } + } + + /** + * Get an uncompressed input stream for a given input stream created for a particular location. + * + * @param aLocation + * a resource location (e.g. a path, URL, etc.) + * @param aStream + * a raw stream of potentially compressed data. + * @return stream wrapped with a decompressing stream. + * @throws IOException if an I/O error has occurred, + */ + public static InputStream getInputStream(String aLocation, InputStream aStream) + throws IOException + { + String lcLocation = aLocation.toLowerCase(); + if (lcLocation.endsWith(GZIP.getExtension())) { + return new GZIPInputStream(aStream); + } + else if (lcLocation.endsWith(BZIP2.getExtension()) || lcLocation.endsWith(".bzip2")) { + return new BZip2CompressorInputStream(aStream); + } + else if (lcLocation.endsWith(XZ.getExtension())) { + return new XZCompressorInputStream(aStream); + } + else { + return aStream; + } + } + + /** + * Make sure the target directory exists and get a stream writing to the specified file within. + * If the file name ends with a typical extension for compressed files, the stream will be + * compressed. + * + * @param aFile + * the target file. + * @return a stream to write to. + * @throws IOException + * if an I/O error has occurred, + * @see CompressionMethod + */ + public static OutputStream getOutputStream(File aFile) + throws IOException + { + // Create parent folders for output file and set up stream + if (aFile.getParentFile() != null) { + forceMkdir(aFile.getParentFile()); + } + + String lcFilename = aFile.getName().toLowerCase(); + + OutputStream os = new FileOutputStream(aFile); + if (lcFilename.endsWith(GZIP.getExtension())) { + os = new GZIPOutputStream(os); + } + else if (lcFilename.endsWith(BZIP2.getExtension()) || lcFilename.endsWith(".bzip2")) { + os = new BZip2CompressorOutputStream(os); + } + else if (lcFilename.endsWith(XZ.getExtension())) { + os = new XZCompressorOutputStream(os); + } + return os; + } +} diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/DkproContext.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/DkproContext.java new file mode 100644 index 0000000000..67033617ef --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/DkproContext.java @@ -0,0 +1,93 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +import java.io.File; +import java.io.IOException; + +/** + * Runtime context for DKPro. + */ +public class DkproContext +{ + public static final String ENV_DKPRO_HOME = "DKPRO_HOME"; + public static final String DEFAULT_ENCODING = "UTF-8"; + + private static DkproContext context; + + /** + * The the current context. + * + * @return the context. + */ + public static synchronized DkproContext getContext() + { + if (context == null) { + context = new DkproContext(); + } + return context; + } + + /** + * Get the workspace directory. + * + * @return the workspace directory. + * @throws IOException + * if the workspace cannot be obtained + */ + public File getWorkspace() throws IOException + { + if (System.getenv(ENV_DKPRO_HOME) != null) { + File f = new File(System.getenv(ENV_DKPRO_HOME)); + f.mkdirs(); + return f; + } + + throw new IOException("Environment variable [" + ENV_DKPRO_HOME + "] not set"); + } + + /** + * Get the workspace directory for a particular class. + * + * @param aClass + * a class. + * @return the class workspace. + * @throws IOException + * if the workspace cannot be obtained. + */ + public File getWorkspace(final Class aClass) throws IOException + { + return getWorkspace(aClass.getName()); + } + + /** + * Get the workspace directory for a particular topic. + * + * @param aTopic + * the topic ID. + * @return the topic workspace. + * @throws IOException + * if the workspace cannot be obtained. + */ + public File getWorkspace(final String aTopic) throws IOException + { + File f = new File(getWorkspace(), aTopic); + f.mkdirs(); + return f; + } +} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/HasResourceMetadata.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/HasResourceMetadata.java similarity index 90% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/HasResourceMetadata.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/HasResourceMetadata.java index 3d978aede9..55a46da1e6 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/HasResourceMetadata.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/HasResourceMetadata.java @@ -15,15 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; import java.util.Properties; /** * Interface for objects that provide meta data that can be imported by a ResourceObjectProvider. - * */ public interface HasResourceMetadata { - Properties getResourceMetaData(); + Properties getResourceMetaData(); } diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/HasTagsets.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/HasTagsets.java similarity index 84% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/HasTagsets.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/HasTagsets.java index 3824aa5e32..5d5982a06c 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/HasTagsets.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/HasTagsets.java @@ -15,15 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.Tagset; +import org.dkpro.core.api.metadata.Tagset; /** * Interface for objects that provide tagsets. - * */ public interface HasTagsets { - Tagset getTagset(); + Tagset getTagset(); } diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/LittleEndianDataInputStream.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/LittleEndianDataInputStream.java similarity index 94% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/LittleEndianDataInputStream.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/LittleEndianDataInputStream.java index 870b27d9f8..42a8d29a95 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/LittleEndianDataInputStream.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/LittleEndianDataInputStream.java @@ -29,7 +29,7 @@ * Contributors: * Peter Franza - initial implementation */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; import java.io.DataInput; import java.io.DataInputStream; @@ -46,7 +46,8 @@ public class LittleEndianDataInputStream // to get at the low-level read methods of InputStream private InputStream in; - private byte w[]; // work array for buffering input + // work array for buffering input + private byte[] w; public LittleEndianDataInputStream(final InputStream aIn) { @@ -131,21 +132,21 @@ public final double readDouble() } @Override - public final int read(final byte b[], final int off, final int len) + public final int read(final byte[] b, final int off, final int len) throws IOException { return in.read(b, off, len); } @Override - public final void readFully(final byte b[]) + public final void readFully(final byte[] b) throws IOException { d.readFully(b, 0, b.length); } @Override - public final void readFully(final byte b[], final int off, final int len) + public final void readFully(final byte[] b, final int off, final int len) throws IOException { d.readFully(b, off, len); @@ -207,4 +208,4 @@ public final void close() { d.close(); } -} \ No newline at end of file +} diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingProvider.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingProvider.java new file mode 100644 index 0000000000..49f6160557 --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingProvider.java @@ -0,0 +1,241 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +import static org.dkpro.core.api.resources.MappingUtils.META_OVERRIDE; +import static org.dkpro.core.api.resources.MappingUtils.META_REDIRECT; +import static org.dkpro.core.api.resources.MappingUtils.META_TYPE_BASE; +import static org.dkpro.core.api.resources.ResourceUtils.resolveLocation; + +import java.io.IOException; +import java.net.URL; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Properties; +import java.util.Set; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.springframework.core.io.UrlResource; +import org.springframework.core.io.support.PropertiesLoaderUtils; + +public class MappingProvider extends CasConfigurableProviderBase> +{ + /** + * Flag indicating whether the entire mapping mechanism should be skipped and the default base + * type should always be applied. + */ + public static final String MAPPING_ENABLED = "mappingEnabled"; + + public static final String BASE_TYPE = "baseType"; + + private TypeSystem typeSystem; + private boolean notFound = false; + + private Map tagMappings; + + private Map tagMappingImports = new HashMap<>(); + + @Override + protected void init() + { + super.init(); + + // Mappings are expected to be provided by the components, not to be resolved from + // model artifacts. + setDefault(GROUP_ID, null); + setDefault(ARTIFACT_URI, null); + } + + @Override + public void configure(CAS aCas) throws AnalysisEngineProcessException + { + typeSystem = aCas.getTypeSystem(); + + // If mapping is disabled, then we simply skip the loading of the mapping and pretend we + // didn't find a mapping. + if ("false".equalsIgnoreCase(getOverride(MAPPING_ENABLED))) { + notFound = true; + return; + } + + // Tag mappings can exist independently from the type mappings because tag mappings + // are configured in the model metadata + tagMappings = new HashMap<>(); + for (Entry imp : tagMappingImports.entrySet()) { + String prefix = imp.getKey() + ".tag.map."; + Properties props = imp.getValue().getResourceMetaData(); + for (String key : props.stringPropertyNames()) { + if (key.startsWith(prefix)) { + String originalTag = key.substring(prefix.length()); + String mappedTag = props.getProperty(key); + tagMappings.put(originalTag, mappedTag); + } + } + } + + // Try loading the type mappings + try { + notFound = false; + super.configure(aCas); + } + catch (AnalysisEngineProcessException e) { + if (getOverride(LOCATION) != null) { + throw e; + } + notFound = true; + } + } + + public String getTag(String aTag) + { + String tag = aTag; + + // Apply tag mapping if configured + if (tagMappings != null) { + String t = tagMappings.get(aTag); + if (t != null) { + tag = t; + } + } + + return tag; + } + + /** + * Get the type for the given tag. + * + * @param aTag + * a tag. + * @return the type + * @throws IllegalStateException + * if the type could not be located + */ + public Type getTagType(String aTag) + { + String type = getTagTypeName(aTag); + + Type uimaType = typeSystem.getType(type); + + if (uimaType == null) { + throw new IllegalStateException("Type [" + type + "] mapped to tag [" + aTag + + "] is not defined in type system"); + } + + return uimaType; + } + + /** + * Get the type for the given tag. + * + * @param aTag + * a tag. + * @return the type + * @throws IllegalStateException + * if the type could not be located + */ + public String getTagTypeName(String aTag) + { + String type; + if (notFound) { + type = getDefault(BASE_TYPE); + if (type == null) { + throw new IllegalStateException("No base type defined!"); + } + } + else { + String tag = getTag(aTag); + + type = getResource().get(tag); + if (type == null) { + type = getResource().get("*"); + } + if (type == null) { + throw new IllegalStateException("No fallback (*) mapping defined!"); + } + + String basePackage = getResource().get(META_TYPE_BASE); + if (basePackage != null) { + type = basePackage + type; + } + } + + return type; + } + + public Set getTags() + { + return MappingUtils.stripMetadata(getResource().keySet()); + } + + @Override + protected Map produceResource(URL aUrl) throws IOException + { + if (aUrl != null) { + Map mapping = new HashMap(); + Properties props = PropertiesLoaderUtils.loadProperties(new UrlResource(aUrl)); + for (String key : props.stringPropertyNames()) { + mapping.put(key.trim(), props.getProperty(key).trim()); + } + return mapping; + } + else { + return null; + } + } + + @Override + protected URL followRedirects(URL aUrl) throws IOException + { + URL url = aUrl; + while (true) { + Properties tmpResourceMetaData = PropertiesLoaderUtils.loadProperties(new UrlResource( + url)); + + // Values in the redirecting properties override values in the redirected-to + // properties - except META_REDIRECT + getResourceMetaData().remove(META_REDIRECT); + + Properties overrides = new Properties(); + for (String key : tmpResourceMetaData.stringPropertyNames()) { + if (key.startsWith(META_OVERRIDE)) { + overrides.put(key.substring(META_OVERRIDE.length() + 1), + tmpResourceMetaData.getProperty(key)); + } + } + + mergeProperties(getResourceMetaData(), overrides); + + String redirect = tmpResourceMetaData.getProperty(META_REDIRECT); + if (redirect == null) { + return url; + } + else { + url = resolveLocation(redirect, getClassLoader(), null); + } + } + } + + public void addTagMappingImport(String aLayerPrefix, HasResourceMetadata aSource) + { + tagMappingImports.put(aLayerPrefix, aSource); + } +} diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingProviderFactory.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingProviderFactory.java new file mode 100644 index 0000000000..d8c6f7d45a --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingProviderFactory.java @@ -0,0 +1,186 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +import static org.dkpro.core.api.parameter.ComponentParameters.PARAM_MAPPING_ENABLED; +import static org.dkpro.core.api.resources.MappingProvider.BASE_TYPE; +import static org.dkpro.core.api.resources.MappingProvider.MAPPING_ENABLED; +import static org.dkpro.core.api.resources.ResourceObjectProviderBase.LANGUAGE; +import static org.dkpro.core.api.resources.ResourceObjectProviderBase.LOCATION; +import static org.dkpro.core.api.resources.ResourceObjectProviderBase.VARIANT; + +public class MappingProviderFactory +{ + private static final String CONSTITUENT_TAGSET = "constituent.tagset"; + private static final String DEPENDENCY_TAGSET = "dependency.tagset"; + private static final String CHUNK_TAGSET = "chunk.tagset"; + private static final String POS_TAGSET = "pos.tagset"; + + public static MappingProvider createPosMappingProvider(Object aContextObject, + String aMappingLocation, String aLanguage, HasResourceMetadata aSource) + { + MappingProvider p = createPosMappingProvider(aContextObject, aMappingLocation, null, + aLanguage); + p.addImport(POS_TAGSET, aSource); + return p; + } + + public static MappingProvider createPosMappingProvider(Object aContextObject, + String aMappingLocation, String aTagset, String aLanguage) + { + MappingProvider p = new MappingProvider(); + p.setDefault(LOCATION, "classpath:/org/dkpro/core/api/lexmorph/tagset/${language}-${pos.tagset}-pos.map"); + p.setDefault(BASE_TYPE, "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"); + p.setDefault(POS_TAGSET, "default"); + p.setOverride(LOCATION, aMappingLocation); + p.setOverride(LANGUAGE, aLanguage); + p.setOverride(POS_TAGSET, aTagset); + + if (aContextObject != null) { + p.setContextObject(aContextObject); + p.addAutoOverride(PARAM_MAPPING_ENABLED, MAPPING_ENABLED); + p.applyAutoOverrides(aContextObject); + } + + return p; + } + + public static MappingProvider createChunkMappingProvider(Object aContextObject, + String aMappingLocation, String aLanguage, HasResourceMetadata aSource) + { + MappingProvider p = createChunkMappingProvider(aContextObject, aMappingLocation, null, + aLanguage); + p.addImport(CHUNK_TAGSET, aSource); + return p; + } + + public static MappingProvider createChunkMappingProvider(Object aContextObject, + String aMappingLocation, String aTagset, String aLanguage) + { + MappingProvider p = new MappingProvider(); + p = new MappingProvider(); + p.setDefault(LOCATION, + "classpath:/org/dkpro/core/api/syntax/tagset/${language}-${chunk.tagset}-chunk.map"); + p.setDefault(BASE_TYPE, "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk"); + p.setDefault(CHUNK_TAGSET, "default"); + p.setOverride(LOCATION, aMappingLocation); + p.setOverride(LANGUAGE, aLanguage); + p.setOverride(CHUNK_TAGSET, aTagset); + + if (aContextObject != null) { + p.setContextObject(aContextObject); + p.addAutoOverride(PARAM_MAPPING_ENABLED, MAPPING_ENABLED); + p.applyAutoOverrides(aContextObject); + } + + return p; + } + + public static MappingProvider createConstituentMappingProvider(Object aContextObject, + String aMappingLocation, String aLanguage, HasResourceMetadata aSource) + { + MappingProvider p = createConstituentMappingProvider(aContextObject, aMappingLocation, null, + aLanguage); + p.addImport(CONSTITUENT_TAGSET, aSource); + p.addTagMappingImport("constituent", aSource); + return p; + } + + public static MappingProvider createConstituentMappingProvider(Object aContextObject, + String aMappingLocation, String aTagset, String aLanguage) + { + MappingProvider p = new MappingProvider(); + p.setDefault(LOCATION, "classpath:/org/dkpro/core/api/syntax/tagset/" + + "${language}-${constituent.tagset}-constituency.map"); + p.setDefault(MappingProvider.BASE_TYPE, + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent"); + p.setDefault(CONSTITUENT_TAGSET, "default"); + p.setOverride(LOCATION, aMappingLocation); + p.setOverride(LANGUAGE, aLanguage); + p.setOverride(CONSTITUENT_TAGSET, aTagset); + + if (aContextObject != null) { + p.addAutoOverride(PARAM_MAPPING_ENABLED, MAPPING_ENABLED); + p.applyAutoOverrides(aContextObject); + } + + return p; + } + + public static MappingProvider createDependencyMappingProvider(Object aContextObject, + String aMappingLocation, String aLanguage, HasResourceMetadata aSource) + { + MappingProvider p = createDependencyMappingProvider(aContextObject, aMappingLocation, null, + aLanguage); + p.addImport(DEPENDENCY_TAGSET, aSource); + return p; + } + + public static MappingProvider createDependencyMappingProvider(Object aContextObject, + String aMappingLocation, String aTagset, String aLanguage) + { + MappingProvider p = new MappingProvider(); + p.setDefault(LOCATION, "classpath:/org/dkpro/core/api/syntax/tagset/" + + "${language}-${dependency.tagset}-dependency.map"); + p.setDefault(BASE_TYPE, + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"); + p.setDefault(DEPENDENCY_TAGSET, "default"); + p.setOverride(LOCATION, aMappingLocation); + p.setOverride(LANGUAGE, aLanguage); + p.setOverride(DEPENDENCY_TAGSET, aTagset); + + if (aContextObject != null) { + p.setContextObject(aContextObject); + p.addAutoOverride(PARAM_MAPPING_ENABLED, MAPPING_ENABLED); + p.applyAutoOverrides(aContextObject); + } + + return p; + } + + public static MappingProvider createNerMappingProvider(Object aContextObject, + String aMappingLocation, String aLanguage, String aVariant, HasResourceMetadata aSource) + { + MappingProvider p = createNerMappingProvider(aContextObject, aMappingLocation, aLanguage, + aVariant); + + p.addTagMappingImport("ner", aSource); + + return p; + } + + public static MappingProvider createNerMappingProvider(Object aContextObject, + String aMappingLocation, String aLanguage, String aVariant) + { + MappingProvider p = new MappingProvider(); + p.setDefaultVariantsLocation("${package}/lib/ner-default-variants.map"); + p.setDefault(LOCATION, "classpath:/${package}/lib/ner-${language}-${variant}.map"); + p.setDefault(BASE_TYPE, "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"); + p.setOverride(LOCATION, aMappingLocation); + p.setOverride(LANGUAGE, aLanguage); + p.setOverride(VARIANT, aVariant); + + if (aContextObject != null) { + p.setContextObject(aContextObject); + p.addAutoOverride(PARAM_MAPPING_ENABLED, MAPPING_ENABLED); + p.applyAutoOverrides(aContextObject); + } + + return p; + } +} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingUtils.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingUtils.java similarity index 96% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingUtils.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingUtils.java index 5c7de9854e..d784eb5917 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingUtils.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/MappingUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; import java.util.ArrayList; import java.util.LinkedHashSet; diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ModelProviderBase.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ModelProviderBase.java similarity index 92% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ModelProviderBase.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ModelProviderBase.java index 8de9f7d344..c2da048442 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ModelProviderBase.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ModelProviderBase.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; import java.io.IOException; import java.io.InputStream; @@ -32,13 +32,13 @@ import org.apache.uima.cas.CASException; import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.metadata.AggregateTagset; +import org.dkpro.core.api.metadata.Tagset; +import org.dkpro.core.api.metadata.TagsetMetaData; +import org.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.AggregateTagset; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.Tagset; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.TagsetMetaData; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; public class ModelProviderBase extends CasConfigurableStreamProviderBase @@ -67,10 +67,11 @@ public ModelProviderBase(Object aObject, String aShortName, String aType) setDefault(ARTIFACT_ID, "${groupId}." + aShortName + "-model-" + aType + "-${language}-${variant}"); setDefault(LOCATION, - "classpath:/${package}/lib/"+aType+"-${language}-${variant}.properties"); - setDefaultVariantsLocation("${package}/lib/"+aType+"-default-variants.map"); + "classpath:/${package}/lib/" + aType + "-${language}-${variant}.properties"); + setDefaultVariantsLocation("${package}/lib/" + aType + "-default-variants.map"); setDefault(VARIANT, "default"); + addAutoOverride(ComponentParameters.PARAM_MODEL_ARTIFACT_URI, ARTIFACT_URI); addAutoOverride(ComponentParameters.PARAM_MODEL_LOCATION, LOCATION); addAutoOverride(ComponentParameters.PARAM_VARIANT, VARIANT); addAutoOverride(ComponentParameters.PARAM_LANGUAGE, LANGUAGE); diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/PlatformDetector.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/PlatformDetector.java new file mode 100644 index 0000000000..073ed7612d --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/PlatformDetector.java @@ -0,0 +1,191 @@ +/* + * Copyright 2009-2010 + * Richard Eckart de Castilho + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +import java.nio.ByteOrder; + +/** + * Detect platform information and normalize it. + */ +public class PlatformDetector +{ + public static String OS_WINDOWS = "windows"; + public static String OS_OSX = "osx"; + public static String OS_SOLARIS = "solaris"; + public static String OS_LINUX = "linux"; + + public static String ARCH_PPC = "ppc"; + public static String ARCH_X86_32 = "x86_32"; + public static String ARCH_X86_64 = "x86_64"; + public static String ARCH_SPARC = "sparc"; + + private String _arch = ""; + private String _os = ""; + private String _executableSuffix = ""; + private ByteOrder _byteOrder = ByteOrder.nativeOrder(); + private String[] _chmodCmd; + + { + updatePlatform( + System.getProperties().getProperty("os.name"), + System.getProperties().getProperty("os.arch"), + ByteOrder.nativeOrder()); + } + + /** + * Override the operating system name. This should only be used in test cases. + * + * @param aOs + * an OS name as could be found in the os.name system property. + */ + public void setOs(final String aOs) + { + updatePlatform(aOs, _arch, _byteOrder); + } + + /** + * Get the operating system. + * + * @return {@literal "windows"}, {@literal "osx"}, {@literal "linux"} or {@literal "solaris"}. + */ + public String getOs() + { + return _os; + } + + /** + * Override the architecture. This should only be used in test cases. + * + * @param aArch + * {@literal "big-endian"} for PowerPC or Sparc systems or {@literal "little-endian"} + * for x86 systems. + */ + public void setArch(final String aArch) + { + updatePlatform(_os, aArch, _byteOrder); + } + + /** + * Get the platform architecture. + * + * @return {@literal "ppc"}, {@literal "x86_32"}, {@literal "x86_64"} or {"amd64"} + */ + public String getArch() + { + return _arch; + } + + /** + * Set the byte order. TreeTagger models are sensitive to the byte order. This should only be + * used in test cases. + * + * @param aByteOrder + * the byte order. + */ + public void setByteOrder(final ByteOrder aByteOrder) + { + updatePlatform(_os, _arch, aByteOrder); + } + + /** + * Get the file suffix used for executable files on the currently configured platform. + * + * @return the file suffix used for executable files. + */ + public String getExecutableSuffix() + { + return _executableSuffix; + } + + /** + * Get the byte order. + * + * @return the byte order. + */ + public String getByteOrder() + { + return _byteOrder.toString().replace("_", "-").toLowerCase(); + } + + /** + * Get the platform ID which is {@link #getOs()} and {@link #getArch()} separated by a + * {@literal "-"} (dash). + * + * @return the platform ID. + */ + public String getPlatformId() + { + return _os + "-" + _arch; + } + + /** + * Updates the platform-specific settings and normalizes them. + * + * @param aOs the operating system string. + * @param aArch the architecture string. + * @param aByteOrder the byte-order string. + */ + public void updatePlatform(final String aOs, final String aArch, final ByteOrder aByteOrder) + { + _os = aOs.toLowerCase(); + _arch = aArch.toLowerCase(); + String[] chmod = { "chmod", "755", null }; + + // Resolve arch "synonyms" + if (_arch.equals("x86") || _arch.equals("i386") || _arch.equals("i486") + || _arch.equals("i586") || _arch.equals("i686")) { + _arch = ARCH_X86_32; + } + if (_arch.equals("amd64")) { + _arch = ARCH_X86_64; + } + if (_arch.equals("powerpc")) { + _arch = ARCH_PPC; + } + + // Resolve name "synonyms" + if (_os.startsWith("windows")) { + _os = OS_WINDOWS; + _executableSuffix = ".exe"; + chmod = null; + } + if (_os.startsWith("mac")) { + _os = OS_OSX; + } + if (_os.startsWith("linux")) { + _os = OS_LINUX; + } + if (_os.startsWith("sun")) { + _os = OS_SOLARIS; + } + + _chmodCmd = chmod; + + _byteOrder = aByteOrder; + } + + /** + * Get the {@literal chmod} (change permissions) command for the current platform (if one is + * necessary). + * + * @return the name of the {@literal chmod} command. + */ + public String[] getChmodCmd() + { + return _chmodCmd; + } +} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceMetadata.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceMetadata.java similarity index 98% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceMetadata.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceMetadata.java index edac13716d..5fd166beed 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceMetadata.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceMetadata.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; public class ResourceMetadata { diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceObjectProviderBase.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceObjectProviderBase.java similarity index 75% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceObjectProviderBase.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceObjectProviderBase.java index d7127bfa35..95bf3c80e6 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceObjectProviderBase.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceObjectProviderBase.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; -import static de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils.resolveLocation; +import static java.util.Objects.isNull; +import static org.apache.commons.lang3.StringUtils.isBlank; +import static org.dkpro.core.api.resources.ResourceUtils.resolveLocation; import java.io.File; import java.io.FileNotFoundException; @@ -38,10 +40,11 @@ import java.util.Set; import java.util.UUID; import java.util.WeakHashMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.commons.lang3.reflect.FieldUtils; import org.apache.commons.lang3.time.StopWatch; import org.apache.commons.logging.Log; @@ -65,14 +68,14 @@ import org.apache.uima.fit.factory.ConfigurationParameterFactory; import org.apache.uima.fit.internal.ReflectionUtil; import org.codehaus.plexus.util.xml.pull.XmlPullParserException; +import org.dkpro.core.api.resources.internal.ApacheCommonsLoggingAdapter; import org.springframework.core.io.Resource; import org.springframework.core.io.UrlResource; import org.springframework.core.io.support.PathMatchingResourcePatternResolver; import org.springframework.core.io.support.PropertiesLoaderUtils; +import org.springframework.core.io.support.ResourcePatternResolver; import org.springframework.util.PropertyPlaceholderHelper; -import de.tudarmstadt.ukp.dkpro.core.api.resources.internal.ApacheCommonsLoggingAdapter; - /** * Base class for resource providers that produce a resource from some URL depending on changing * parameters such as language. @@ -139,6 +142,7 @@ public abstract class ResourceObjectProviderBase * resolved when {@link #configure()} is called. (optional) */ public static final String GROUP_ID = "groupId"; + public static final String COMPONENT_GROUP_ID = "componentGroupId"; /** * The artifact ID of the Maven artifact containing a resource. Variables in the location are @@ -146,6 +150,12 @@ public abstract class ResourceObjectProviderBase */ public static final String ARTIFACT_ID = "artifactId"; + /** + * A URI pointing to the artifact. Currently, this URI expected to be in the format + * {@code mvn:${groupId}:${artifactId}:${version}}. + */ + public static final String ARTIFACT_URI = "artifactUri"; + /** * The version of the Maven artifact containing a resource. Variables in the location are * resolved when {@link #configure()} is called. (optional) @@ -203,6 +213,9 @@ public abstract class ResourceObjectProviderBase protected void init() { setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(COMPONENT_GROUP_ID, "org.dkpro.core"); + setDefault(ARTIFACT_URI, + "mvn:${" + GROUP_ID + "}:${" + ARTIFACT_ID + "}:${" + VERSION + "}"); } public void setOverride(String aKey, String aValue) @@ -306,7 +319,7 @@ public void setContextObject(Object aObject) // finds the already-loaded model and simply reuses it. Sharing models can also lead to // unexpected results or crashes in multi-threaded environments. // Allowed values: "true" and "false" - String key = "dkpro.core.resourceprovider.sharable."+aObject.getClass().getName(); + String key = "dkpro.core.resourceprovider.sharable." + aObject.getClass().getName(); if (System.getProperty(key) != null) { setDefault(SHARABLE, System.getProperty(key)); } @@ -352,7 +365,8 @@ public void applyAutoOverrides(Object aObject) if (property != null) { try { - setOverride(property, (String) FieldUtils.readField(field, aObject, true)); + Object value = FieldUtils.readField(field, aObject, true); + setOverride(property, value != null ? value.toString() : null); } catch (IllegalAccessException e) { throw new IllegalStateException(e); @@ -362,37 +376,21 @@ public void applyAutoOverrides(Object aObject) } } - /** - * Tries to get the version of the required model from the dependency management section of the - * Maven POM belonging to the context object. - * - * @throws IOException - * if there was a problem loading the POM file - * @throws FileNotFoundException - * if no POM could be found - * @throws IllegalStateException - * if more than one POM was found, if the version information could not be found in - * the POM, or if no context object was set. - * @return the version of the required model. - */ - protected String getModelVersionFromMavenPom() + protected List getPomUrlsForClass(String aComponentGroupId, String aModelArtifactId, + Class aClass) throws IOException { - if (contextClass == null) { - throw new IllegalStateException("No context class specified"); + if (aClass == null) { + throw new IllegalArgumentException("No context class specified"); } - - // Get the properties and resolve the artifact coordinates - Properties props = getAggregatedProperties(); - String modelArtifact = pph.replacePlaceholders(props.getProperty(ARTIFACT_ID), props); - String modelGroup = pph.replacePlaceholders(props.getProperty(GROUP_ID), props); - + // Try to determine the location of the POM file belonging to the context object URL url = contextClass.getResource(contextClass.getSimpleName() + ".class"); String classPart = contextClass.getName().replace(".", "/") + ".class"; String base = url.toString(); base = base.substring(0, base.length() - classPart.length()); + List lookupPatterns = new ArrayList<>(); List urls = new LinkedList(); String extraNotFoundInfo = ""; @@ -414,28 +412,72 @@ protected String getModelVersionFromMavenPom() } } + // If the class is in a JAR (that should be the normal case), try deriving the + // POM location from the JAR file name. + if (urls.isEmpty()) { + Pattern pattern = Pattern.compile( + ".*/(?([a-zA-Z0-9-_]+\\.)*[a-zA-Z0-9-_]+)-([0-9]+\\.)*[0-9]+(-[a-zA-Z]+)?\\.jar!/.*"); + Matcher matcher = pattern.matcher(base); + if (matcher.matches()) { + String artifactIdAndVersion = matcher.group("ID"); + String pomPattern = base + "META-INF/maven/" + aComponentGroupId + "/" + + artifactIdAndVersion + "/pom.xml"; + lookupPatterns.add(pomPattern); + ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); + Resource[] resources = resolver.getResources(pomPattern); + for (Resource r : resources) { + urls.add(r.getURL()); + } + } + } + + // Legacy lookup strategy deriving the POM location from the model artifact ID. This + // fails if a module is re-using models from another module (e.g. CoreNLP re-using + // models from the StanfordNLP module). if (urls.isEmpty()) { // This is the default strategy supposed to look in the JAR - String moduleArtifactId = modelArtifact.split("-")[0]; - String pomPattern = base + "META-INF/maven/" + modelGroup + "/" + moduleArtifactId + - "*/pom.xml"; - PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); + String moduleArtifactId = aModelArtifactId.split("-")[0]; + String pomPattern = base + "META-INF/maven/" + aComponentGroupId + "/" + + moduleArtifactId + "*/pom.xml"; + lookupPatterns.add(pomPattern); + ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); Resource[] resources = resolver.getResources(pomPattern); - // Bail out if no POM was found - if (resources.length == 0) { - throw new FileNotFoundException("No POM file found using [" + pomPattern + "]" - + extraNotFoundInfo); + for (Resource r : resources) { + urls.add(r.getURL()); } - - for(Resource resource : resources){ - urls.add(resource.getURL()); - } - } + + // Bail out if no POM was found + if (urls.isEmpty()) { + throw new FileNotFoundException("No POM file found using the patterns " + lookupPatterns + + ". " + extraNotFoundInfo); + } + + return urls; + } + + /** + * Tries to get the version of the required model from the dependency management section of the + * Maven POM belonging to the context object. + * + * @throws IOException + * if there was a problem loading the POM file + * @throws FileNotFoundException + * if no POM could be found + * @throws IllegalStateException + * if more than one POM was found, if the version information could not be found in + * the POM, or if no context object was set. + * @return the version of the required model. + */ + protected String getModelVersionFromMavenPom(String aComponentGroupId, String aModelGroupId, + String aModelArtifactId, Class aClass) + throws IOException + { + List urls = getPomUrlsForClass(aComponentGroupId, aModelArtifactId, contextClass); - for(URL pomUrl : urls){ - // Parser the POM + for (URL pomUrl : urls) { + // Parse the POM Model model; try { MavenXpp3Reader reader = new MavenXpp3Reader(); @@ -451,8 +493,10 @@ protected String getModelVersionFromMavenPom() && (model.getDependencyManagement().getDependencies() != null)) { List deps = model.getDependencyManagement().getDependencies(); for (Dependency dep : deps) { - if (StringUtils.equals(dep.getGroupId(), modelGroup) - && StringUtils.equals(dep.getArtifactId(), modelArtifact)) { + if ( + StringUtils.equals(dep.getGroupId(), aModelGroupId) && + StringUtils.equals(dep.getArtifactId(), aModelArtifactId) + ) { return dep.getVersion(); } } @@ -525,7 +569,7 @@ public void configure() resourceUrl = null; initialResourceUrl = null; if (modelLocationChanged) { - log.info("Producing resource from thin air"); + log.debug("Producing resource from thin air"); loadResource(props); } } @@ -540,20 +584,13 @@ public void configure() catch (IOException e) { if (modelLocationChanged) { // Try resolving the dependency and adding the stuff to the loader + Properties resolved = props; try { - resolveDependency(props); - } - catch (Throwable re) { - // Ignore - if we cannot resolve, we cannot resolve. Re-throw the - // original exception - throw handleResolvingError(e, lastModelLocation, props); - } - - try { + resolved = resolveDependency(props); initialUrl = resolveLocation(modelLocation, loader, null); } catch (Throwable re) { - throw handleResolvingError(e, lastModelLocation, props); + throw handleResolvingError(re, lastModelLocation, resolved); } } else { @@ -565,15 +602,24 @@ public void configure() initialResourceUrl = initialUrl; resourceMetaData = new Properties(); resourceUrl = followRedirects(initialResourceUrl); - loadMetadata(); - if (initialResourceUrl.equals(resourceUrl)) { - log.info("Producing resource from " + resourceUrl); - } + if (resourceUrl == null) { + initialResourceUrl = null; + if (modelLocationChanged) { + log.debug("Producing resource from thin air"); + loadResource(props); + } + } else { - log.info("Producing resource from [" + resourceUrl + "] redirected from [" + initialResourceUrl - + "]"); + loadMetadata(); + if (initialResourceUrl.equals(resourceUrl)) { + log.info("Producing resource from [" + resourceUrl + "]"); + } + else { + log.info("Producing resource from [" + resourceUrl + "] redirected from [" + + initialResourceUrl + "]"); + } + loadResource(props); } - loadResource(props); } } success = true; @@ -605,7 +651,8 @@ protected URL followRedirects(URL aUrl) throws IOException // If the model points to a properties file, try to find a new location in that // file. If that points to a properties file again, repeat the process. - while (url.getPath().endsWith(".properties")) { + // If at some point the location is marked as not required return null. + while (url != null && url.getPath().endsWith(".properties")) { Properties tmpResourceMetaData = PropertiesLoaderUtils.loadProperties(new UrlResource( url)); @@ -618,8 +665,13 @@ protected URL followRedirects(URL aUrl) throws IOException if (redirect == null) { throw new IOException("Model URL resolves to properties at [" + url + "] but no redirect property [" + LOCATION + "] found there."); + } + else if (redirect.startsWith(NOT_REQUIRED)) { + url = null; + } + else { + url = resolveLocation(redirect, loader, null); } - url = resolveLocation(redirect, loader, null); } return url; @@ -709,7 +761,7 @@ protected synchronized void loadResource(Properties aProperties) throws IOExcept sw.start(); resource = produceResource(resourceUrl); sw.stop(); - log.info("Producing resource took " + sw.getTime() + "ms"); + log.trace("Producing resource took " + sw.getTime() + "ms"); // If cache is enabled, update the cache if (sharable) { @@ -728,38 +780,73 @@ protected synchronized void loadResource(Properties aProperties) throws IOExcept * @throws IOException if dependencies cannot be resolved. * @throws IllegalStateException if */ - private void resolveDependency(Properties aProps) + private Properties resolveDependency(Properties aProps) throws IOException, IllegalStateException { - Set names = aProps.stringPropertyNames(); - if (names.contains(ARTIFACT_ID) && names.contains(GROUP_ID)) { - String artifactId = pph.replacePlaceholders(aProps.getProperty(ARTIFACT_ID), aProps); - String groupId = pph.replacePlaceholders(aProps.getProperty(GROUP_ID), aProps); - String version = pph.replacePlaceholders(aProps.getProperty(VERSION, ""), aProps); - // Try getting better information about the model version. - try { - version = getModelVersionFromMavenPom(); + String artifactUri = null; + + Properties resolved = new Properties(aProps); + + // Try to get model version from POM if it has not been set explicitly yet + if ( + resolved.getProperty(ARTIFACT_URI, "").contains("${" + VERSION + "}") && + isNull(resolved.getProperty(VERSION)) + ) { + String modelGroupId = pph.replacePlaceholders(aProps.getProperty(GROUP_ID), resolved); + String componentGroupId; + + if (aProps.getProperty(COMPONENT_GROUP_ID) != null) { + componentGroupId = pph.replacePlaceholders(aProps.getProperty(COMPONENT_GROUP_ID), + resolved); } - catch (IOException e) { - // Ignore - this will be tried and reported again later by handleResolvingError + else { + componentGroupId = modelGroupId; } - catch (IllegalStateException e) { + + String artifactId = pph.replacePlaceholders(aProps.getProperty(ARTIFACT_ID), resolved); + try { + // If the version is to be auto-detected, then we must have a groupId and artifactId + resolved.put(VERSION, getModelVersionFromMavenPom(componentGroupId, modelGroupId, + artifactId, contextClass)); + } + catch (Throwable e) { + log.error("Unable to obtain version from POM", e); // Ignore - this will be tried and reported again later by handleResolvingError } + } - // Register files with loader - try { - List files = resolveWithIvy(groupId, artifactId, version); - for (File file : files) { - loader.addURL(file.toURI().toURL()); + // Fetch the artifact URI from the properties + Set names = aProps.stringPropertyNames(); + if (names.contains(ARTIFACT_URI)) { + artifactUri = pph.replacePlaceholders(aProps.getProperty(ARTIFACT_URI), resolved); + } + + // Register files with loader + if (artifactUri != null) { + if (artifactUri.startsWith("mvn:")) { + try { + String[] parts = artifactUri.split(":"); + String groupId = parts[1]; + String artifactId = parts[2]; + String version = parts[3]; + + List files = resolveWithIvy(groupId, artifactId, version); + for (File file : files) { + loader.addURL(file.toURI().toURL()); + } + } + catch (ParseException e) { + throw new IllegalStateException(e); } } - catch (ParseException e) { - throw new IllegalStateException(e); + else { + throw new IOException("Unknown URI format: [" + artifactUri + "]"); } } + + return resolved; } - + protected DependencyResolver getModelResolver() { IBiblioResolver ukpModels = new IBiblioResolver(); @@ -858,63 +945,52 @@ private IOException handleResolvingError(Throwable aCause, String aLocation, Pro StringBuilder sb = new StringBuilder(); Set names = aProps.stringPropertyNames(); - if (names.contains(ARTIFACT_ID) && names.contains(GROUP_ID)) { + if (!aProps.getProperty(ARTIFACT_URI, "").contains("$")) { + sb.append("Unable to load the model from the artifact [" + + aProps.getProperty(ARTIFACT_URI) + "]"); + } + else if (names.contains(ARTIFACT_ID) && names.contains(GROUP_ID)) { + // Fetch the groupdId/artifactId/version from the properties String artifactId = pph.replacePlaceholders(aProps.getProperty(ARTIFACT_ID), aProps); String groupId = pph.replacePlaceholders(aProps.getProperty(GROUP_ID), aProps); String version = pph.replacePlaceholders(aProps.getProperty(VERSION, ""), aProps); - - // Try getting better information about the model version. - String extraErrorInfo = ""; - try { - version = getModelVersionFromMavenPom(); - } - catch (IOException ex) { - extraErrorInfo = ExceptionUtils.getRootCauseMessage(ex); - } - catch (IllegalStateException ex) { - extraErrorInfo = ExceptionUtils.getRootCauseMessage(ex); - } - - // Tell user how to add model dependency - sb.append("\nPlease make sure that [").append(artifactId).append(']'); - if (StringUtils.isNotBlank(version)) { - sb.append(" version [").append(version).append(']'); - } - - sb.append(" is on the classpath.\n"); - - if (StringUtils.isNotBlank(version)) { - sb.append("If the version ").append( - "shown here is not available, try a recent version.\n"); - sb.append('\n'); - sb.append("If you are using Maven, add the following dependency to your pom.xml file:\n"); - sb.append('\n'); - sb.append("\n"); - sb.append(" ").append(groupId).append("\n"); - sb.append(" ").append(artifactId).append("\n"); - sb.append(" ").append(version).append("\n"); - sb.append("\n"); - sb.append('\n'); - sb.append("Please consider that the model you are trying to use may not be publicly\n"); - sb.append("distributable. Please refer to the DKPro Core User Guide for instructions\n"); - sb.append("on how to package non-redistributable models."); + + if (isBlank(version)) { + sb.append("I was unable to determine which version of the desired model is " + + "compatible with this component."); } else { - sb.append( - "I was unable to determine which version of the desired model is " - + "compatible with this component:\n").append(extraErrorInfo) - .append("\n"); - } + sb.append("\nPlease make sure that [").append(artifactId).append(']'); + if (StringUtils.isNotBlank(version)) { + sb.append(" version [").append(version).append(']'); + } + sb.append(" is on the classpath.\n"); + } + + // Tell user how to add model dependency + sb.append("If the version shown here is not available, try a recent version.\n"); + sb.append('\n'); + sb.append("If you are using Maven, add the following dependency to your pom.xml file:\n"); + sb.append('\n'); + sb.append("\n"); + sb.append(" ").append(groupId).append("\n"); + sb.append(" ").append(artifactId).append("\n"); + sb.append(" ").append(version).append("\n"); + sb.append("\n"); + sb.append('\n'); + sb.append("Please consider that the model you are trying to use may not be publicly\n"); + sb.append("distributable. Please refer to the DKPro Core User Guide for instructions\n"); + sb.append("on how to package non-redistributable models."); } + if (NOT_REQUIRED.equals(aLocation)) { - return new IOException("Unable to load resource: \n" - + ExceptionUtils.getRootCauseMessage(aCause) + "\n" + sb.toString()); + return new IOException("Unable to load resource: " + sb.toString(), aCause); } else { - return new IOException("Unable to load resource [" + aLocation + "]: \n" - + ExceptionUtils.getRootCauseMessage(aCause) + "\n" + sb.toString()); + return new IOException("Unable to load resource [" + aLocation + "]: " + sb.toString(), + aCause); } } @@ -946,6 +1022,7 @@ protected Properties getAggregatedProperties() throws IOException { Properties defaultValues = new Properties(defaults); + Properties props = getProperties(); if (props != null) { defaultValues.putAll(props); @@ -1072,6 +1149,102 @@ else if (!url.equals(other.url)) { return true; } } + + public static final class ArtifactCoordinates + { + private String groupId; + private String artifactId; + private String version; + + public ArtifactCoordinates(String aGroupId, String aArtifactId, String aVersion) + { + super(); + groupId = aGroupId; + artifactId = aArtifactId; + version = aVersion; + } + + public String getGroupId() + { + return groupId; + } + + public void setGroupId(String aGroupId) + { + groupId = aGroupId; + } + + public String getArtifactId() + { + return artifactId; + } + + public void setArtifactId(String aArtifactId) + { + artifactId = aArtifactId; + } + + public String getVersion() + { + return version; + } + + public void setVersion(String aVersion) + { + version = aVersion; + } + + @Override + public int hashCode() + { + final int prime = 31; + int result = 1; + result = prime * result + ((artifactId == null) ? 0 : artifactId.hashCode()); + result = prime * result + ((groupId == null) ? 0 : groupId.hashCode()); + result = prime * result + ((version == null) ? 0 : version.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + ArtifactCoordinates other = (ArtifactCoordinates) obj; + if (artifactId == null) { + if (other.artifactId != null) { + return false; + } + } + else if (!artifactId.equals(other.artifactId)) { + return false; + } + if (groupId == null) { + if (other.groupId != null) { + return false; + } + } + else if (!groupId.equals(other.groupId)) { + return false; + } + if (version == null) { + if (other.version != null) { + return false; + } + } + else if (!version.equals(other.version)) { + return false; + } + return true; + } + } private static final class ExtensibleURLClassLoader extends URLClassLoader diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceUtils.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceUtils.java similarity index 99% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceUtils.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceUtils.java index 329c9c3846..52292744ee 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceUtils.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.commons.io.IOUtils.copy; @@ -40,6 +40,7 @@ import org.apache.uima.resource.ResourceAccessException; import org.springframework.core.io.Resource; import org.springframework.core.io.support.PathMatchingResourcePatternResolver; +import org.springframework.core.io.support.ResourcePatternResolver; /** * @since 1.1.0 @@ -93,7 +94,7 @@ public static File getClasspathAsFolder(String aClasspathBase, boolean aCache) synchronized (classpathFolderCache) { File folder = classpathFolderCache.get(aClasspathBase); - PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); + ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); if (!aCache || (folder == null) || !folder.exists()) { folder = File.createTempFile("dkpro-package", ""); diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/RuntimeProvider.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/RuntimeProvider.java new file mode 100644 index 0000000000..d0007d8032 --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/RuntimeProvider.java @@ -0,0 +1,191 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.ResourceUtils.resolveLocation; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URL; +import java.util.Properties; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.springframework.core.io.UrlResource; +import org.springframework.core.io.support.PropertiesLoaderUtils; + +/** + * Provides temporary installation of binaries from the classpath to the local file system. + */ +public class RuntimeProvider +{ + private Log log = LogFactory.getLog(getClass()); + + public static final String MODE_EXECUTABLE = "executable"; + + private boolean installed; + private File workspace; + + private String baseLocation; + private PlatformDetector platformDetector; + private String platformId; + private Properties manifest; + + public RuntimeProvider(String aBaseLocation) + { + setBaseLocation(aBaseLocation); + platformDetector = new PlatformDetector(); + } + + public void setBaseLocation(String aBaseLocation) + { + baseLocation = aBaseLocation; + } + + public Properties getManifest() throws IOException + { + if (manifest == null) { + String mfl = baseLocation; + if (!mfl.endsWith("/")) { + mfl += "/"; + } + + boolean fallbackTo32Tried = false; + URL manifestUrl = null; + try { + manifestUrl = resolveLocation( + baseLocation + platformDetector.getPlatformId() + "/manifest.properties", + this, null); + platformId = platformDetector.getPlatformId(); + } + catch (FileNotFoundException e) { + // Ok, maybe we try a 32-bit fallback + } + + if (manifestUrl == null + && PlatformDetector.ARCH_X86_64.equals(platformDetector.getArch())) { + fallbackTo32Tried = true; + try { + manifestUrl = resolveLocation(baseLocation + platformDetector.getOs() + "-" + + PlatformDetector.ARCH_X86_32 + "/manifest.properties", this, null); + platformId = platformDetector.getOs() + "-" + PlatformDetector.ARCH_X86_32; + } + catch (FileNotFoundException e) { + // Ok, well, then we will generate an error next. + } + } + + if (manifestUrl == null) { + StringBuilder sb = new StringBuilder(); + sb.append("No files found for [").append(platformDetector.getPlatformId()) + .append("]"); + if (fallbackTo32Tried) { + sb.append(" Also no files for 32bit."); + } + throw new FileNotFoundException(sb.toString()); + } + else if (fallbackTo32Tried && log.isWarnEnabled()) { + log.warn("No binaries found for [" + platformDetector.getPlatformId() + "], using [" + + platformId + "] instead"); + } + + manifest = PropertiesLoaderUtils.loadProperties(new UrlResource(manifestUrl)); + } + return manifest; + } + + public boolean isInstalled() + { + return installed; + } + + public File getFile(String aFilename) throws IOException + { + install(); + File file = new File(getWorkspace(), aFilename); + if (!file.exists()) { + throw new FileNotFoundException("File not found in workspace: [" + aFilename + "]"); + } + return file; + } + + public File getWorkspace() throws IOException + { + if (workspace == null) { + workspace = File.createTempFile("dkpro", "runtime"); + FileUtils.forceDelete(workspace); + FileUtils.forceMkdir(workspace); + workspace.deleteOnExit(); + } + return workspace; + } + + public void install() throws IOException + { + if (installed) { + return; + } + + Properties manifest = getManifest(); + for (String filename : manifest.stringPropertyNames()) { + URL source = resolveLocation(baseLocation + platformId + "/" + filename, this, null); + File target = new File(getWorkspace(), filename); + InputStream is = null; + OutputStream os = null; + try { + is = source.openStream(); + os = new FileOutputStream(target); + IOUtils.copyLarge(is, os); + } + finally { + closeQuietly(is); + closeQuietly(os); + } + + if (MODE_EXECUTABLE.equals(manifest.getProperty(filename))) { + target.setExecutable(true); + } + + target.deleteOnExit(); + } + + installed = true; + } + + public void uninstall() + { + if (workspace != null) { + FileUtils.deleteQuietly(workspace); + workspace = null; + installed = false; + } + } + + @Override + protected void finalize() throws Throwable + { + uninstall(); + } +} diff --git a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/internal/ApacheCommonsLoggingAdapter.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/internal/ApacheCommonsLoggingAdapter.java similarity index 90% rename from dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/internal/ApacheCommonsLoggingAdapter.java rename to dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/internal/ApacheCommonsLoggingAdapter.java index 6c5a59b305..2abfc2308f 100644 --- a/dkpro-core-api-resources-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/resources/internal/ApacheCommonsLoggingAdapter.java +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/internal/ApacheCommonsLoggingAdapter.java @@ -1,115 +1,115 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources.internal; - -import org.apache.commons.logging.Log; -import org.apache.ivy.util.AbstractMessageLogger; -import org.apache.ivy.util.Message; - -public class ApacheCommonsLoggingAdapter - extends AbstractMessageLogger -{ - private final Log logger; - - public ApacheCommonsLoggingAdapter(Log aLogger) - { - logger = aLogger; - } - - @Override - public void log(String msg, int level) - { - switch (level) { - case Message.MSG_DEBUG: - debug(msg); - break; - case Message.MSG_VERBOSE: - verbose(msg); - break; - case Message.MSG_INFO: - info(msg); - break; - case Message.MSG_WARN: - warn(msg); - break; - case Message.MSG_ERR: - error(msg); - break; - default: - break; - } - } - - @Override - public void rawlog(String msg, int level) - { - log(msg, level); - } - - @Override - public void debug(String msg) - { - if (logger.isDebugEnabled()) { - logger.debug(msg); - } - } - - @Override - public void verbose(String msg) - { - if (logger.isTraceEnabled()) { - logger.trace(msg); - } - } - - @Override - public void info(String msg) - { - if (logger.isInfoEnabled()) { - logger.info(msg); - } - } - - @Override - public void warn(String msg) - { - if(logger.isWarnEnabled()){ - logger.warn(msg); - } - } - - @Override - public void error(String msg) - { - if(logger.isErrorEnabled()){ - logger.error(msg); - } - } - - @Override - protected void doProgress() - { - // No nothing - } - - @Override - protected void doEndProgress(String msg) - { - // No nothing - } -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources.internal; + +import org.apache.commons.logging.Log; +import org.apache.ivy.util.AbstractMessageLogger; +import org.apache.ivy.util.Message; + +public class ApacheCommonsLoggingAdapter + extends AbstractMessageLogger +{ + private final Log logger; + + public ApacheCommonsLoggingAdapter(Log aLogger) + { + logger = aLogger; + } + + @Override + public void log(String msg, int level) + { + switch (level) { + case Message.MSG_DEBUG: + debug(msg); + break; + case Message.MSG_VERBOSE: + verbose(msg); + break; + case Message.MSG_INFO: + info(msg); + break; + case Message.MSG_WARN: + warn(msg); + break; + case Message.MSG_ERR: + error(msg); + break; + default: + break; + } + } + + @Override + public void rawlog(String msg, int level) + { + log(msg, level); + } + + @Override + public void debug(String msg) + { + if (logger.isDebugEnabled()) { + logger.debug(msg); + } + } + + @Override + public void verbose(String msg) + { + if (logger.isTraceEnabled()) { + logger.trace(msg); + } + } + + @Override + public void info(String msg) + { + if (logger.isInfoEnabled()) { + logger.info(msg); + } + } + + @Override + public void warn(String msg) + { + if (logger.isWarnEnabled()) { + logger.warn(msg); + } + } + + @Override + public void error(String msg) + { + if (logger.isErrorEnabled()) { + logger.error(msg); + } + } + + @Override + protected void doProgress() + { + // No nothing + } + + @Override + protected void doEndProgress(String msg) + { + // No nothing + } +} diff --git a/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/package-info.java b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/package-info.java new file mode 100644 index 0000000000..a0e2dc0acd --- /dev/null +++ b/dkpro-core-api-resources-asl/src/main/java/org/dkpro/core/api/resources/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * API for accessing resources. + * + * @since 1.1.0 + */ +package org.dkpro.core.api.resources; diff --git a/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProviderTest.java b/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProviderTest.java deleted file mode 100644 index 0664b5b420..0000000000 --- a/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/MappingProviderTest.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; - -import static org.junit.Assert.assertEquals; - -import java.io.IOException; -import java.net.URL; -import java.util.Map; -import java.util.Properties; - -import org.apache.uima.cas.CAS; -import org.apache.uima.resource.metadata.impl.TypeSystemDescription_impl; -import org.apache.uima.util.CasCreationUtils; -import org.junit.Test; - -public class MappingProviderTest -{ - @Test - public void testLanguageChange() throws Exception - { - MappingProvider mappingProvider = new MappingProvider(); - mappingProvider.setDefault(MappingProvider.LOCATION, "src/test/resources/${language}.map"); - - CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); - - cas.setDocumentLanguage("en"); - mappingProvider.configure(cas); - Map enMap = mappingProvider.getResource(); - assertEquals("en", enMap.get("value")); - - cas.setDocumentLanguage("de"); - mappingProvider.configure(cas); - Map deMap = mappingProvider.getResource(); - assertEquals("de", deMap.get("value")); - } - - @Test - public void testDefaultVariantWithLanguageOverride() throws Exception - { - Properties defaultVariants = new Properties(); - defaultVariants.setProperty("de", "variant1"); - defaultVariants.setProperty("en", "variant2"); - - MappingProvider mappingProvider = new MappingProvider(); - mappingProvider.setDefaultVariants(defaultVariants); - mappingProvider.setDefault(MappingProvider.LOCATION, "${language}-${variant}.map"); - - CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); - - // Test default behavior - cas.setDocumentLanguage("en"); - mappingProvider.configure(cas); - assertEquals("en-variant2.map", mappingProvider.getModelLocation()); - - // Test language override affects variant - mappingProvider.setOverride(MappingProvider.LANGUAGE, "de"); - mappingProvider.configure(cas); - assertEquals("de-variant1.map", mappingProvider.getModelLocation()); - - // Test variant can still be overridden after language override - mappingProvider.setOverride(MappingProvider.LANGUAGE, "de"); - mappingProvider.setOverride(MappingProvider.VARIANT, "variant3"); - mappingProvider.configure(cas); - assertEquals("de-variant3.map", mappingProvider.getModelLocation()); - } - - @Test - public void testTagsetChange() throws Exception - { - CasConfigurableProviderBase modelProvider = new CasConfigurableProviderBase() - { - { - setDefault(LOCATION, "src/test/resources/${language}.model"); - } - - @Override - protected String produceResource(URL aUrl) - throws IOException - { - return aUrl.toString(); - } - }; - - MappingProvider mappingProvider = new MappingProvider(); - mappingProvider.setDefault(MappingProvider.LOCATION, "src/test/resources/${language}-${tagset}.map"); - mappingProvider.addImport("tagset", modelProvider); - - CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); - - cas.setDocumentLanguage("en"); - modelProvider.configure(cas); - mappingProvider.configure(cas); - Map enMap = mappingProvider.getResource(); - assertEquals("en", enMap.get("value")); - assertEquals("mytags1", enMap.get("tagset")); - - cas.setDocumentLanguage("de"); - modelProvider.configure(cas); - mappingProvider.configure(cas); - Map deMap = mappingProvider.getResource(); - assertEquals("de", deMap.get("value")); - assertEquals("mytags2", deMap.get("tagset")); - } - - @Test - public void testRedirectedModel() throws Exception - { - CasConfigurableProviderBase modelProvider = new CasConfigurableProviderBase() - { - { - setDefault(LOCATION, "src/test/resources/${language}-redirect2.properties"); - } - - @Override - protected String produceResource(URL aUrl) - throws IOException - { - return aUrl.toString(); - } - }; - - MappingProvider mappingProvider = new MappingProvider(); - mappingProvider.setDefault(MappingProvider.LOCATION, "src/test/resources/${language}-${tagset}.map"); - mappingProvider.addImport("tagset", modelProvider); - - CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); - - cas.setDocumentLanguage("de"); - modelProvider.configure(cas); - mappingProvider.configure(cas); - Map deMap = mappingProvider.getResource(); - assertEquals("de-override", deMap.get("value")); - Properties meta = modelProvider.getResourceMetaData(); - assertEquals("mytags1", meta.getProperty("tagset")); - assertEquals("true", meta.getProperty("redirect")); - assertEquals("true", meta.getProperty("redirect2")); - } - - @Test - public void testRedirectedTagset() throws Exception - { - MappingProvider mappingProvider = new MappingProvider(); - mappingProvider.setOverride(MappingProvider.LOCATION, "src/test/resources/${language}-${tagset}.map"); - mappingProvider.setDefault("tagset", "redirect"); - - CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); - - cas.setDocumentLanguage("de"); - mappingProvider.configure(cas); - - Map deMap = mappingProvider.getResource(); - assertEquals("de", deMap.get("value")); - - Properties meta = mappingProvider.getResourceMetaData(); - assertEquals("redirected", meta.getProperty("tagset")); - } -} diff --git a/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionUtilsTest.java b/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/CompressionUtilsTest.java similarity index 90% rename from dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionUtilsTest.java rename to dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/CompressionUtilsTest.java index 836797526a..14a578f60b 100644 --- a/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/CompressionUtilsTest.java +++ b/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/CompressionUtilsTest.java @@ -15,16 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; +import org.dkpro.core.api.resources.CompressionMethod; +import org.dkpro.core.api.resources.CompressionUtils; import org.junit.Test; -import java.io.*; - -import static org.junit.Assert.assertEquals; - public class CompressionUtilsTest { private static void testCompression(CompressionMethod compressionMethod) @@ -93,4 +100,4 @@ public void testPrintWriter() is.close(); file.delete(); } -} \ No newline at end of file +} diff --git a/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/MappingProviderTest.java b/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/MappingProviderTest.java new file mode 100644 index 0000000000..a1ab07bf6d --- /dev/null +++ b/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/MappingProviderTest.java @@ -0,0 +1,178 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.resources; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.net.URL; +import java.util.Map; +import java.util.Properties; + +import org.apache.uima.cas.CAS; +import org.apache.uima.resource.metadata.impl.TypeSystemDescription_impl; +import org.apache.uima.util.CasCreationUtils; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.junit.Test; + +public class MappingProviderTest +{ + @Test + public void testLanguageChange() throws Exception + { + MappingProvider mappingProvider = new MappingProvider(); + mappingProvider.setDefault(MappingProvider.LOCATION, "src/test/resources/${language}.map"); + + CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); + + cas.setDocumentLanguage("en"); + mappingProvider.configure(cas); + Map enMap = mappingProvider.getResource(); + assertEquals("en", enMap.get("value")); + + cas.setDocumentLanguage("de"); + mappingProvider.configure(cas); + Map deMap = mappingProvider.getResource(); + assertEquals("de", deMap.get("value")); + } + + @Test + public void testDefaultVariantWithLanguageOverride() throws Exception + { + Properties defaultVariants = new Properties(); + defaultVariants.setProperty("de", "variant1"); + defaultVariants.setProperty("en", "variant2"); + + MappingProvider mappingProvider = new MappingProvider(); + mappingProvider.setDefaultVariants(defaultVariants); + mappingProvider.setDefault(MappingProvider.LOCATION, "${language}-${variant}.map"); + + CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); + + // Test default behavior + cas.setDocumentLanguage("en"); + mappingProvider.configure(cas); + assertEquals("en-variant2.map", mappingProvider.getModelLocation()); + + // Test language override affects variant + mappingProvider.setOverride(MappingProvider.LANGUAGE, "de"); + mappingProvider.configure(cas); + assertEquals("de-variant1.map", mappingProvider.getModelLocation()); + + // Test variant can still be overridden after language override + mappingProvider.setOverride(MappingProvider.LANGUAGE, "de"); + mappingProvider.setOverride(MappingProvider.VARIANT, "variant3"); + mappingProvider.configure(cas); + assertEquals("de-variant3.map", mappingProvider.getModelLocation()); + } + + @Test + public void testTagsetChange() throws Exception + { + CasConfigurableProviderBase modelProvider = + new CasConfigurableProviderBase() + { + { + setDefault(LOCATION, "src/test/resources/${language}.model"); + } + + @Override + protected String produceResource(URL aUrl) throws IOException + { + return aUrl.toString(); + } + }; + + MappingProvider mappingProvider = new MappingProvider(); + mappingProvider.setDefault(MappingProvider.LOCATION, + "src/test/resources/${language}-${tagset}.map"); + mappingProvider.addImport("tagset", modelProvider); + + CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); + + cas.setDocumentLanguage("en"); + modelProvider.configure(cas); + mappingProvider.configure(cas); + Map enMap = mappingProvider.getResource(); + assertEquals("en", enMap.get("value")); + assertEquals("mytags1", enMap.get("tagset")); + + cas.setDocumentLanguage("de"); + modelProvider.configure(cas); + mappingProvider.configure(cas); + Map deMap = mappingProvider.getResource(); + assertEquals("de", deMap.get("value")); + assertEquals("mytags2", deMap.get("tagset")); + } + + @Test + public void testRedirectedModel() throws Exception + { + CasConfigurableProviderBase modelProvider = + new CasConfigurableProviderBase() + { + { + setDefault(LOCATION, "src/test/resources/${language}-redirect2.properties"); + } + + @Override + protected String produceResource(URL aUrl) throws IOException + { + return aUrl.toString(); + } + }; + + MappingProvider mappingProvider = new MappingProvider(); + mappingProvider.setDefault(MappingProvider.LOCATION, + "src/test/resources/${language}-${tagset}.map"); + mappingProvider.addImport("tagset", modelProvider); + + CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); + + cas.setDocumentLanguage("de"); + modelProvider.configure(cas); + mappingProvider.configure(cas); + Map deMap = mappingProvider.getResource(); + assertEquals("de-override", deMap.get("value")); + Properties meta = modelProvider.getResourceMetaData(); + assertEquals("mytags1", meta.getProperty("tagset")); + assertEquals("true", meta.getProperty("redirect")); + assertEquals("true", meta.getProperty("redirect2")); + } + + @Test + public void testRedirectedTagset() throws Exception + { + MappingProvider mappingProvider = new MappingProvider(); + mappingProvider.setOverride(MappingProvider.LOCATION, + "src/test/resources/${language}-${tagset}.map"); + mappingProvider.setDefault("tagset", "redirect"); + + CAS cas = CasCreationUtils.createCas(new TypeSystemDescription_impl(), null, null); + + cas.setDocumentLanguage("de"); + mappingProvider.configure(cas); + + Map deMap = mappingProvider.getResource(); + assertEquals("de", deMap.get("value")); + + Properties meta = mappingProvider.getResourceMetaData(); + assertEquals("redirected", meta.getProperty("tagset")); + } +} diff --git a/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceObjectProviderTest.java b/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/ResourceObjectProviderTest.java similarity index 83% rename from dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceObjectProviderTest.java rename to dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/ResourceObjectProviderTest.java index c8f01b5097..438391b523 100644 --- a/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceObjectProviderTest.java +++ b/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/ResourceObjectProviderTest.java @@ -15,8 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -24,7 +25,10 @@ import java.io.IOException; import java.net.URL; import java.util.Properties; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.dkpro.core.api.resources.ResourceObjectProviderBase; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -36,7 +40,7 @@ public class ResourceObjectProviderTest @Rule public TemporaryFolder folder = new TemporaryFolder(); - @Test(expected=IOException.class) + @Test(expected = IOException.class) public void testIOException() throws Exception { ResourceObjectProviderBase provider = new ResourceObjectProviderBase() @@ -62,7 +66,7 @@ protected Properties getProperties() provider.configure(); } - @Test(expected=RuntimeException.class) + @Test(expected = RuntimeException.class) public void testIORuntime() throws Exception { ResourceObjectProviderBase provider = new ResourceObjectProviderBase() @@ -136,6 +140,22 @@ public void testCaching() throws Exception assertTrue(provider1.getResource() == provider2.getResource()); } + @Test + public void testPomFindingInJar() + { + String location = "jar:file:/opt/TDMlocalRepo/de/tudarmstadt/ukp/dkpro/core/" + + "de.tudarmstadt.ukp.dkpro.core.corenlp-gpl/1.9.1/" + + "de.tudarmstadt.ukp.dkpro.core.corenlp-gpl-1.9.1.jar!/"; + + Pattern pattern = Pattern.compile(".*/(?([a-zA-Z0-9-_]+\\.)*[a-zA-Z0-9-_]+)-([0-9]+\\.)*[0-9]+(-[a-zA-Z]+)?\\.jar!/.*"); + + Matcher matcher = pattern.matcher(location); + + assertTrue(matcher.matches()); + + assertEquals("de.tudarmstadt.ukp.dkpro.core.corenlp-gpl", matcher.group("ID")); + } + private static class SharableObjectProvider extends ResourceObjectProviderBase { { diff --git a/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceUtilsTest.java b/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/ResourceUtilsTest.java similarity index 91% rename from dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceUtilsTest.java rename to dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/ResourceUtilsTest.java index 6a18de6bb2..2e9b756bbb 100644 --- a/dkpro-core-api-resources-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/resources/ResourceUtilsTest.java +++ b/dkpro-core-api-resources-asl/src/test/java/org/dkpro/core/api/resources/ResourceUtilsTest.java @@ -15,12 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.resources; +package org.dkpro.core.api.resources; import static java.util.Arrays.asList; -import static org.hamcrest.CoreMatchers.is; +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import java.io.File; @@ -34,6 +33,7 @@ import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; +import org.dkpro.core.api.resources.ResourceUtils; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -60,7 +60,7 @@ public void testClasspathAsFolder() throws Exception { File file = ResourceUtils - .getClasspathAsFolder("classpath:/de/tudarmstadt/ukp/dkpro/core/api", true); + .getClasspathAsFolder("classpath:/org/dkpro/core/api", true); List paths = new ArrayList(); for (File f : FileUtils.listFiles(file, null, true)) { @@ -102,13 +102,13 @@ public void testGetUrlAsExecutable() URL url = new URL("jar:file:src/test/resources/testfiles.zip!/testfiles/" + "FileSetCollectionReaderBase.class"); File file = ResourceUtils.getUrlAsExecutable(url, false); - assertThat(file.getName().endsWith("temp"), is(true)); + + assertThat(file.getName()).endsWith("temp"); URL url2 = new URL("jar:file:src/test/resources/testfiles.zip!/testfiles/" + "ResourceCollectionReaderBase.class"); file = ResourceUtils.getUrlAsExecutable(url2, true); - assertThat(file.getName().endsWith("temp"), is(true)); - + + assertThat(file.getName()).endsWith("temp"); } - } diff --git a/dkpro-core-api-resources-asl/src/test/resources/log4j.properties b/dkpro-core-api-resources-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-api-resources-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-api-resources-asl/src/test/resources/log4j2.xml b/dkpro-core-api-resources-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-api-resources-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-api-segmentation-asl/pom.xml b/dkpro-core-api-segmentation-asl/pom.xml index adf58d3c0e..fdbba8a5af 100644 --- a/dkpro-core-api-segmentation-asl/pom.xml +++ b/dkpro-core-api-segmentation-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + dkpro-core-api-segmentation-asl jar DKPro Core ASL - Lexical Units API + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -36,12 +37,16 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -49,8 +54,8 @@ test - org.hamcrest - hamcrest-core + org.assertj + assertj-core test diff --git a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/SegmenterBase.java b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/SegmenterBase.java deleted file mode 100644 index ac8ed4c14a..0000000000 --- a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/SegmenterBase.java +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.segmentation; - -import static org.apache.uima.fit.util.CasUtil.getType; -import static org.apache.uima.fit.util.CasUtil.select; - -import java.util.Iterator; -import java.util.Locale; -import java.util.SortedSet; -import java.util.TreeSet; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.jcas.JCas; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm; - -/** - */ -public abstract -class SegmenterBase -extends JCasAnnotator_ImplBase -{ - /** - * A list of type names used for zoning. - */ - public final static String PARAM_ZONE_TYPES = "zoneTypes"; - @ConfigurationParameter(name=PARAM_ZONE_TYPES, mandatory=false, defaultValue = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div" }) - private String[] zoneTypes; - - /** - * Strict zoning causes the segmentation to be applied only within the - * boundaries of a zone annotation. This works only if a single zone type - * is specified (the zone annotations should NOT overlap) or if no zone - * type is specified - in which case the whole document is taken as a zone. - * If strict zoning is turned off, multiple zone types can be specified. - * A list of all zone boundaries (start and end) is created and segmentation - * happens between them. - */ - public final static String PARAM_STRICT_ZONING = "strictZoning"; - @ConfigurationParameter(name=PARAM_STRICT_ZONING, mandatory=true, defaultValue="false") - private boolean strictZoning; - - /** - * The language. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name=PARAM_LANGUAGE, mandatory=false) - private String language; - - /** - * Create {@link Token} annotations. - */ - public static final String PARAM_WRITE_TOKEN = ComponentParameters.PARAM_WRITE_TOKEN; - @ConfigurationParameter(name=PARAM_WRITE_TOKEN, mandatory=true, defaultValue="true") - private boolean writeToken; - - /** - * Create {@link TokenForm} annotations. - */ - public static final String PARAM_WRITE_FORM = ComponentParameters.PARAM_WRITE_FORM; - @ConfigurationParameter(name=PARAM_WRITE_FORM, mandatory=true, defaultValue="true") - private boolean writeForm; - - /** - * Create {@link Sentence} annotations. - */ - public static final String PARAM_WRITE_SENTENCE = ComponentParameters.PARAM_WRITE_SENTENCE; - @ConfigurationParameter(name=PARAM_WRITE_SENTENCE, mandatory=true, defaultValue="true") - private boolean writeSentence; - - public boolean isStrictZoning() - { - return strictZoning; - } - - public boolean isWriteSentence() - { - return writeSentence; - } - - public boolean isWriteToken() - { - return writeToken; - } - - public String[] getZoneTypes() - { - return zoneTypes; - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - String text = jcas.getDocumentText(); - - String[] zones = getZoneTypes(); - if (isStrictZoning()) { - if (zones == null || zones.length == 0) { - process(jcas, text.substring(0, text.length()), 0); - } - else if (zones.length != 1) { - throw new AnalysisEngineProcessException(new IllegalStateException( - "Strict zoning cannot use multiple zone types")); - } else { - CAS cas = jcas.getCas(); - for (AnnotationFS zone : select(cas, getType(cas, zones[0]))) { - int[] adjusted = limit(text, zone.getBegin(), zone.getEnd()); - process(jcas, text.substring(adjusted[0], adjusted[1]), adjusted[0]); - } - } - } - else { - // This set collects all zone boundaries. - SortedSet boundarySet = new TreeSet(); - boundarySet.add(0); // Add start boundary - boundarySet.add(text.length()); // Add end boundary - - // If zoneTypes have been define then get the boundaries, otherwise we will - // simply have one big zone covering the whole document. - if (zones != null) { - // Iterate over all the zone indices and create sentences respecting - // the zone boundaries. If the zoneTypes overlap... well... bad luck! - for (String zoneName : zones) { - CAS cas = jcas.getCas(); - for (AnnotationFS zone : select(cas, getType(cas, zoneName))) { - int[] adjusted = limit(text, zone.getBegin(), zone.getEnd()); - boundarySet.add(adjusted[0]); - boundarySet.add(adjusted[1]); - } - } - } - - // Now process all zoneTypes. There will be at least two entries in the - // boundary set (see above). - Iterator bi = boundarySet.iterator(); - int begin = bi.next(); - while (bi.hasNext()) { - int end = bi.next(); - process(jcas, text.substring(begin, end), begin); - begin = end; - } - } - } - - /** - * Adjust the values in the two numeric arguments to be within the limits of the specified text. - * If the limits have to be adjusted, a warning is issued to the log. Illegal zone boundaries - * hint to a bug in the AE that produced the zone annotations. - * - * @param text - * the text. - * @param aBegin - * the zone begin. - * @param aEnd - * the zone end. - * @return reduced offsets. - */ - protected int[] limit(String text, int aBegin, int aEnd) - { - // checking to avoid out-of-bounds - int maxEnd = text.length(); - int begin = aBegin < 0 ? 0 : aBegin; - begin = begin > maxEnd ? maxEnd : begin; - - int end = aEnd < 0 ? 0 : aEnd; - end = end > maxEnd ? maxEnd : end; - - if (begin != aBegin || end != aEnd) { - getLogger().warn( - "Adjusted out-of-bounds zone [" + aBegin + "-" + aEnd + "] to [" + begin + "-" - + end + "]"); - } - - int[] offsets = { begin, end }; - return offsets; - } - - protected Sentence createSentence(final JCas aJCas, final int aBegin, - final int aEnd) - { - int[] span = new int[] { aBegin, aEnd }; - trim(aJCas.getDocumentText(), span); - if (!isEmpty(span[0], span[1]) && isWriteSentence()) { - Sentence seg = new Sentence(aJCas, span[0], span[1]); - seg.addToIndexes(aJCas); - return seg; - } - else { - return null; - } - } - - /** - * @deprecated use {@link #createToken(JCas, int, int)} - */ - @Deprecated - protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd, final int aIndex) - { - return createToken(aJCas, null, aBegin, aEnd); - } - - protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd) - { - return createToken(aJCas, null, aBegin, aEnd); - } - - protected Token createToken(final JCas aJCas, final String aForm, final int aBegin, - final int aEnd) - { - int[] span = new int[] { aBegin, aEnd }; - trim(aJCas.getDocumentText(), span); - if (!isEmpty(span[0], span[1]) && isWriteToken()) { - Token seg = new Token(aJCas, span[0], span[1]); - if (aForm != null && writeForm) { - seg.setText(aForm); - } - seg.addToIndexes(aJCas); - return seg; - } - else { - return null; - } - } - - protected abstract void process(JCas aJCas, String text, int zoneBegin) - throws AnalysisEngineProcessException; - - /** - * Remove trailing or leading whitespace from the annotation. - * - * @param aText - * the text. - * @param aSpan - * the offsets. - */ - public static void trim(String aText, int[] aSpan) - { - String data = aText; - - int begin = aSpan[0]; - int end = aSpan[1] - 1; - - // Remove whitespace at end - while ((end > 0) && trimChar(data.charAt(end))) { - end--; - } - end++; - - // Remove whitespace at start - while ((begin < end) && trimChar(data.charAt(begin))) { - begin++; - } - - aSpan[0] = begin; - aSpan[1] = end; - } - - public boolean isEmpty(int aBegin, int aEnd) - { - return aBegin >= aEnd; - } - - public static boolean trimChar(final char aChar) - { - switch (aChar) { - case '\n': return true; // Line break - case '\r': return true; // Carriage return - case '\t': return true; // Tab - case '\u200E': return true; // LEFT-TO-RIGHT MARK - case '\u200F': return true; // RIGHT-TO-LEFT MARK - case '\u2028': return true; // LINE SEPARATOR - case '\u2029': return true; // PARAGRAPH SEPARATOR - default: - return Character.isWhitespace(aChar); - } - } - - public String getLanguage(JCas aJCas) - { - if (language != null) { - return language; - } - else { - return aJCas.getDocumentLanguage(); - } - } - - /** - * Get the locale from the parameter, then from the document if available. - * If no locale is set get the default locale from the VM. - * @param aJCas the JCas. - * @return the locale. - */ - public Locale getLocale(JCas aJCas) - { - String lang = getLanguage(aJCas); - if (lang != null) { - return new Locale(lang); - } - else { - return Locale.getDefault(); - } - } -} diff --git a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/TrimUtils.java b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/TrimUtils.java new file mode 100644 index 0000000000..663422d484 --- /dev/null +++ b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/TrimUtils.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.tudarmstadt.ukp.dkpro.core.api.segmentation; + +import org.apache.uima.jcas.tcas.Annotation; + +public class TrimUtils +{ + /** + * Trim the offsets of the given annotation to remove leading/trailing whitespace. + *

+ * Note: use this method if the document text of the CAS has not been set yet but you + * have it available in a buffer. + *

+ * Note: best use this method before adding the annotation to the indexes. + * + * @param aText + * the document text (available so far). + * @param aAnnotation + * the annotation to trim. Offsets are updated. + */ + public static void trim(CharSequence aText, Annotation aAnnotation) + { + int[] offsets = { aAnnotation.getBegin(), aAnnotation.getEnd() }; + trim(aText, offsets); + aAnnotation.setBegin(offsets[0]); + aAnnotation.setEnd(offsets[1]); + } + + /** + * Remove trailing or leading whitespace from the annotation. + * @param aText the text. + * @param aSpan the offsets. + */ + public static void trim(CharSequence aText, int[] aSpan) + { + if (aSpan[0] == aSpan[1]) { + // Nothing to do on empty spans + return; + } + + int begin = aSpan[0]; + int end = aSpan[1]; + + // First we trim at the end. If a trimmed span is empty, we want to return the original + // begin as the begin/end of the trimmed span + while ( + (end > 0) + && end > begin + && trimChar(aText.charAt(end - 1)) + ) { + end --; + } + + // Then, trim at the start + while ( + (begin < (aText.length() - 1)) + && begin < end + && trimChar(aText.charAt(begin)) + ) { + begin ++; + } + + aSpan[0] = begin; + aSpan[1] = end; + } + + private static boolean trimChar(final char aChar) + { + switch (aChar) { + case '\n': return true; // Line break + case '\r': return true; // Carriage return + case '\t': return true; // Tab + case '\u200E': return true; // LEFT-TO-RIGHT MARK + case '\u200F': return true; // RIGHT-TO-LEFT MARK + case '\u2028': return true; // LINE SEPARATOR + case '\u2029': return true; // PARAGRAPH SEPARATOR + default: + return Character.isWhitespace(aChar); + } + } +} diff --git a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/package-info.java b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/package-info.java deleted file mode 100644 index 3954389be2..0000000000 --- a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * API for tokenization and segmentation. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.api.segmentation; diff --git a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Compound.java b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Compound.java index f6d01a95b6..651a87997b 100644 --- a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Compound.java +++ b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Compound.java @@ -1,238 +1,256 @@ -/* First created by JCasGen Sat Aug 04 18:47:40 CEST 2012 */ + +/* Apache UIMA v3 - First created by JCasGen Sun Jan 28 11:38:37 CET 2018 */ + package de.tudarmstadt.ukp.dkpro.core.api.segmentation.type; +import java.lang.invoke.CallSite; +import java.lang.invoke.MethodHandle; import java.util.ArrayList; import java.util.Collection; import java.util.List; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.TypeImpl; +import org.apache.uima.cas.impl.TypeSystemImpl; import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JCasRegistry; import org.apache.uima.jcas.cas.FSArray; -import org.apache.uima.jcas.cas.TOP_Type; import org.apache.uima.jcas.tcas.Annotation; - -/** - * Updated by JCasGen Sat Aug 04 18:48:32 CEST 2012 - * XML source: /Users/bluefire/UKP/Workspaces/dkpro-juno/de.tudarmstadt.ukp.dkpro.core-asl/de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl/src/main/resources/desc/type/Segmentation.xml +/** This type represents a decompounding word, i.e.: flowerpot. Each Compound one have at least two Splits. + * Updated by JCasGen Sun Jan 28 11:38:37 CET 2018 + * XML source: /Users/bluefire/git/dkpro-core/dkpro-core-api-segmentation-asl/src/main/resources/desc/type/LexicalUnits_customized.xml * @generated */ public class Compound extends Annotation { - /** @generated - * @ordered - */ - @SuppressWarnings ("hiding") - public final static int typeIndexID = JCasRegistry.register(Compound.class); - /** @generated - * @ordered - */ - @SuppressWarnings ("hiding") - public final static int type = typeIndexID; - /** @generated */ - @Override - public int getTypeIndexID() {return typeIndexID;} - - /** Never called. Disable default constructor - * @generated */ - protected Compound() {/* intentionally empty block */} - - /** Internal - constructor used by generator - * @generated - * @param addr low level Feature Structure reference - * @param type the type of this Feature Structure - */ - public Compound(int addr, TOP_Type type) { - super(addr, type); - readObject(); - } - - /** @generated - * @param jcas JCas to which this Feature Structure belongs - */ - public Compound(JCas jcas) { - super(jcas); - readObject(); - } - - /** @generated - * @param jcas JCas to which this Feature Structure belongs - * @param begin offset to the begin spot in the SofA - * @param end offset to the end spot in the SofA - */ - public Compound(JCas jcas, int begin, int end) { - super(jcas); - setBegin(begin); - setEnd(end); - readObject(); - } - - /** - * Write your own initialization here - * - @generated modifiable */ - private void readObject() {/*default - does nothing empty block */} - - - - //*--------------* - //* Feature: splits - - /** getter for splits - gets A word that can be decomposed into different parts. + + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static String _TypeName = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"; + + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static int typeIndexID = JCasRegistry.register(Compound.class); + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static int type = typeIndexID; + /** @generated + * @return index of the type + */ + @Override + public int getTypeIndexID() {return typeIndexID;} + + + /* ******************* + * Feature Offsets * + * *******************/ + + public final static String _FeatName_splits = "splits"; + + + /* Feature Adjusted Offsets */ + private final static CallSite _FC_splits = TypeSystemImpl.createCallSite(Compound.class, "splits"); + private final static MethodHandle _FH_splits = _FC_splits.dynamicInvoker(); + + + /** Never called. Disable default constructor + * @generated */ + protected Compound() {/* intentionally empty block */} + + /** Internal - constructor used by generator + * @generated + * @param casImpl the CAS this Feature Structure belongs to + * @param type the type of this Feature Structure + */ + public Compound(TypeImpl type, CASImpl casImpl) { + super(type, casImpl); + readObject(); + } + + /** @generated + * @param jcas JCas to which this Feature Structure belongs + */ + public Compound(JCas jcas) { + super(jcas); + readObject(); + } + + + /** @generated + * @param jcas JCas to which this Feature Structure belongs + * @param begin offset to the begin spot in the SofA + * @param end offset to the end spot in the SofA + */ + public Compound(JCas jcas, int begin, int end) { + super(jcas); + setBegin(begin); + setEnd(end); + readObject(); + } + + /** + * + * Write your own initialization here + * + * + * @generated modifiable + */ + private void readObject() {/*default - does nothing empty block */} + + + + //*--------------* + //* Feature: splits + + /** getter for splits - gets A word that can be decomposed into different parts. * @generated * @return value of the feature */ - public FSArray getSplits() { - if (Compound_Type.featOkTst && ((Compound_Type)jcasType).casFeat_splits == null) { - jcasType.jcas.throwFeatMissing("splits", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - } - return (FSArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Compound_Type)jcasType).casFeatCode_splits)));} - - /** setter for splits - sets A word that can be decomposed into different parts. + public FSArray getSplits() { return (FSArray)(_getFeatureValueNc(wrapGetIntCatchException(_FH_splits)));} + + /** setter for splits - sets A word that can be decomposed into different parts. * @generated * @param v value to set into the feature */ - public void setSplits(FSArray v) { - if (Compound_Type.featOkTst && ((Compound_Type)jcasType).casFeat_splits == null) { - jcasType.jcas.throwFeatMissing("splits", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - } - jcasType.ll_cas.ll_setRefValue(addr, ((Compound_Type)jcasType).casFeatCode_splits, jcasType.ll_cas.ll_getFSRef(v));} - - /** indexed getter for splits - gets an indexed value - A word that can be decomposed into different parts. + public void setSplits(FSArray v) { + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_splits), v); + } + + + /** indexed getter for splits - gets an indexed value - A word that can be decomposed into different parts. * @generated * @param i index in the array to get * @return value of the element at index i */ - public Split getSplits(int i) { - if (Compound_Type.featOkTst && ((Compound_Type)jcasType).casFeat_splits == null) { - jcasType.jcas.throwFeatMissing("splits", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - } - jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((Compound_Type)jcasType).casFeatCode_splits), i); - return (Split)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((Compound_Type)jcasType).casFeatCode_splits), i)));} + public Split getSplits(int i) { + return (Split)(((FSArray)(_getFeatureValueNc(wrapGetIntCatchException(_FH_splits)))).get(i));} - /** indexed setter for splits - sets an indexed value - A word that can be decomposed into different parts. + /** indexed setter for splits - sets an indexed value - A word that can be decomposed into different parts. * @generated * @param i index in the array to set * @param v value to set into the array */ - public void setSplits(int i, Split v) { - if (Compound_Type.featOkTst && ((Compound_Type)jcasType).casFeat_splits == null) { - jcasType.jcas.throwFeatMissing("splits", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - } - jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((Compound_Type)jcasType).casFeatCode_splits), i); - jcasType.ll_cas.ll_setRefArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((Compound_Type)jcasType).casFeatCode_splits), i, jcasType.ll_cas.ll_getFSRef(v));} - - - /** - * Enum for all possible split levels for decompounding - */ - public enum CompoundSplitLevel - { - NONE, ALL, LOWEST, HIGHEST - }; - - /** - * - * Returns the splits from each leave from the split tree, excluding the linking morphemes - * @param splitLevel the split level. - * - * @return An array with the splits from each leave from the split tree. - */ - public Split[] getSplitsWithoutMorpheme(CompoundSplitLevel splitLevel) - { - List splits = getSplits(createSplitsFromFSArray(getSplits()), false, splitLevel); - return splits.toArray(new Split[splits.size()]); - } - - /** - * - * Returns the splits from each leave from the split tree, including the linking morphemes - * @param splitLevel the split level. - * - * @return An array with the splits from each leave from the split tree. - * - */ - - public Split[] getSplitsWithMorpheme(CompoundSplitLevel splitLevel) - { - final List splits = getSplits(createSplitsFromFSArray(getSplits()), true, splitLevel); - return splits.toArray(new Split[splits.size()]); - } - - /** - * - * Returns a list of the fragments present in the leaves from the split tree stored in the - * splits array. - * - * @param splits - * Array containing the split tree - * @param includeMorpheme - * Indicates whether or not the linking morphemes should be included - * @param splitLevel - * The level of leaves that should be returned - * @return A list of all splits on a level - */ - private List getSplits(final Split[] splits, final boolean includeMorpheme, - CompoundSplitLevel splitLevel) - { - List splitList = new ArrayList(); - - switch (splitLevel) { - - case ALL: - for (Split split : splits) { - if (includeMorpheme || !(split instanceof LinkingMorpheme)) { - splitList.add(split); - } - if (split.getSplits() != null) { - splitList.addAll(getSplits(createSplitsFromFSArray(split.getSplits()), - includeMorpheme, splitLevel)); - } - } - return splitList; - - case LOWEST: - for (Split split : splits) { - if ((includeMorpheme || !(split instanceof LinkingMorpheme)) - && (split.getSplits() == null || split.getSplits().size() == 0)) { - splitList.add(split); - } - if (split.getSplits() != null) { - splitList.addAll(getSplits(createSplitsFromFSArray(split.getSplits()), - includeMorpheme, splitLevel)); - } - - } - return splitList; - - case HIGHEST: - for (Split split : splits) { - if (includeMorpheme || !(split instanceof LinkingMorpheme)) { - splitList.add(split); - } - } - return splitList; - - default: - return splitList; - } - } - - /** - * - * Create a Split[] array from a FSArray - * - * @param splitsFSArray - * FSArray containing the splits - * @return The array containing the splits from FSArray - */ - private Split[] createSplitsFromFSArray(final FSArray splitsFSArray) - { - final Collection splitsCollection = FSCollectionFactory.create(splitsFSArray, - Split.class); - return splitsCollection.toArray(new Split[splitsCollection.size()]); - } -} + public void setSplits(int i, Split v) { + ((FSArray)(_getFeatureValueNc(wrapGetIntCatchException(_FH_splits)))).set(i, v); + } + + + /** + * Enum for all possible split levels for decompounding + */ + public enum CompoundSplitLevel + { + NONE, ALL, LOWEST, HIGHEST + }; + + /** + * + * Returns the splits from each leave from the split tree, excluding the linking morphemes + * @param splitLevel the split level. + * + * @return An array with the splits from each leave from the split tree. + */ + public Split[] getSplitsWithoutMorpheme(CompoundSplitLevel splitLevel) + { + List splits = getSplits(createSplitsFromFSArray(getSplits()), false, splitLevel); + return splits.toArray(new Split[splits.size()]); + } + + /** + * + * Returns the splits from each leave from the split tree, including the linking morphemes + * @param splitLevel the split level. + * + * @return An array with the splits from each leave from the split tree. + * + */ + public Split[] getSplitsWithMorpheme(CompoundSplitLevel splitLevel) + { + final List splits = getSplits(createSplitsFromFSArray(getSplits()), true, splitLevel); + return splits.toArray(new Split[splits.size()]); + } + + /** + * + * Returns a list of the fragments present in the leaves from the split tree stored in the + * splits array. + * + * @param splits + * Array containing the split tree + * @param includeMorpheme + * Indicates whether or not the linking morphemes should be included + * @param splitLevel + * The level of leaves that should be returned + * @return A list of all splits on a level + */ + private List getSplits(final Split[] splits, final boolean includeMorpheme, + CompoundSplitLevel splitLevel) + { + List splitList = new ArrayList(); + + switch (splitLevel) { + + case ALL: + for (Split split : splits) { + if (includeMorpheme || !(split instanceof LinkingMorpheme)) { + splitList.add(split); + } + if (split.getSplits() != null) { + splitList.addAll(getSplits(createSplitsFromFSArray(split.getSplits()), + includeMorpheme, splitLevel)); + } + } + return splitList; + + case LOWEST: + for (Split split : splits) { + if ((includeMorpheme || !(split instanceof LinkingMorpheme)) + && (split.getSplits() == null || split.getSplits().size() == 0)) { + splitList.add(split); + } + if (split.getSplits() != null) { + splitList.addAll(getSplits(createSplitsFromFSArray(split.getSplits()), + includeMorpheme, splitLevel)); + } + + } + return splitList; + + case HIGHEST: + for (Split split : splits) { + if (includeMorpheme || !(split instanceof LinkingMorpheme)) { + splitList.add(split); + } + } + return splitList; + + default: + return splitList; + } + } + + /** + * + * Create a Split[] array from a FSArray + * + * @param splitsFSArray + * FSArray containing the splits + * @return The array containing the splits from FSArray + */ + private Split[] createSplitsFromFSArray(final FSArray splitsFSArray) + { + final Collection splitsCollection = FSCollectionFactory.create(splitsFSArray, + Split.class); + return splitsCollection.toArray(new Split[splitsCollection.size()]); + } +} diff --git a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Compound_Type.java b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Compound_Type.java deleted file mode 100644 index f4904a46bd..0000000000 --- a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Compound_Type.java +++ /dev/null @@ -1,119 +0,0 @@ - -/* First created by JCasGen Sat Aug 04 18:47:40 CEST 2012 */ -package de.tudarmstadt.ukp.dkpro.core.api.segmentation.type; - -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.cas.impl.CASImpl; -import org.apache.uima.cas.impl.FSGenerator; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.cas.impl.TypeImpl; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.impl.FeatureImpl; -import org.apache.uima.cas.Feature; -import org.apache.uima.jcas.tcas.Annotation_Type; - -/** - * Updated by JCasGen Sat Aug 04 18:48:32 CEST 2012 - * @generated */ -public class Compound_Type extends Annotation_Type { - /** @generated */ - @Override - protected FSGenerator getFSGenerator() {return fsGenerator;} - /** @generated */ - private final FSGenerator fsGenerator = - new FSGenerator() { - @Override - public FeatureStructure createFS(int addr, CASImpl cas) { - if (Compound_Type.this.useExistingInstance) { - // Return eq fs instance if already created - FeatureStructure fs = Compound_Type.this.jcas.getJfsFromCaddr(addr); - if (null == fs) { - fs = new Compound(addr, Compound_Type.this); - Compound_Type.this.jcas.putJfsFromCaddr(addr, fs); - return fs; - } - return fs; - } else return new Compound(addr, Compound_Type.this); - } - }; - /** @generated */ - @SuppressWarnings ("hiding") - public final static int typeIndexID = Compound.typeIndexID; - /** @generated - @modifiable */ - @SuppressWarnings ("hiding") - public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - - /** @generated */ - final Feature casFeat_splits; - /** @generated */ - final int casFeatCode_splits; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getSplits(int addr) { - if (featOkTst && casFeat_splits == null) - jcas.throwFeatMissing("splits", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - return ll_cas.ll_getRefValue(addr, casFeatCode_splits); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setSplits(int addr, int v) { - if (featOkTst && casFeat_splits == null) - jcas.throwFeatMissing("splits", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - ll_cas.ll_setRefValue(addr, casFeatCode_splits, v);} - - /** @generated - * @param addr low level Feature Structure reference - * @param i index of item in the array - * @return value at index i in the array - */ - public int getSplits(int addr, int i) { - if (featOkTst && casFeat_splits == null) - jcas.throwFeatMissing("splits", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - if (lowLevelTypeChecks) - return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_splits), i, true); - jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_splits), i); - return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_splits), i); - } - - /** @generated - * @param addr low level Feature Structure reference - * @param i index of item in the array - * @param v value to set - */ - public void setSplits(int addr, int i, int v) { - if (featOkTst && casFeat_splits == null) - jcas.throwFeatMissing("splits", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound"); - if (lowLevelTypeChecks) - ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_splits), i, v, true); - jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_splits), i); - ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_splits), i, v); - } - - - - - /** initialize variables to correspond with Cas Type and Features - * @generated - * @param jcas JCas - * @param casType Type - */ - public Compound_Type(JCas jcas, Type casType) { - super(jcas, casType); - casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); - - - casFeat_splits = jcas.getRequiredFeatureDE(casType, "splits", "uima.cas.FSArray", featOkTst); - casFeatCode_splits = (null == casFeat_splits) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_splits).getCode(); - - } -} - - - - \ No newline at end of file diff --git a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Token.java b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Token.java index 2a7f64daad..fbd2722373 100644 --- a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Token.java +++ b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Token.java @@ -1,38 +1,37 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * First created by JCasGen Thu Sep 15 23:03:44 EEST 2016 - */ + + + +/* Apache UIMA v3 - First created by JCasGen Sun Jan 28 11:38:37 CET 2018 */ + package de.tudarmstadt.ukp.dkpro.core.api.segmentation.type; -import org.apache.uima.jcas.JCas; +import java.lang.invoke.CallSite; +import java.lang.invoke.MethodHandle; + +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.TypeImpl; +import org.apache.uima.cas.impl.TypeSystemImpl; +import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.jcas.cas.TOP_Type; +import org.apache.uima.jcas.tcas.Annotation; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import org.apache.uima.jcas.tcas.Annotation; /**

Token is one of the two types commonly produced by a segmenter (the other being Sentence). A Token usually represents a word, although it may be used to represent multiple tightly connected words (e.g. "New York") or parts of a word (e.g. the possessive "'s"). One may choose to split compound words into multiple tokens, e.g. ("CamelCase" -> "Camel", "Case"; "Zauberstab" -> "Zauber", "stab"). Most processing components operate on Tokens, usually within the limits of the surrounding Sentence. E.g. a part-of-speech tagger analyses each Token in a Sentence and assigns a part-of-speech to each Token.

- * Updated by JCasGen Tue Mar 07 16:08:28 CET 2017 + * Updated by JCasGen Sun Jan 28 11:38:37 CET 2018 * XML source: /Users/bluefire/git/dkpro-core/dkpro-core-api-segmentation-asl/src/main/resources/desc/type/LexicalUnits_customized.xml * @generated */ public class Token extends Annotation { + + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static String _TypeName = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"; + /** @generated * @ordered */ @@ -49,17 +48,51 @@ public class Token extends Annotation { @Override public int getTypeIndexID() {return typeIndexID;} + + /* ******************* + * Feature Offsets * + * *******************/ + + public final static String _FeatName_parent = "parent"; + public final static String _FeatName_lemma = "lemma"; + public final static String _FeatName_stem = "stem"; + public final static String _FeatName_pos = "pos"; + public final static String _FeatName_morph = "morph"; + public final static String _FeatName_id = "id"; + public final static String _FeatName_form = "form"; + public final static String _FeatName_syntacticFunction = "syntacticFunction"; + + + /* Feature Adjusted Offsets */ + private final static CallSite _FC_parent = TypeSystemImpl.createCallSite(Token.class, "parent"); + private final static MethodHandle _FH_parent = _FC_parent.dynamicInvoker(); + private final static CallSite _FC_lemma = TypeSystemImpl.createCallSite(Token.class, "lemma"); + private final static MethodHandle _FH_lemma = _FC_lemma.dynamicInvoker(); + private final static CallSite _FC_stem = TypeSystemImpl.createCallSite(Token.class, "stem"); + private final static MethodHandle _FH_stem = _FC_stem.dynamicInvoker(); + private final static CallSite _FC_pos = TypeSystemImpl.createCallSite(Token.class, "pos"); + private final static MethodHandle _FH_pos = _FC_pos.dynamicInvoker(); + private final static CallSite _FC_morph = TypeSystemImpl.createCallSite(Token.class, "morph"); + private final static MethodHandle _FH_morph = _FC_morph.dynamicInvoker(); + private final static CallSite _FC_id = TypeSystemImpl.createCallSite(Token.class, "id"); + private final static MethodHandle _FH_id = _FC_id.dynamicInvoker(); + private final static CallSite _FC_form = TypeSystemImpl.createCallSite(Token.class, "form"); + private final static MethodHandle _FH_form = _FC_form.dynamicInvoker(); + private final static CallSite _FC_syntacticFunction = TypeSystemImpl.createCallSite(Token.class, "syntacticFunction"); + private final static MethodHandle _FH_syntacticFunction = _FC_syntacticFunction.dynamicInvoker(); + + /** Never called. Disable default constructor * @generated */ protected Token() {/* intentionally empty block */} /** Internal - constructor used by generator * @generated - * @param addr low level Feature Structure reference + * @param casImpl the CAS this Feature Structure belongs to * @param type the type of this Feature Structure */ - public Token(int addr, TOP_Type type) { - super(addr, type); + public Token(TypeImpl type, CASImpl casImpl) { + super(type, casImpl); readObject(); } @@ -71,6 +104,7 @@ public Token(JCas jcas) { readObject(); } + /** @generated * @param jcas JCas to which this Feature Structure belongs * @param begin offset to the begin spot in the SofA @@ -101,19 +135,16 @@ private void readObject() {/*default - does nothing empty block */} * @generated * @return value of the feature */ - public Annotation getParent() { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_parent == null) - jcasType.jcas.throwFeatMissing("parent", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return (Annotation)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Token_Type)jcasType).casFeatCode_parent)));} + public Annotation getParent() { return (Annotation)(_getFeatureValueNc(wrapGetIntCatchException(_FH_parent)));} /** setter for parent - sets the parent of this token. This feature is meant to be used in when the token participates in a constituency parse and then refers to a constituent containing this token. The type of this feature is {@link Annotation} to avoid adding a dependency on the syntax API module. * @generated * @param v value to set into the feature */ public void setParent(Annotation v) { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_parent == null) - jcasType.jcas.throwFeatMissing("parent", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - jcasType.ll_cas.ll_setRefValue(addr, ((Token_Type)jcasType).casFeatCode_parent, jcasType.ll_cas.ll_getFSRef(v));} + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_parent), v); + } + //*--------------* @@ -123,19 +154,16 @@ public void setParent(Annotation v) { * @generated * @return value of the feature */ - public Lemma getLemma() { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_lemma == null) - jcasType.jcas.throwFeatMissing("lemma", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return (Lemma)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Token_Type)jcasType).casFeatCode_lemma)));} + public Lemma getLemma() { return (Lemma)(_getFeatureValueNc(wrapGetIntCatchException(_FH_lemma)));} /** setter for lemma - sets * @generated * @param v value to set into the feature */ public void setLemma(Lemma v) { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_lemma == null) - jcasType.jcas.throwFeatMissing("lemma", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - jcasType.ll_cas.ll_setRefValue(addr, ((Token_Type)jcasType).casFeatCode_lemma, jcasType.ll_cas.ll_getFSRef(v));} + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_lemma), v); + } + //*--------------* @@ -145,19 +173,16 @@ public void setLemma(Lemma v) { * @generated * @return value of the feature */ - public Stem getStem() { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_stem == null) - jcasType.jcas.throwFeatMissing("stem", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return (Stem)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Token_Type)jcasType).casFeatCode_stem)));} + public Stem getStem() { return (Stem)(_getFeatureValueNc(wrapGetIntCatchException(_FH_stem)));} /** setter for stem - sets * @generated * @param v value to set into the feature */ public void setStem(Stem v) { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_stem == null) - jcasType.jcas.throwFeatMissing("stem", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - jcasType.ll_cas.ll_setRefValue(addr, ((Token_Type)jcasType).casFeatCode_stem, jcasType.ll_cas.ll_getFSRef(v));} + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_stem), v); + } + //*--------------* @@ -167,19 +192,16 @@ public void setStem(Stem v) { * @generated * @return value of the feature */ - public POS getPos() { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_pos == null) - jcasType.jcas.throwFeatMissing("pos", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return (POS)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Token_Type)jcasType).casFeatCode_pos)));} + public POS getPos() { return (POS)(_getFeatureValueNc(wrapGetIntCatchException(_FH_pos)));} /** setter for pos - sets * @generated * @param v value to set into the feature */ public void setPos(POS v) { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_pos == null) - jcasType.jcas.throwFeatMissing("pos", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - jcasType.ll_cas.ll_setRefValue(addr, ((Token_Type)jcasType).casFeatCode_pos, jcasType.ll_cas.ll_getFSRef(v));} + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_pos), v); + } + //*--------------* @@ -189,19 +211,16 @@ public void setPos(POS v) { * @generated * @return value of the feature */ - public MorphologicalFeatures getMorph() { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_morph == null) - jcasType.jcas.throwFeatMissing("morph", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return (MorphologicalFeatures)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Token_Type)jcasType).casFeatCode_morph)));} + public MorphologicalFeatures getMorph() { return (MorphologicalFeatures)(_getFeatureValueNc(wrapGetIntCatchException(_FH_morph)));} /** setter for morph - sets The morphological feature associated with this token. * @generated * @param v value to set into the feature */ public void setMorph(MorphologicalFeatures v) { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_morph == null) - jcasType.jcas.throwFeatMissing("morph", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - jcasType.ll_cas.ll_setRefValue(addr, ((Token_Type)jcasType).casFeatCode_morph, jcasType.ll_cas.ll_getFSRef(v));} + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_morph), v); + } + //*--------------* @@ -211,19 +230,16 @@ public void setMorph(MorphologicalFeatures v) { * @generated * @return value of the feature */ - public String getId() { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_id == null) - jcasType.jcas.throwFeatMissing("id", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return jcasType.ll_cas.ll_getStringValue(addr, ((Token_Type)jcasType).casFeatCode_id);} + public String getId() { return _getStringValueNc(wrapGetIntCatchException(_FH_id));} /** setter for id - sets If this unit had an ID in the source format from which it was imported, it may be stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it should be respected by writers. * @generated * @param v value to set into the feature */ public void setId(String v) { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_id == null) - jcasType.jcas.throwFeatMissing("id", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - jcasType.ll_cas.ll_setStringValue(addr, ((Token_Type)jcasType).casFeatCode_id, v);} + _setStringValueNfc(wrapGetIntCatchException(_FH_id), v); + } + //*--------------* @@ -233,19 +249,16 @@ public void setId(String v) { * @generated * @return value of the feature */ - public TokenForm getForm() { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_form == null) - jcasType.jcas.throwFeatMissing("form", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return (TokenForm)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Token_Type)jcasType).casFeatCode_form)));} + public TokenForm getForm() { return (TokenForm)(_getFeatureValueNc(wrapGetIntCatchException(_FH_form)));} /** setter for form - sets Potentially normalized form of the token text that should be used instead of the covered text if set. * @generated * @param v value to set into the feature */ public void setForm(TokenForm v) { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_form == null) - jcasType.jcas.throwFeatMissing("form", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - jcasType.ll_cas.ll_setRefValue(addr, ((Token_Type)jcasType).casFeatCode_form, jcasType.ll_cas.ll_getFSRef(v));} + _setFeatureValueNcWj(wrapGetIntCatchException(_FH_form), v); + } + //*--------------* @@ -255,20 +268,17 @@ public void setForm(TokenForm v) { * @generated * @return value of the feature */ - public String getSyntacticFunction() { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_syntacticFunction == null) - jcasType.jcas.throwFeatMissing("syntacticFunction", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return jcasType.ll_cas.ll_getStringValue(addr, ((Token_Type)jcasType).casFeatCode_syntacticFunction);} + public String getSyntacticFunction() { return _getStringValueNc(wrapGetIntCatchException(_FH_syntacticFunction));} /** setter for syntacticFunction - sets * @generated * @param v value to set into the feature */ public void setSyntacticFunction(String v) { - if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_syntacticFunction == null) - jcasType.jcas.throwFeatMissing("syntacticFunction", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - jcasType.ll_cas.ll_setStringValue(addr, ((Token_Type)jcasType).casFeatCode_syntacticFunction, v);} - /** + _setStringValueNfc(wrapGetIntCatchException(_FH_syntacticFunction), v); + } + + /** * @return the POS value if there is a {@link POS} annotation linked to this token. */ public String getPosValue() { @@ -322,7 +332,14 @@ public void setText(String aText) { if (aText != null && !aText.equals(getCoveredText())) { // Create form annotation if none is here yet if (form == null) { - form = new TokenForm(jcasType.jcas, getBegin(), getEnd()); + try { + form = new TokenForm(getCAS().getJCas(), getBegin(), getEnd()); + } + catch (CASException e) { + // This should actually never happen since a JCas FS class should always be + // associated with a JCas. + throw new IllegalStateException(e); + } form.addToIndexes(); } @@ -336,5 +353,3 @@ else if (form != null) { } } } - - \ No newline at end of file diff --git a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Token_Type.java b/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Token_Type.java deleted file mode 100644 index e65f666f48..0000000000 --- a/dkpro-core-api-segmentation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/Token_Type.java +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* First created by JCasGen Thu Sep 15 23:03:44 EEST 2016 */ -package de.tudarmstadt.ukp.dkpro.core.api.segmentation.type; - -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.JCasRegistry; -import org.apache.uima.cas.impl.TypeImpl; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.impl.FeatureImpl; -import org.apache.uima.cas.Feature; -import org.apache.uima.jcas.tcas.Annotation_Type; - -/**

Token is one of the two types commonly produced by a segmenter (the other being Sentence). A Token usually represents a word, although it may be used to represent multiple tightly connected words (e.g. "New York") or parts of a word (e.g. the possessive "'s"). One may choose to split compound words into multiple tokens, e.g. ("CamelCase" -> "Camel", "Case"; "Zauberstab" -> "Zauber", "stab"). Most processing components operate on Tokens, usually within the limits of the surrounding Sentence. E.g. a part-of-speech tagger analyses each Token in a Sentence and assigns a part-of-speech to each Token.

- * Updated by JCasGen Tue Mar 07 16:08:28 CET 2017 - * @generated */ -public class Token_Type extends Annotation_Type { - /** @generated */ - @SuppressWarnings ("hiding") - public final static int typeIndexID = Token.typeIndexID; - /** @generated - @modifiable */ - @SuppressWarnings ("hiding") - public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - - /** @generated */ - final Feature casFeat_parent; - /** @generated */ - final int casFeatCode_parent; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getParent(int addr) { - if (featOkTst && casFeat_parent == null) - jcas.throwFeatMissing("parent", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return ll_cas.ll_getRefValue(addr, casFeatCode_parent); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setParent(int addr, int v) { - if (featOkTst && casFeat_parent == null) - jcas.throwFeatMissing("parent", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - ll_cas.ll_setRefValue(addr, casFeatCode_parent, v);} - - - - /** @generated */ - final Feature casFeat_lemma; - /** @generated */ - final int casFeatCode_lemma; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getLemma(int addr) { - if (featOkTst && casFeat_lemma == null) - jcas.throwFeatMissing("lemma", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return ll_cas.ll_getRefValue(addr, casFeatCode_lemma); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setLemma(int addr, int v) { - if (featOkTst && casFeat_lemma == null) - jcas.throwFeatMissing("lemma", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - ll_cas.ll_setRefValue(addr, casFeatCode_lemma, v);} - - - - /** @generated */ - final Feature casFeat_stem; - /** @generated */ - final int casFeatCode_stem; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getStem(int addr) { - if (featOkTst && casFeat_stem == null) - jcas.throwFeatMissing("stem", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return ll_cas.ll_getRefValue(addr, casFeatCode_stem); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setStem(int addr, int v) { - if (featOkTst && casFeat_stem == null) - jcas.throwFeatMissing("stem", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - ll_cas.ll_setRefValue(addr, casFeatCode_stem, v);} - - - - /** @generated */ - final Feature casFeat_pos; - /** @generated */ - final int casFeatCode_pos; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getPos(int addr) { - if (featOkTst && casFeat_pos == null) - jcas.throwFeatMissing("pos", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return ll_cas.ll_getRefValue(addr, casFeatCode_pos); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setPos(int addr, int v) { - if (featOkTst && casFeat_pos == null) - jcas.throwFeatMissing("pos", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - ll_cas.ll_setRefValue(addr, casFeatCode_pos, v);} - - - - /** @generated */ - final Feature casFeat_morph; - /** @generated */ - final int casFeatCode_morph; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getMorph(int addr) { - if (featOkTst && casFeat_morph == null) - jcas.throwFeatMissing("morph", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return ll_cas.ll_getRefValue(addr, casFeatCode_morph); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setMorph(int addr, int v) { - if (featOkTst && casFeat_morph == null) - jcas.throwFeatMissing("morph", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - ll_cas.ll_setRefValue(addr, casFeatCode_morph, v);} - - - - /** @generated */ - final Feature casFeat_id; - /** @generated */ - final int casFeatCode_id; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getId(int addr) { - if (featOkTst && casFeat_id == null) - jcas.throwFeatMissing("id", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return ll_cas.ll_getStringValue(addr, casFeatCode_id); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setId(int addr, String v) { - if (featOkTst && casFeat_id == null) - jcas.throwFeatMissing("id", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - ll_cas.ll_setStringValue(addr, casFeatCode_id, v);} - - - - /** @generated */ - final Feature casFeat_form; - /** @generated */ - final int casFeatCode_form; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public int getForm(int addr) { - if (featOkTst && casFeat_form == null) - jcas.throwFeatMissing("form", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return ll_cas.ll_getRefValue(addr, casFeatCode_form); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setForm(int addr, int v) { - if (featOkTst && casFeat_form == null) - jcas.throwFeatMissing("form", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - ll_cas.ll_setRefValue(addr, casFeatCode_form, v);} - - - - /** @generated */ - final Feature casFeat_syntacticFunction; - /** @generated */ - final int casFeatCode_syntacticFunction; - /** @generated - * @param addr low level Feature Structure reference - * @return the feature value - */ - public String getSyntacticFunction(int addr) { - if (featOkTst && casFeat_syntacticFunction == null) - jcas.throwFeatMissing("syntacticFunction", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - return ll_cas.ll_getStringValue(addr, casFeatCode_syntacticFunction); - } - /** @generated - * @param addr low level Feature Structure reference - * @param v value to set - */ - public void setSyntacticFunction(int addr, String v) { - if (featOkTst && casFeat_syntacticFunction == null) - jcas.throwFeatMissing("syntacticFunction", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"); - ll_cas.ll_setStringValue(addr, casFeatCode_syntacticFunction, v);} - - - - - - /** initialize variables to correspond with Cas Type and Features - * @generated - * @param jcas JCas - * @param casType Type - */ - public Token_Type(JCas jcas, Type casType) { - super(jcas, casType); - casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); - - - casFeat_parent = jcas.getRequiredFeatureDE(casType, "parent", "uima.tcas.Annotation", featOkTst); - casFeatCode_parent = (null == casFeat_parent) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_parent).getCode(); - - - casFeat_lemma = jcas.getRequiredFeatureDE(casType, "lemma", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", featOkTst); - casFeatCode_lemma = (null == casFeat_lemma) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_lemma).getCode(); - - - casFeat_stem = jcas.getRequiredFeatureDE(casType, "stem", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem", featOkTst); - casFeatCode_stem = (null == casFeat_stem) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_stem).getCode(); - - - casFeat_pos = jcas.getRequiredFeatureDE(casType, "pos", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", featOkTst); - casFeatCode_pos = (null == casFeat_pos) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_pos).getCode(); - - - casFeat_morph = jcas.getRequiredFeatureDE(casType, "morph", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", featOkTst); - casFeatCode_morph = (null == casFeat_morph) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_morph).getCode(); - - - casFeat_id = jcas.getRequiredFeatureDE(casType, "id", "uima.cas.String", featOkTst); - casFeatCode_id = (null == casFeat_id) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_id).getCode(); - - - casFeat_form = jcas.getRequiredFeatureDE(casType, "form", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm", featOkTst); - casFeatCode_form = (null == casFeat_form) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_form).getCode(); - - - casFeat_syntacticFunction = jcas.getRequiredFeatureDE(casType, "syntacticFunction", "uima.cas.String", featOkTst); - casFeatCode_syntacticFunction = (null == casFeat_syntacticFunction) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_syntacticFunction).getCode(); - - } -} - - - - \ No newline at end of file diff --git a/dkpro-core-api-segmentation-asl/src/main/java/org/dkpro/core/api/segmentation/SegmenterBase.java b/dkpro-core-api-segmentation-asl/src/main/java/org/dkpro/core/api/segmentation/SegmenterBase.java new file mode 100644 index 0000000000..b3dec6cc01 --- /dev/null +++ b/dkpro-core-api-segmentation-asl/src/main/java/org/dkpro/core/api/segmentation/SegmenterBase.java @@ -0,0 +1,287 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.segmentation; + +import static org.apache.uima.fit.util.CasUtil.getType; +import static org.apache.uima.fit.util.CasUtil.select; + +import java.util.Iterator; +import java.util.Locale; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.parameter.ComponentParameters; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Base-class for segmenters. + */ +@Component(OperationType.SEGMENTER) +public abstract class SegmenterBase + extends JCasAnnotator_ImplBase +{ + /** + * A list of type names used for zoning. + */ + public final static String PARAM_ZONE_TYPES = "zoneTypes"; + @ConfigurationParameter(name = PARAM_ZONE_TYPES, mandatory = false, defaultValue = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div" }) + private String[] zoneTypes; + + /** + * Strict zoning causes the segmentation to be applied only within the boundaries of a zone + * annotation. This works only if a single zone type is specified (the zone annotations should + * NOT overlap) or if no zone type is specified - in which case the whole document is taken as a + * zone. If strict zoning is turned off, multiple zone types can be specified. A list of all + * zone boundaries (start and end) is created and segmentation happens between them. + */ + public final static String PARAM_STRICT_ZONING = "strictZoning"; + @ConfigurationParameter(name = PARAM_STRICT_ZONING, mandatory = true, defaultValue = "false") + private boolean strictZoning; + + /** + * The language. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + /** + * Create {@link Token} annotations. + */ + public static final String PARAM_WRITE_TOKEN = ComponentParameters.PARAM_WRITE_TOKEN; + @ConfigurationParameter(name = PARAM_WRITE_TOKEN, mandatory = true, defaultValue = "true") + private boolean writeToken; + + /** + * Create {@link TokenForm} annotations. + */ + public static final String PARAM_WRITE_FORM = ComponentParameters.PARAM_WRITE_FORM; + @ConfigurationParameter(name = PARAM_WRITE_FORM, mandatory = true, defaultValue = "true") + private boolean writeForm; + + /** + * Create {@link Sentence} annotations. + */ + public static final String PARAM_WRITE_SENTENCE = ComponentParameters.PARAM_WRITE_SENTENCE; + @ConfigurationParameter(name = PARAM_WRITE_SENTENCE, mandatory = true, defaultValue = "true") + private boolean writeSentence; + + public boolean isStrictZoning() + { + return strictZoning; + } + + public boolean isWriteSentence() + { + return writeSentence; + } + + public boolean isWriteToken() + { + return writeToken; + } + + public String[] getZoneTypes() + { + return zoneTypes; + } + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException + { + String text = jcas.getDocumentText(); + + String[] zones = getZoneTypes(); + if (isStrictZoning()) { + if (zones == null || zones.length == 0) { + process(jcas, text.substring(0, text.length()), 0); + } + else if (zones.length != 1) { + throw new AnalysisEngineProcessException( + new IllegalStateException("Strict zoning cannot use multiple zone types")); + } + else { + CAS cas = jcas.getCas(); + for (AnnotationFS zone : select(cas, getType(cas, zones[0]))) { + int[] adjusted = limit(text, zone.getBegin(), zone.getEnd()); + process(jcas, text.substring(adjusted[0], adjusted[1]), adjusted[0]); + } + } + } + else { + // This set collects all zone boundaries. + SortedSet boundarySet = new TreeSet(); + boundarySet.add(0); // Add start boundary + boundarySet.add(text.length()); // Add end boundary + + // If zoneTypes have been define then get the boundaries, otherwise we will + // simply have one big zone covering the whole document. + if (zones != null) { + // Iterate over all the zone indices and create sentences respecting + // the zone boundaries. If the zoneTypes overlap... well... bad luck! + for (String zoneName : zones) { + CAS cas = jcas.getCas(); + for (AnnotationFS zone : select(cas, getType(cas, zoneName))) { + int[] adjusted = limit(text, zone.getBegin(), zone.getEnd()); + boundarySet.add(adjusted[0]); + boundarySet.add(adjusted[1]); + } + } + } + + // Now process all zoneTypes. There will be at least two entries in the + // boundary set (see above). + Iterator bi = boundarySet.iterator(); + int begin = bi.next(); + while (bi.hasNext()) { + int end = bi.next(); + process(jcas, text.substring(begin, end), begin); + begin = end; + } + } + } + + /** + * Adjust the values in the two numeric arguments to be within the limits of the specified text. + * If the limits have to be adjusted, a warning is issued to the log. Illegal zone boundaries + * hint to a bug in the AE that produced the zone annotations. + * + * @param text + * the text. + * @param aBegin + * the zone begin. + * @param aEnd + * the zone end. + * @return reduced offsets. + */ + protected int[] limit(String text, int aBegin, int aEnd) + { + // checking to avoid out-of-bounds + int maxEnd = text.length(); + int begin = aBegin < 0 ? 0 : aBegin; + begin = begin > maxEnd ? maxEnd : begin; + + int end = aEnd < 0 ? 0 : aEnd; + end = end > maxEnd ? maxEnd : end; + + if (begin != aBegin || end != aEnd) { + getLogger().warn("Adjusted out-of-bounds zone [" + aBegin + "-" + aEnd + "] to [" + + begin + "-" + end + "]"); + } + + int[] offsets = { begin, end }; + return offsets; + } + + protected Sentence createSentence(final JCas aJCas, final int aBegin, final int aEnd) + { + int[] span = new int[] { aBegin, aEnd }; + TrimUtils.trim(aJCas.getDocumentText(), span); + if (!isEmpty(span[0], span[1]) && isWriteSentence()) { + Sentence seg = new Sentence(aJCas, span[0], span[1]); + seg.addToIndexes(aJCas); + return seg; + } + else { + return null; + } + } + + /** + * @deprecated use {@link #createToken(JCas, int, int)} + */ + @Deprecated + protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd, + final int aIndex) + { + return createToken(aJCas, null, aBegin, aEnd); + } + + protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd) + { + return createToken(aJCas, null, aBegin, aEnd); + } + + protected Token createToken(final JCas aJCas, final String aForm, final int aBegin, + final int aEnd) + { + int[] span = new int[] { aBegin, aEnd }; + TrimUtils.trim(aJCas.getDocumentText(), span); + if (!isEmpty(span[0], span[1]) && isWriteToken()) { + Token seg = new Token(aJCas, span[0], span[1]); + if (aForm != null && writeForm) { + seg.setText(aForm); + } + seg.addToIndexes(aJCas); + return seg; + } + else { + return null; + } + } + + protected abstract void process(JCas aJCas, String text, int zoneBegin) + throws AnalysisEngineProcessException; + + public boolean isEmpty(int aBegin, int aEnd) + { + return aBegin >= aEnd; + } + + public String getLanguage(JCas aJCas) + { + if (language != null) { + return language; + } + else { + return aJCas.getDocumentLanguage(); + } + } + + /** + * Get the locale from the parameter, then from the document if available. If no locale is set + * get the default locale from the VM. + * + * @param aJCas + * the JCas. + * @return the locale. + */ + public Locale getLocale(JCas aJCas) + { + String lang = getLanguage(aJCas); + if (lang != null) { + return new Locale(lang); + } + else { + return Locale.getDefault(); + } + } +} diff --git a/dkpro-core-api-segmentation-asl/src/main/java/org/dkpro/core/api/segmentation/package-info.java b/dkpro-core-api-segmentation-asl/src/main/java/org/dkpro/core/api/segmentation/package-info.java new file mode 100644 index 0000000000..25a3c8676b --- /dev/null +++ b/dkpro-core-api-segmentation-asl/src/main/java/org/dkpro/core/api/segmentation/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * API for tokenization and segmentation. + * + * @since 1.1.0 + */ +package org.dkpro.core.api.segmentation; diff --git a/dkpro-core-api-segmentation-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map b/dkpro-core-api-segmentation-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map new file mode 100644 index 0000000000..8d566f7883 --- /dev/null +++ b/dkpro-core-api-segmentation-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map @@ -0,0 +1,17 @@ +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound=http://w3id.org/meta-share/omtd-share/Compound +#de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div=http://w3id.org/meta-share/omtd-share/StructuralAnnotationType +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Document=http://w3id.org/meta-share/omtd-share/StructuralAnnotationType +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading=http://w3id.org/meta-share/omtd-share/StructuralAnnotationType +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma=http://w3id.org/meta-share/omtd-share/Lemma +#de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase +#de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme +#de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph=http://w3id.org/meta-share/omtd-share/Paragraph +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence=http://w3id.org/meta-share/omtd-share/Sentence +#de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem=http://w3id.org/meta-share/omtd-share/Stem +#de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord +#de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm +de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token=http://w3id.org/meta-share/omtd-share/Token +#de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm \ No newline at end of file diff --git a/dkpro-core-api-segmentation-asl/src/main/resources/desc/type/LexicalUnits_customized.xml b/dkpro-core-api-segmentation-asl/src/main/resources/desc/type/LexicalUnits_customized.xml index 1e9ec281c6..e74907c2cb 100644 --- a/dkpro-core-api-segmentation-asl/src/main/resources/desc/type/LexicalUnits_customized.xml +++ b/dkpro-core-api-segmentation-asl/src/main/resources/desc/type/LexicalUnits_customized.xml @@ -1,72 +1,151 @@ + Segmentation + + ${version} + Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt + + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound + This type represents a decompounding word, i.e.: flowerpot. Each Compound one have at least two Splits. + uima.tcas.Annotation + + + splits + A word that can be decomposed into different parts. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + <p>Token is one of the two types commonly produced by a segmenter (the other being Sentence). A Token usually represents a word, although it may be used to represent multiple tightly connected words (e.g. "New York") or parts of a word (e.g. the possessive "'s"). One may choose to split compound words into multiple tokens, e.g. ("CamelCase" -&gt; "Camel", "Case"; "Zauberstab" -&gt; "Zauber", "stab"). Most processing components operate on Tokens, usually within the limits of the surrounding Sentence. E.g. a part-of-speech tagger analyses each Token in a Sentence and assigns a part-of-speech to each Token.</p> + uima.tcas.Annotation + + + parent + the parent of this token. This feature is meant to be used in when the token participates in a constituency parse and then refers to a constituent containing this token. The type of this feature is {@link Annotation} to avoid adding a dependency on the syntax API module. + uima.tcas.Annotation + + + lemma + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + + stem + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + + pos + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + morph + The morphological feature associated with this token. + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + + + id + If this unit had an ID in the source format from which it was imported, it may be stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it should be respected by writers. + uima.cas.String + + + form + Potentially normalized form of the token text that should be used instead of the covered text if set. + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm + + + syntacticFunction + + uima.cas.String + + + + + + order + + Disambiguates the token order for tokens which have the same offsets, e.g. when the contraction "à" is analyzed as two tokens "a" and "a". + + uima.cas.Integer + + + + diff --git a/dkpro-core-api-segmentation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/CompoundTest.java b/dkpro-core-api-segmentation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/CompoundTest.java deleted file mode 100644 index c17a3ccb3e..0000000000 --- a/dkpro-core-api-segmentation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/CompoundTest.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.segmentation.type; - -import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.assertThat; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UIMAException; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.fit.util.FSCollectionFactory; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Before; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound.CompoundSplitLevel; - -public class CompoundTest -{ - - private Compound compound; - - @Before - public void setUpCompound() - throws UIMAException - { - final JCas jcas = JCasFactory.createJCas(); - final JCasBuilder jcasBuilder = new JCasBuilder(jcas); - final int beginPosition = jcasBuilder.getPosition(); - final CompoundPart getrank = jcasBuilder.add("getränk", CompoundPart.class); - final int secondPosition = jcasBuilder.getPosition(); - final CompoundPart auto = jcasBuilder.add("auto", CompoundPart.class); - final CompoundPart mat = jcasBuilder.add("mat", CompoundPart.class); - final CompoundPart automat = new CompoundPart(jcas, secondPosition, - jcasBuilder.getPosition()); - final List splits = new ArrayList(); - splits.add(auto); - splits.add(mat); - automat.setSplits(FSCollectionFactory.createFSArray(jcas, splits)); - automat.addToIndexes(); - compound = new Compound(jcas, beginPosition, jcasBuilder.getPosition()); - splits.clear(); - splits.add(getrank); - splits.add(automat); - compound.setSplits(FSCollectionFactory.createFSArray(jcas, splits)); - compound.addToIndexes(); - jcasBuilder.close(); - - } - - @Test - public void testAll() - throws UIMAException - { - - final String[] splitsList = new String[] { "getränk", "automat", "auto", "mat" }; - assertThat(coveredTextArrayFromAnnotations(compound.getSplitsWithoutMorpheme(CompoundSplitLevel.ALL)), - is(splitsList)); - - } - - @Test - public void testLowest() - throws UIMAException - { - - final String[] splitsList = new String[] { "getränk", "auto", "mat" }; - assertThat(coveredTextArrayFromAnnotations(compound.getSplitsWithoutMorpheme(CompoundSplitLevel.LOWEST)), - is(splitsList)); - - } - - @Test - public void testHighest() - throws UIMAException - { - - final String[] splitsList = new String[] { "getränk", "automat" }; - assertThat(coveredTextArrayFromAnnotations(compound.getSplitsWithoutMorpheme(CompoundSplitLevel.HIGHEST)), - is(splitsList)); - - } - - @Test - public void testNone() - throws UIMAException - { - - final String[] splitsList = new String[] { }; - assertThat(coveredTextArrayFromAnnotations(compound.getSplitsWithoutMorpheme(CompoundSplitLevel.NONE)), - is(splitsList)); - - } - - public String[] coveredTextArrayFromAnnotations(final T[] annotations) - { - final List list = new ArrayList(); - for (T annotation : annotations) { - list.add(annotation.getCoveredText()); - } - return list.toArray(new String[list.size()]); - } - -} diff --git a/dkpro-core-api-segmentation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/TrimTest.java b/dkpro-core-api-segmentation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/TrimTest.java deleted file mode 100644 index b4ca21701e..0000000000 --- a/dkpro-core-api-segmentation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/segmentation/type/TrimTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.segmentation.type; - -import static org.junit.Assert.assertArrayEquals; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; - -public class TrimTest -{ - @Test - public void testSingleCharacter() - { - assertTrim(".", new int[] {0, 1}, new int[] {0, 1}); - } - - @Test - public void testLeadingWhitespace() - { - assertTrim(" \t\n\r.", new int[] {0, 5}, new int[] {4, 5}); - } - - @Test - public void testTrailingWhitespace() - { - assertTrim(". \n\r\t", new int[] {0, 5}, new int[] {0, 1}); - } - - @Test - public void testLeadingTrailingWhitespace() - { - assertTrim(" \t\n\r. \n\r\t", new int[] {0, 9}, new int[] {4, 5}); - } - - @Test - public void testBlankString() - { - assertTrim(" ", new int[] {1, 2}, new int[] {1, 1}); - } - - private void assertTrim(String aText, int[] aStart, int[] aExpected) - { - int[] span = { aStart[0], aStart[1] }; - SegmenterBase.trim(aText, span); - assertArrayEquals(aExpected, span); - } -} diff --git a/dkpro-core-api-segmentation-asl/src/test/java/org/dkpro/core/api/segmentation/CompoundTest.java b/dkpro-core-api-segmentation-asl/src/test/java/org/dkpro/core/api/segmentation/CompoundTest.java new file mode 100644 index 0000000000..64704d52ce --- /dev/null +++ b/dkpro-core-api-segmentation-asl/src/test/java/org/dkpro/core/api/segmentation/CompoundTest.java @@ -0,0 +1,105 @@ +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.segmentation; + +import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound.CompoundSplitLevel.ALL; +import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound.CompoundSplitLevel.HIGHEST; +import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound.CompoundSplitLevel.LOWEST; +import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound.CompoundSplitLevel.NONE; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.FSCollectionFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.junit.Before; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split; + +public class CompoundTest +{ + private Compound compound; + + @Before + public void setUpCompound() throws UIMAException + { + final JCas jcas = JCasFactory.createJCas(); + final JCasBuilder jcasBuilder = new JCasBuilder(jcas); + final int beginPosition = jcasBuilder.getPosition(); + final CompoundPart getrank = jcasBuilder.add("getränk", CompoundPart.class); + final int secondPosition = jcasBuilder.getPosition(); + final CompoundPart auto = jcasBuilder.add("auto", CompoundPart.class); + final CompoundPart mat = jcasBuilder.add("mat", CompoundPart.class); + final CompoundPart automat = new CompoundPart(jcas, secondPosition, + jcasBuilder.getPosition()); + final List splits = new ArrayList(); + splits.add(auto); + splits.add(mat); + automat.setSplits(FSCollectionFactory.createFSArray(jcas, splits)); + automat.addToIndexes(); + compound = new Compound(jcas, beginPosition, jcasBuilder.getPosition()); + splits.clear(); + splits.add(getrank); + splits.add(automat); + compound.setSplits(FSCollectionFactory.createFSArray(jcas, splits)); + compound.addToIndexes(); + jcasBuilder.close(); + } + + @Test + public void testAll() throws UIMAException + { + assertThat(compound.getSplitsWithoutMorpheme(ALL)) + .extracting(Annotation::getCoveredText) + .containsExactly("getränk", "automat", "auto", "mat"); + } + + @Test + public void testLowest() throws UIMAException + { + assertThat(compound.getSplitsWithoutMorpheme(LOWEST)) + .extracting(Annotation::getCoveredText) + .containsExactly("getränk", "auto", "mat"); + + } + + @Test + public void testHighest() throws UIMAException + { + assertThat(compound.getSplitsWithoutMorpheme(HIGHEST)) + .extracting(Annotation::getCoveredText) + .containsExactly("getränk", "automat"); + + } + + @Test + public void testNone() throws UIMAException + { + assertThat(compound.getSplitsWithoutMorpheme(NONE)) + .extracting(Annotation::getCoveredText) + .isEmpty(); + } +} diff --git a/dkpro-core-api-segmentation-asl/src/test/java/org/dkpro/core/api/segmentation/TrimUtilsTest.java b/dkpro-core-api-segmentation-asl/src/test/java/org/dkpro/core/api/segmentation/TrimUtilsTest.java new file mode 100644 index 0000000000..ac201514df --- /dev/null +++ b/dkpro-core-api-segmentation-asl/src/test/java/org/dkpro/core/api/segmentation/TrimUtilsTest.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.segmentation; + +import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils.trim; +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.Test; + +public class TrimUtilsTest +{ + @Test + public void thatEmptySpanIsTrimmedToEmptySpan() + { + int[] span = new int[] { 2, 2 }; + trim(" ", span); + assertThat(span).containsExactly(2, 2); + } + + @Test + public void thatSpanIsTrimmedToEmptySpanStartingAtOriginalStart() + { + int[] span = new int[] { 2, 3 }; + trim(" ", span); + assertThat(span).containsExactly(2, 2); + } + + @Test + public void thatLeadingAndTrailingWhitespaceIsRemoved() + { + int[] span = new int[] { 0, 4 }; + trim(" ab ", span); + assertThat(span).containsExactly(1, 3); + } + + @Test + public void thatInnerWhitespaceIsRemoved1() + { + int[] span = new int[] { 0, 2 }; + trim(" a b ", span); + assertThat(span).containsExactly(1, 2); + } + + @Test + public void thatInnerWhitespaceIsRemoved2() + { + int[] span = new int[] { 2, 5 }; + trim(" a b ", span); + assertThat(span).containsExactly(3, 4); + } + + @Test + public void testSingleCharacter() + { + int[] span = { 0, 1 }; + trim(".", span); + assertThat(span).containsExactly(0, 1); + } + + @Test + public void testLeadingWhitespace() + { + int[] span = { 0, 5 }; + trim(" \t\n\r.", span); + assertThat(span).containsExactly(4, 5); + } + + @Test + public void testTrailingWhitespace() + { + int[] span = { 0, 5 }; + trim(". \n\r\t", span); + assertThat(span).containsExactly(0, 1); + } + + @Test + public void testLeadingTrailingWhitespace() + { + int[] span = { 0, 9 }; + trim(" \t\n\r. \n\r\t", span); + assertThat(span).containsExactly(4, 5); + } + + @Test + public void testBlankString() + { + int[] span = { 1, 2 }; + trim(" ", span); + assertThat(span).containsExactly(1, 1); + } +} diff --git a/dkpro-core-api-segmentation-asl/suppressions.xml b/dkpro-core-api-segmentation-asl/suppressions.xml new file mode 100644 index 0000000000..bad8402c63 --- /dev/null +++ b/dkpro-core-api-segmentation-asl/suppressions.xml @@ -0,0 +1,10 @@ + + + + + + + + diff --git a/dkpro-core-api-semantics-asl/pom.xml b/dkpro-core-api-semantics-asl/pom.xml index 5ccbec0b35..77bdca7dbb 100644 --- a/dkpro-core-api-semantics-asl/pom.xml +++ b/dkpro-core-api-semantics-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.api.semantics-asl + dkpro-core-api-semantics-asl jar DKPro Core ASL - Semantics API + https://dkpro.github.io/dkpro-core/ org.apache.uima diff --git a/dkpro-core-api-semantics-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map b/dkpro-core-api-semantics-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map new file mode 100644 index 0000000000..7da1b0f75f --- /dev/null +++ b/dkpro-core-api-semantics-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map @@ -0,0 +1,2 @@ +de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg=http://w3id.org/meta-share/omtd-share/SemanticAnnotationType +de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred=http://w3id.org/meta-share/omtd-share/SemanticFrame diff --git a/dkpro-core-api-semantics-asl/suppressions.xml b/dkpro-core-api-semantics-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-semantics-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-sentiment-asl/pom.xml b/dkpro-core-api-sentiment-asl/pom.xml index 79dbad4bee..263d8b0a3b 100644 --- a/dkpro-core-api-sentiment-asl/pom.xml +++ b/dkpro-core-api-sentiment-asl/pom.xml @@ -15,26 +15,26 @@ See the License for the specific language governing permissions and limitations under the License. --> - + 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.api.sentiment-asl + dkpro-core-api-sentiment-asl jar DKPro Core ASL - Sentiment API + https://dkpro.github.io/dkpro-core/ org.apache.uima uimaj-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl @@ -65,7 +65,7 @@ - Maven doesn't detect the parameters module to be used because we - only use the XML type descriptors from it, not any actual Java code. --> - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core:dkpro-core-api-segmentation-asl diff --git a/dkpro-core-api-sentiment-asl/suppressions.xml b/dkpro-core-api-sentiment-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-sentiment-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-structure-asl/pom.xml b/dkpro-core-api-structure-asl/pom.xml index 94b8665469..de29d4420a 100644 --- a/dkpro-core-api-structure-asl/pom.xml +++ b/dkpro-core-api-structure-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.api.structure-asl + dkpro-core-api-structure-asl jar DKPro Core ASL - Structured and Semi-Structured Data API + https://dkpro.github.io/dkpro-core/ org.apache.uima diff --git a/dkpro-core-api-structure-asl/suppressions.xml b/dkpro-core-api-structure-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-structure-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-syntax-asl/pom.xml b/dkpro-core-api-syntax-asl/pom.xml index 7a29529154..241b892850 100644 --- a/dkpro-core-api-syntax-asl/pom.xml +++ b/dkpro-core-api-syntax-asl/pom.xml @@ -1,72 +1,73 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl - jar - DKPro Core ASL - Syntax API - - - org.apache.uima - uimaj-core - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl - - - junit - junit - test - - - commons-io - commons-io - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl - test - - - - - - false - src/main/resources - - desc/type/**/* - - - - true - src/main/resources - - desc/type/**/* - - - - + 4.0.0 + + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-api-syntax-asl + jar + DKPro Core ASL - Syntax API + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + junit + junit + test + + + commons-io + commons-io + test + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + + + + false + src/main/resources + + desc/type/**/* + + + + true + src/main/resources + + desc/type/**/* + + + + diff --git a/dkpro-core-api-syntax-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map b/dkpro-core-api-syntax-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map new file mode 100644 index 0000000000..26400b9aba --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map @@ -0,0 +1,4 @@ +de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk=http://w3id.org/meta-share/omtd-share/Chunk +de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent=http://w3id.org/meta-share/omtd-share/Constituent +de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency=http://w3id.org/meta-share/omtd-share/Dependency + diff --git a/dkpro-core-api-syntax-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt b/dkpro-core-api-syntax-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt index 7ad9385338..24b884199d 100644 --- a/dkpro-core-api-syntax-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt +++ b/dkpro-core-api-syntax-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt @@ -2,4 +2,3 @@ classpath*:desc/type/Chunks.xml classpath*:desc/type/Constituency.xml classpath*:desc/type/Dependency.xml classpath*:desc/type/PennTree.xml -classpath*:desc/type/Tag.xml diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/bg-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/bg-default-constituency.map deleted file mode 100644 index 7b2f597232..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/bg-default-constituency.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/bg-btb-constituency.map -__META_OVERRIDE__.chunk.tagset=btb diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-chunk.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-chunk.map deleted file mode 100644 index c081982f0a..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-chunk.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tt-chunk.map -__META_OVERRIDE__.chunk.tagset=tt diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-constituency.map deleted file mode 100644 index abb4a0066a..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-constituency.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tiger-constituency.map -__META_OVERRIDE__.chunk.tagset=negra diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-dependency.map deleted file mode 100644 index dbff1d1612..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-default-dependency.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tiger-dependency.map -__META_OVERRIDE__.chunk.tagset=negra diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-chunk.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-chunk.map deleted file mode 100644 index d8b301ca6f..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-chunk.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-conll2000-chunk.map -__META_OVERRIDE__.chunk.tagset=conll2000 diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-constituency.map deleted file mode 100644 index 345be6461d..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-constituency.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-ptb-constituency.map -__META_OVERRIDE__.chunk.tagset=ptb diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-dependency.map deleted file mode 100644 index be96448314..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-default-dependency.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-stanford341-dependency.map -__META_OVERRIDE__.chunk.tagset=stanford341 diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-default-constituency.map deleted file mode 100644 index 0912421c71..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-default-constituency.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-ftb-constituency.map -__META_OVERRIDE__.chunk.tagset=ftb diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-default-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-default-dependency.map deleted file mode 100644 index 1d614eedca..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-default-dependency.map +++ /dev/null @@ -1,2 +0,0 @@ -__META_REDIRECT__=classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-stanford341-dependency.map -__META_OVERRIDE__.chunk.tagset=stanford341 diff --git a/dkpro-core-api-syntax-asl/src/main/resources/desc/type/Tag.xml b/dkpro-core-api-syntax-asl/src/main/resources/desc/type/Tag.xml deleted file mode 100644 index f3e3daf86e..0000000000 --- a/dkpro-core-api-syntax-asl/src/main/resources/desc/type/Tag.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - Tag - - ${version} - Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt - - - de.tudarmstadt.ukp.dkpro.core.api.syntax.type.Tag - A multi purpose tag - uima.tcas.Annotation - - - value - - uima.cas.String - - - - - diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/ar-atb-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/ar-atb-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/ar-atb-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/ar-atb-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/bg-btb-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/bg-btb-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/bg-btb-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/bg-btb-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/bg-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/bg-default-constituency.map new file mode 100644 index 0000000000..a87e29984a --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/bg-default-constituency.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/bg-btb-constituency.map +__META_OVERRIDE__.chunk.tagset=btb diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-cdg-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-cdg-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-cdg-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-cdg-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-chunk.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-chunk.map new file mode 100644 index 0000000000..d88e69cfde --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-chunk.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/de-tt-chunk.map +__META_OVERRIDE__.chunk.tagset=tt diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-constituency.map new file mode 100644 index 0000000000..6454db41ea --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-constituency.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/de-tiger-constituency.map +__META_OVERRIDE__.chunk.tagset=negra diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-dependency.map new file mode 100644 index 0000000000..4e84e777ea --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-default-dependency.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/de-tiger-dependency.map +__META_OVERRIDE__.chunk.tagset=negra diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-negra-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-negra-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-negra-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-negra-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-negra-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-negra-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-negra-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-negra-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tiger-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-tiger-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tiger-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-tiger-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tiger-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-tiger-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tiger-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-tiger-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tt-chunk.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-tt-chunk.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tt-chunk.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-tt-chunk.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tueba-versley-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-tueba-versley-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-tueba-versley-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-tueba-versley-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-universal-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-universal-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/de-universal-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/de-universal-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-conll-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-conll-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-conll-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-conll-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-conll2000-chunk.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-conll2000-chunk.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-conll2000-chunk.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-conll2000-chunk.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-conll2008-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-conll2008-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-conll2008-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-conll2008-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-chunk.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-chunk.map new file mode 100644 index 0000000000..9c1b0ea0b2 --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-chunk.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/en-conll2000-chunk.map +__META_OVERRIDE__.chunk.tagset=conll2000 diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-constituency.map new file mode 100644 index 0000000000..ed3c48b373 --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-constituency.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/en-ptb-constituency.map +__META_OVERRIDE__.chunk.tagset=ptb diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-dependency.map new file mode 100644 index 0000000000..2f1d9b4041 --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-default-dependency.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/en-stanford341-dependency.map +__META_OVERRIDE__.chunk.tagset=stanford341 diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-emory-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-emory-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-emory-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-emory-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-ptb-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-ptb-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-ptb-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-ptb-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-stanford-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-stanford-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-stanford-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-stanford-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-stanford331-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-stanford331-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-stanford331-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-stanford331-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-stanford341-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-stanford341-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-stanford341-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-stanford341-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-tt-chunk.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-tt-chunk.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-tt-chunk.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-tt-chunk.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-universal-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-universal-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-universal-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/en-universal-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/es-ancora-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/es-ancora-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/es-ancora-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/es-ancora-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/es-ancora-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/es-ancora-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/es-ancora-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/es-ancora-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/es-iula-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/es-iula-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/es-iula-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/es-iula-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-default-constituency.map new file mode 100644 index 0000000000..1c73a85e1c --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-default-constituency.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/fr-ftb-constituency.map +__META_OVERRIDE__.chunk.tagset=ftb diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-ftb-chunk.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-ftb-chunk.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-ftb-chunk.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-ftb-chunk.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-ftb-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-ftb-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-ftb-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-ftb-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-ftb-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-ftb-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-ftb-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-ftb-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-universal-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-universal-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/fr-universal-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/fr-universal-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/no-universal-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/no-universal-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/no-universal-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/no-universal-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/sv-stb-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/sv-stb-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/sv-stb-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/sv-stb-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-conll-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-conll-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-conll-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-conll-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-conll2008-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-conll2008-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-conll2008-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-conll2008-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-ctb-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-ctb-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-ctb-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-ctb-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-default-constituency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-default-constituency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-default-constituency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-default-constituency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-default-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-default-dependency.map new file mode 100644 index 0000000000..6c7e277fb8 --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-default-dependency.map @@ -0,0 +1,2 @@ +__META_REDIRECT__=classpath:/org/dkpro/core/api/syntax/tagset/zh-stanford341-dependency.map +__META_OVERRIDE__.chunk.tagset=stanford341 diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-stanford341-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-stanford341-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-stanford341-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-stanford341-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-universal-dependency.map b/dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-universal-dependency.map similarity index 100% rename from dkpro-core-api-syntax-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/zh-universal-dependency.map rename to dkpro-core-api-syntax-asl/src/main/resources/org/dkpro/core/api/syntax/tagset/zh-universal-dependency.map diff --git a/dkpro-core-api-syntax-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/MappingsTest.java b/dkpro-core-api-syntax-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/MappingsTest.java deleted file mode 100644 index 286a5fcc65..0000000000 --- a/dkpro-core-api-syntax-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/MappingsTest.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.syntax.tagset; - -import static org.junit.Assert.assertFalse; - -import java.io.File; -import java.io.IOException; -import java.util.Collection; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.filefilter.TrueFileFilter; -import org.apache.commons.io.filefilter.WildcardFileFilter; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; - -public class MappingsTest -{ - @Test - public void testMappings() throws Exception - { - Collection files = FileUtils.listFiles( - new File("src/main/resources/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset"), - new WildcardFileFilter("*.map"), - TrueFileFilter.TRUE); - - assertTagsetMapping(files); - } - - public static void assertTagsetMapping(Collection files) - throws IOException - { - for (File file : files) { - boolean failure = false; - System.out.printf("== %s ==%n", file.getName()); - MappingProvider mappingProvider = new MappingProvider(); - mappingProvider.setDefault(MappingProvider.LOCATION, file.toURI().toURL().toString()); - mappingProvider.configure(); - for (String tag : mappingProvider.getTags()) { - String typeName = mappingProvider.getTagTypeName(tag); - try { - Class.forName(typeName); - } - catch (Throwable e) { - System.out.printf("%s FAILED: %s %n", tag, e.getMessage()); - failure = true; - } - } - assertFalse(failure); - } - } -} diff --git a/dkpro-core-api-syntax-asl/src/test/java/org/dkpro/core/api/syntax/tagset/MappingsTest.java b/dkpro-core-api-syntax-asl/src/test/java/org/dkpro/core/api/syntax/tagset/MappingsTest.java new file mode 100644 index 0000000000..17d45a3786 --- /dev/null +++ b/dkpro-core-api-syntax-asl/src/test/java/org/dkpro/core/api/syntax/tagset/MappingsTest.java @@ -0,0 +1,67 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.syntax.tagset; + +import static org.junit.Assert.assertFalse; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.filefilter.TrueFileFilter; +import org.apache.commons.io.filefilter.WildcardFileFilter; +import org.dkpro.core.api.resources.MappingProvider; +import org.junit.Test; + +public class MappingsTest +{ + @Test + public void testMappings() throws Exception + { + Collection files = FileUtils.listFiles( + new File("src/main/resources/org/dkpro/core/api/syntax/tagset"), + new WildcardFileFilter("*.map"), + TrueFileFilter.TRUE); + + assertTagsetMapping(files); + } + + public static void assertTagsetMapping(Collection files) + throws IOException + { + for (File file : files) { + boolean failure = false; + System.out.printf("== %s ==%n", file.getName()); + MappingProvider mappingProvider = new MappingProvider(); + mappingProvider.setDefault(MappingProvider.LOCATION, file.toURI().toURL().toString()); + mappingProvider.configure(); + for (String tag : mappingProvider.getTags()) { + String typeName = mappingProvider.getTagTypeName(tag); + try { + Class.forName(typeName); + } + catch (Throwable e) { + System.out.printf("%s FAILED: %s %n", tag, e.getMessage()); + failure = true; + } + } + assertFalse(failure); + } + } +} diff --git a/dkpro-core-api-syntax-asl/suppressions.xml b/dkpro-core-api-syntax-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-syntax-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-transform-asl/pom.xml b/dkpro-core-api-transform-asl/pom.xml index 35fe670baa..adaeaf81ec 100644 --- a/dkpro-core-api-transform-asl/pom.xml +++ b/dkpro-core-api-transform-asl/pom.xml @@ -1,53 +1,54 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.api.transform-asl - jar - DKPro Core ASL - CAS Transformation API - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimafit-core - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl - test - - - junit - junit - test - - + 4.0.0 + + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-api-transform-asl + jar + DKPro Core ASL - CAS Transformation API + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.dkpro.core + dkpro-core-api-metadata-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + junit + junit + test + + diff --git a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/ImmutableInterval.java b/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/ImmutableInterval.java deleted file mode 100644 index b03af472fa..0000000000 --- a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/ImmutableInterval.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2008 - * Richard Eckart de Castilho - * Institut für Sprach- und Literaturwissenschaft - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.transform.alignment; - -public -class ImmutableInterval -extends AbstractInterval -{ - private final int start; - private final int end; - - /** - * Copy constructor. - * - * @param interval the original interval. - */ - public - ImmutableInterval( - final Interval interval) - { - this(interval.getStart(), interval.getEnd()); - } - - /** - * Constructor. - * - * @param s start offset. - * @param e end offset. - */ - public - ImmutableInterval( - final int s, - final int e) - { - start = Math.min(s, e); - end = Math.max(s, e); - } - - @Override - public - int getStart() - { - return start; - } - - @Override - public - int getEnd() - { - return end; - } -} diff --git a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/WeakHashSet.java b/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/WeakHashSet.java deleted file mode 100644 index adfd0bfb25..0000000000 --- a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/WeakHashSet.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2008 - * Richard Eckart de Castilho - * Institut für Sprach- und Literaturwissenschaft - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.transform.alignment; - -import java.util.AbstractSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.WeakHashMap; - -public -class WeakHashSet -extends AbstractSet -implements Iterable, Set -{ - private final static Object present = new Object(); - - private final Map data = new WeakHashMap(); - - @Override - public - boolean add( - final E o) - { - final int beforeSize = size(); - data.put(o, present); - return beforeSize != size(); - } - - @Override - public - boolean remove( - final Object o) - { - return data.remove(o) != null; - } - - @Override - public boolean contains(Object o) - { - return data.containsKey(o); - } - - @Override - public - Iterator iterator() - { - return data.keySet().iterator(); - } - - @Override - public - int size() - { - return data.size(); - } -} diff --git a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/package-info.java b/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/package-info.java deleted file mode 100644 index 4e5bf95d5c..0000000000 --- a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2008 - * Richard Eckart de Castilho - * Institut für Sprach- und Literaturwissenschaft - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Classes for tracking changes in a String. These have been originally developed as part of - * AnnoLab within the DFG-Projekt "Linguistische Profile interdisziplinärer Register" (LingPro) - * (10.2006 - 09.2009). - */ -package de.tudarmstadt.ukp.dkpro.core.api.transform.alignment; diff --git a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformerChangeBased_ImplBase.java b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/JCasTransformerChangeBased_ImplBase.java similarity index 96% rename from dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformerChangeBased_ImplBase.java rename to dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/JCasTransformerChangeBased_ImplBase.java index 5984a0be87..b13b989411 100644 --- a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformerChangeBased_ImplBase.java +++ b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/JCasTransformerChangeBased_ImplBase.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.transform; +package org.dkpro.core.api.transform; import static org.apache.uima.fit.util.CasUtil.getType; import static org.apache.uima.fit.util.CasUtil.selectFS; @@ -35,10 +35,9 @@ import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.jcas.JCas; import org.apache.uima.util.CasCopier; - -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.ImmutableInterval; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.Interval; +import org.dkpro.core.api.transform.alignment.AlignedString; +import org.dkpro.core.api.transform.alignment.ImmutableInterval; +import org.dkpro.core.api.transform.alignment.Interval; /** * Base-class for normalizers that do insert/delete/replace operations. Please mind that these diff --git a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformer_ImplBase.java b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/JCasTransformer_ImplBase.java similarity index 98% rename from dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformer_ImplBase.java rename to dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/JCasTransformer_ImplBase.java index 3a6eefc9e7..975798dde4 100644 --- a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformer_ImplBase.java +++ b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/JCasTransformer_ImplBase.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.transform; +package org.dkpro.core.api.transform; import static org.apache.uima.fit.util.CasUtil.getType; import static org.apache.uima.fit.util.CasUtil.selectFS; @@ -34,6 +34,9 @@ import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +/** + * Base-class for transformers. + */ public abstract class JCasTransformer_ImplBase extends JCasMultiplier_ImplBase { diff --git a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AbstractInterval.java b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/AbstractInterval.java similarity index 97% rename from dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AbstractInterval.java rename to dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/AbstractInterval.java index 60ed8287b7..a16787bba3 100644 --- a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AbstractInterval.java +++ b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/AbstractInterval.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.transform.alignment; +package org.dkpro.core.api.transform.alignment; import java.util.Collection; @@ -66,7 +66,7 @@ public boolean overlaps(final Interval i) return (((i.getStart() <= getStart()) && (getStart() < i.getEnd())) || // Case 1-3 ((i.getStart() < getEnd()) && (getEnd() <= i.getEnd())) || // Case 1-3 - ((getStart() <= i.getStart()) && (i.getEnd() <= getEnd()))); // Case 4 + ((getStart() <= i.getStart()) && (i.getEnd() <= getEnd()))); // Case 4 } @Override diff --git a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AlignedString.java b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/AlignedString.java similarity index 96% rename from dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AlignedString.java rename to dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/AlignedString.java index f4f4b858a5..10f68654a3 100644 --- a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AlignedString.java +++ b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/AlignedString.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.transform.alignment; +package org.dkpro.core.api.transform.alignment; import java.util.ArrayList; import java.util.Collection; @@ -1105,54 +1105,53 @@ public String toString() return sb.toString(); } } -} - -/** - * DataSegment iterator. - * - */ -class DataSegmentIterator - implements Iterator -{ - private final boolean _includeAll; - - private AlignedString.AbstractDataSegment _next = null; - - public DataSegmentIterator(final AlignedString.AbstractDataSegment first) + + /** + * DataSegment iterator. + */ + private static class DataSegmentIterator + implements Iterator { - _next = first; - _includeAll = false; - } + private final boolean _includeAll; - public DataSegmentIterator(final AlignedString.AbstractDataSegment first, - final boolean includeAll) - { - _next = first; - _includeAll = includeAll; - } + private AlignedString.AbstractDataSegment _next = null; - @Override - public boolean hasNext() - { - return _next != null; - } + public DataSegmentIterator(final AlignedString.AbstractDataSegment first) + { + _next = first; + _includeAll = false; + } - @Override - public AlignedString.DataSegment next() - { - final AlignedString.DataSegment result = _next; - if (_includeAll) { - _next = _next._next; + public DataSegmentIterator(final AlignedString.AbstractDataSegment first, + final boolean includeAll) + { + _next = first; + _includeAll = includeAll; } - else { - _next = _next.getNext(); + + @Override + public boolean hasNext() + { + return _next != null; } - return result; - } - @Override - public void remove() - { - throw new UnsupportedOperationException(); + @Override + public AlignedString.DataSegment next() + { + final AlignedString.DataSegment result = _next; + if (_includeAll) { + _next = _next._next; + } + else { + _next = _next.getNext(); + } + return result; + } + + @Override + public void remove() + { + throw new UnsupportedOperationException(); + } } } diff --git a/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/ImmutableInterval.java b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/ImmutableInterval.java new file mode 100644 index 0000000000..60751ed7cf --- /dev/null +++ b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/ImmutableInterval.java @@ -0,0 +1,63 @@ +/* + * Copyright 2008 + * Richard Eckart de Castilho + * Institut für Sprach- und Literaturwissenschaft + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.transform.alignment; + +public class ImmutableInterval + extends AbstractInterval +{ + private final int start; + private final int end; + + /** + * Copy constructor. + * + * @param interval + * the original interval. + */ + public ImmutableInterval(final Interval interval) + { + this(interval.getStart(), interval.getEnd()); + } + + /** + * Constructor. + * + * @param s + * start offset. + * @param e + * end offset. + */ + public ImmutableInterval(final int s, final int e) + { + start = Math.min(s, e); + end = Math.max(s, e); + } + + @Override + public int getStart() + { + return start; + } + + @Override + public int getEnd() + { + return end; + } +} diff --git a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/Interval.java b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/Interval.java similarity index 98% rename from dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/Interval.java rename to dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/Interval.java index 329489f2e5..63349bae14 100644 --- a/dkpro-core-api-transform-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/Interval.java +++ b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/Interval.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.transform.alignment; +package org.dkpro.core.api.transform.alignment; import java.util.Comparator; diff --git a/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/WeakHashSet.java b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/WeakHashSet.java new file mode 100644 index 0000000000..e3b23f18a8 --- /dev/null +++ b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/WeakHashSet.java @@ -0,0 +1,66 @@ +/* + * Copyright 2008 + * Richard Eckart de Castilho + * Institut für Sprach- und Literaturwissenschaft + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.transform.alignment; + +import java.util.AbstractSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.WeakHashMap; + +public class WeakHashSet + extends AbstractSet + implements Iterable, Set +{ + private final static Object present = new Object(); + + private final Map data = new WeakHashMap(); + + @Override + public boolean add(final E o) + { + final int beforeSize = size(); + data.put(o, present); + return beforeSize != size(); + } + + @Override + public boolean remove(final Object o) + { + return data.remove(o) != null; + } + + @Override + public boolean contains(Object o) + { + return data.containsKey(o); + } + + @Override + public Iterator iterator() + { + return data.keySet().iterator(); + } + + @Override + public int size() + { + return data.size(); + } +} diff --git a/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/package-info.java b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/package-info.java new file mode 100644 index 0000000000..248996dc81 --- /dev/null +++ b/dkpro-core-api-transform-asl/src/main/java/org/dkpro/core/api/transform/alignment/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2008 + * Richard Eckart de Castilho + * Institut für Sprach- und Literaturwissenschaft + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Classes for tracking changes in a String. These have been originally developed as part of + * AnnoLab within the DFG-Projekt "Linguistische Profile interdisziplinärer Register" (LingPro) + * (10.2006 - 09.2009). + */ +package org.dkpro.core.api.transform.alignment; diff --git a/dkpro-core-api-transform-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AlignedStringTest.java b/dkpro-core-api-transform-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AlignedStringTest.java deleted file mode 100644 index 47656a95be..0000000000 --- a/dkpro-core-api-transform-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/transform/alignment/AlignedStringTest.java +++ /dev/null @@ -1,482 +0,0 @@ -/* - * Copyright 2008 - * Richard Eckart de Castilho - * Institut für Sprach- und Literaturwissenschaft - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.api.transform.alignment; - -import static org.junit.Assert.assertEquals; - -import org.junit.After; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.ImmutableInterval; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.Interval; - -public -class AlignedStringTest -{ - private String baseString; - private AlignedString bottom; - private AlignedString top; - - - @Before - public - void setUp() - throws Exception - { - // 11 - // 012345678901 - baseString = "I am a test."; - bottom = new AlignedString(baseString); - top = new AlignedString(bottom); - - System.out.println("-------------------------------------------"); - } - - @After - public - void after() - { - System.out.println(" 1 | 2 | 3 | 4 | 5"); - System.out.println(" 012345678901234567890123456789012345678901234567890"); - System.out.println("Base : "+baseString); - System.out.println("Bottom : "+bottom.get()+" - "+bottom.dataSegmentsToString()); - System.out.println("Top : "+top.get()+" - "+top.dataSegmentsToString()); - } - - @Test - public - void testGet() - { - assertEquals(baseString, top.get()); - } - - @Test - public - void testInsert() - { - final String insertString = "such "; - final int insertPos = 2; - - bottom.insert(insertPos, insertString); - - final StringBuilder sb = new StringBuilder(baseString); - sb.insert(insertPos, insertString); - - assertEquals(sb.toString(), bottom.get()); - assertEquals(sb.toString(), top.get()); - } - - @Test - public - void testInsert2() - { - // 0123456789012345678901234567890 - baseString = "This is a hyphen- ated sentence"; - bottom = new AlignedString(baseString); - top = new AlignedString(bottom); - - System.out.println("Delete word fragment"); - final String fragment = top.get(18,22); - top.delete(18,22); - System.out.println("Top : "+top.get()+" - "+top.dataSegmentsToString()); - System.out.println("Bottom : "+bottom.get()+" - "+bottom.dataSegmentsToString()); - - System.out.println("Insert word fragment to complete word"); - top.insert(16, fragment); - System.out.println("Top : "+top.get()+" - "+top.dataSegmentsToString()); - System.out.println("Bottom : "+bottom.get()+" - "+bottom.dataSegmentsToString()); - - System.out.println("Delete hyphen"); - top.delete(16+fragment.length(), 18+fragment.length()); - System.out.println("Top : "+top.get()+" - "+top.dataSegmentsToString()); - System.out.println("Bottom : "+bottom.get()+" - "+bottom.dataSegmentsToString()); - - ImmutableInterval uli = new ImmutableInterval(0, 18); - ImmutableInterval adi = top.inverseResolve(uli); - System.out.println("ADI : "+top.get(adi.getStart(), adi.getEnd())); - System.out.println("ULI : "+bottom.get(uli.getStart(), uli.getEnd())); - - assertEquals("This is a hyphenated", top.get(adi.getStart(), adi.getEnd())); - - uli = new ImmutableInterval(18, 31); - adi = top.inverseResolve(uli); - System.out.println("ADI : "+top.get(adi.getStart(), adi.getEnd())); - System.out.println("ULI : "+bottom.get(uli.getStart(), uli.getEnd())); - - assertEquals(" sentence", top.get(adi.getStart(), adi.getEnd())); - } - - /** - * This is how you would expect to do hypenation removal, but it's wrong - use method used in - * testInsert2. This here will not work, because AlignedString will try to interpolate the - * start position of the uli interval (18) within the replaced interval (16-22). - */ - @Test @Ignore("Wrong method to do hypenation removal") - public - void testInsert3() - { - // 0123456789012345678901234567890 - baseString = "This is a hyphen- ated sentence"; - bottom = new AlignedString(baseString); - top = new AlignedString(bottom); - - top.replace(16, 22, "ated"); - - ImmutableInterval uli = new ImmutableInterval(18, 31); - Interval adi = top.inverseResolve(uli); - System.out.println("ADI : "+top.get(adi.getStart(), adi.getEnd())); - System.out.println("ULI : "+bottom.get(uli.getStart(), uli.getEnd())); - - assertEquals(" sentence", top.get(adi.getStart(), adi.getEnd())); - } - - @Test - public - void testDelete_1() - { - bottom.delete(2, 5); - top.delete(2, 4); - - final StringBuilder bottomRef = new StringBuilder(baseString); - bottomRef.delete(2, 5); - - final StringBuilder topRef = new StringBuilder(bottomRef); - topRef.delete(2, 4); - - assertEquals(bottomRef.toString(), bottom.get()); - assertEquals(topRef.toString(), top.get()); - } - - @Test - public - void testDelete_2() - { - bottom.delete(2, 5); - top.insert(4, "new "); - - final StringBuilder bottomRef = new StringBuilder(baseString); - bottomRef.delete(2, 5); - - final StringBuilder topRef = new StringBuilder(bottomRef); - topRef.insert(4, "new "); - - assertEquals(bottomRef.toString(), bottom.get()); - assertEquals(topRef.toString(), top.get()); - } - - @Test - public - void testDelete_3() - { - bottom.delete(7, 11); - bottom.delete(6, 7); - - final StringBuilder bottomRef = new StringBuilder(baseString); - bottomRef.delete(7, 11); - bottomRef.delete(6, 7); - - assertEquals(bottomRef.toString(), bottom.get()); - } - - @Test - public - void testDelete_4() - { - final StringBuilder bottomRef = new StringBuilder(baseString); - bottomRef.delete(7, 12); - bottomRef.delete(6, 9); - - bottom.delete(7, 12); - bottom.delete(6, 7); - - assertEquals(bottomRef.toString(), bottom.get()); - } - - /** - * If we delete and then try to resolve a segment start ends at the start - * boundary of the deleted segment, we do not want the deleted segment to - * be included in the resolved interval. - */ - @Test - public - void testResolve() - { - top.delete(4, 7); - - final ImmutableInterval ri = new ImmutableInterval(3, 4); - final Interval i = top.resolve(ri); - - assertEquals(1, i.getLength()); - } - - @Test - public - void testResolve2() - { - top.delete(0, 5); - top.replace(0, 1, "I want a"); - - final ImmutableInterval ri = new ImmutableInterval(0, 8); - final Interval i = top.resolve(ri); - - assertEquals(5, i.getStart()); - assertEquals(6, i.getEnd()); - } - - @Test - public - void testResolve3() - { - bottom = new AlignedString("11-08-adultsUser13"); - top = new AlignedString(bottom); - - top.replace(0, 47, " "); - after(); - top.replace(1, 19, "John"); - after(); - - ImmutableInterval ri = new ImmutableInterval(1, 5); - Interval i = top.resolve(ri); - - assertEquals(47, i.getStart()); - assertEquals(65, i.getEnd()); - - bottom = new AlignedString("11-08-adultsUser13"); - top = new AlignedString(bottom); - - top.replace(47, 65, "John"); - after(); - top.replace(0, 47, " "); - - ri = new ImmutableInterval(1, 5); - i = top.resolve(ri); - - assertEquals(47, i.getStart()); - assertEquals(65, i.getEnd()); - } - - @Test - public - void testDeleteInsert() - { - bottom.delete(2, 5); - top.insert(4, "new "); - bottom.insert(8, ", man"); - - final StringBuilder bottomRef = new StringBuilder(baseString); - bottomRef.delete(2, 5); - bottomRef.insert(8, ", man"); - - final StringBuilder topRef = new StringBuilder(bottomRef); - topRef.insert(4, "new "); - - assertEquals(bottomRef.toString(), bottom.get()); - assertEquals(topRef.toString(), top.get()); - } - - @Test - public - void testReplace() - { - top.replace(2, 4, "want"); - - final StringBuilder topRef = new StringBuilder(baseString); - topRef.replace(2,4,"want"); - - assertEquals(topRef.toString(), top.get()); - } - - @Test - public - void testReplace2() - { - top.replace(2, 4, "want"); - top.replace(4, 8, "nnahave"); - - final StringBuilder topRef = new StringBuilder(baseString); - topRef.replace(2,4,"want"); - topRef.replace(4,8,"nnahave"); - - assertEquals(topRef.toString(), top.get()); - - final Interval i1 = top.resolve(new ImmutableInterval(2, 11)); - assertEquals(2, i1.getStart()); - assertEquals(6, i1.getEnd()); - - final Interval i2 = top.inverseResolve(new ImmutableInterval(i1.getStart(), i1.getEnd())); - final String replaced = top.get(i2.getStart(), i2.getEnd()); - - System.out.println("Inverse resolved: "+i2); - - assertEquals("wannahave", replaced); - assertEquals(i1.getStart(), i2.getStart()); - assertEquals(i2.getEnd(), i2.getEnd()); - } - -// @Ignore // FIXME http://code.google.com/p/dkpro-core-asl/issues/detail?id=50 - @Test - public - void testReplace3() - { - top.replace(0, 1, "i"); - - final StringBuilder topRef = new StringBuilder(baseString); - topRef.replace(0, 1, "i"); - - assertEquals(topRef.toString(), top.get()); - } - - @Test - public - void testReplace4() - { - top.replace(11, 12, "!"); - - final StringBuilder topRef = new StringBuilder(baseString); - topRef.replace(11, 12, "!"); - - assertEquals(topRef.toString(), top.get()); - } - - @Test - public - void testReplace5() - { - baseString = ""; - bottom = new AlignedString(baseString); - top = new AlignedString(bottom); - - top.replace(0, 0, "Hello!"); - - final StringBuilder topRef = new StringBuilder(baseString); - topRef.replace(0, 0, "Hello!"); - - assertEquals(topRef.toString(), top.get()); - } - - @Test - public - void testReplace6() - { - StringBuilder bottomRef = new StringBuilder(baseString); - StringBuilder topRef = new StringBuilder(bottomRef); - - top.delete(2, 5); - topRef.delete(2, 5); - - assertEquals(bottomRef.toString(), bottom.get()); - assertEquals(topRef.toString(), top.get()); - - top.insert(2, "was "); - topRef.insert(2, "was "); - - assertEquals(bottomRef.toString(), bottom.get()); - assertEquals(topRef.toString(), top.get()); - - System.out.println("Resolved: "+top.resolve(new ImmutableInterval(2, 5))); - System.out.println("Inv resolved: "+top.inverseResolve(new ImmutableInterval(2, 5))); - } - - @Test - public - void testDirty() - { - final StringBuilder bottomRef = new StringBuilder(baseString); - final StringBuilder topRef = new StringBuilder(bottomRef); - - bottom.delete(2, 5); - bottomRef.delete(2, 5); - topRef.delete(2, 5); - - assertEquals(bottomRef.toString(), bottom.get()); - assertEquals(topRef.toString(), top.get()); - - bottom.insert(8, ", man"); - bottomRef.insert(8, ", man"); - topRef.insert(8, ", man"); - - assertEquals(bottomRef.toString(), bottom.get()); - assertEquals(topRef.toString(), top.get()); - } - - /** - * For the given interval on the underlying data, get the corresponding - * interval on this level. - * - * Example: - * 11 11 11 111 12 - * 012 34567 8901 23 45 678 90 - * AD |111|22ZZ2|3333|44|55|YYY|55| - * - * UL |111|XX|22|ZZ|2|XXXXX|3333|XX|44|XXXX|5555|XXXX| - * 012 34 56 78 9 11111 1111 12 22 2222 2223 3333 - * 01234 5678 90 12 3456 7890 1234 - * - * As you can see there is a YYY inserted in the AD. Otherwise some parts - * of the UL (marked "X") have been removed in the AD. Also an ZZ part has - * been added to UL - * - * Calling this method with getStart()=22 getEnd()=30 ("4XXXX555") should return - * [13, 20] ("455YYY5"). - * - * Generally: - * - if the getStart() is within a deleted region, then find the next oblique - * segment in AD to the right and return its getStart() position. - * - if the getEnd() is within a deleted region, then find the next oblique - * segment in AD to the left and return its getEnd() position. - * - * Anchors are always in UL. They are referenced from the ObliqueSegments - * in AD. - */ - @Test - public - void testInverseResolve() - { - bottom = new AlignedString("111XX222XXXXX3333XX44XXXX5555XXXX"); - bottom.insert(7, "ZZ"); - assertEquals("111XX22ZZ2XXXXX3333XX44XXXX5555XXXX", bottom.get()); - - top = new AlignedString(bottom); - top.delete(31, 35); - assertEquals("111XX22ZZ2XXXXX3333XX44XXXX5555", top.get()); - top.delete(23, 27); - assertEquals("111XX22ZZ2XXXXX3333XX445555", top.get()); - top.delete(19, 21); - assertEquals("111XX22ZZ2XXXXX3333445555", top.get()); - top.delete(10, 15); - assertEquals("111XX22ZZ23333445555", top.get()); - top.delete(3, 5); - assertEquals("11122ZZ23333445555", top.get()); - top.insert(16, "YYY"); - assertEquals("11122ZZ233334455YYY55", top.get()); - - final ImmutableInterval uli = new ImmutableInterval(22, 30); - System.out.println("ULI : "+bottom.get(uli.getStart(), uli.getEnd())); - - final ImmutableInterval adi = top.inverseResolve(uli); - System.out.println("ADI : "+top.get(adi.getStart(), adi.getEnd())); - - assertEquals(new ImmutableInterval(13, 20), adi); - assertEquals("455YYY5", top.get(adi.getStart(), adi.getEnd())); - assertEquals("4XXXX555", bottom.get(uli.getStart(), uli.getEnd())); - } -} diff --git a/dkpro-core-api-transform-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformerChangeBasedTest.java b/dkpro-core-api-transform-asl/src/test/java/org/dkpro/core/api/transform/JCasTransformerChangeBasedTest.java similarity index 96% rename from dkpro-core-api-transform-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformerChangeBasedTest.java rename to dkpro-core-api-transform-asl/src/test/java/org/dkpro/core/api/transform/JCasTransformerChangeBasedTest.java index 298c3fd653..4b2fcc4676 100644 --- a/dkpro-core-api-transform-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/api/transform/JCasTransformerChangeBasedTest.java +++ b/dkpro-core-api-transform-asl/src/test/java/org/dkpro/core/api/transform/JCasTransformerChangeBasedTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.api.transform; +package org.dkpro.core.api.transform; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; @@ -31,6 +31,7 @@ import org.apache.uima.analysis_engine.JCasIterator; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div; diff --git a/dkpro-core-api-transform-asl/src/test/java/org/dkpro/core/api/transform/alignment/AlignedStringTest.java b/dkpro-core-api-transform-asl/src/test/java/org/dkpro/core/api/transform/alignment/AlignedStringTest.java new file mode 100644 index 0000000000..2ae6f29d14 --- /dev/null +++ b/dkpro-core-api-transform-asl/src/test/java/org/dkpro/core/api/transform/alignment/AlignedStringTest.java @@ -0,0 +1,458 @@ +/* + * Copyright 2008 + * Richard Eckart de Castilho + * Institut für Sprach- und Literaturwissenschaft + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.transform.alignment; + +import static org.junit.Assert.assertEquals; + +import org.dkpro.core.api.transform.alignment.AlignedString; +import org.dkpro.core.api.transform.alignment.ImmutableInterval; +import org.dkpro.core.api.transform.alignment.Interval; +import org.junit.After; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; + +public class AlignedStringTest +{ + private String baseString; + private AlignedString bottom; + private AlignedString top; + + + @Before + public void setUp() throws Exception + { + // 11 + // 012345678901 + baseString = "I am a test."; + bottom = new AlignedString(baseString); + top = new AlignedString(bottom); + + System.out.println("-------------------------------------------"); + } + + @After + public void after() + { + System.out.println(" 1 | 2 | 3 | 4 | 5"); + System.out.println(" 012345678901234567890123456789012345678901234567890"); + System.out.println("Base : " + baseString); + System.out.println("Bottom : " + bottom.get() + " - " + bottom.dataSegmentsToString()); + System.out.println("Top : " + top.get() + " - " + top.dataSegmentsToString()); + } + + @Test + public void testGet() + { + assertEquals(baseString, top.get()); + } + + @Test + public void testInsert() + { + final String insertString = "such "; + final int insertPos = 2; + + bottom.insert(insertPos, insertString); + + final StringBuilder sb = new StringBuilder(baseString); + sb.insert(insertPos, insertString); + + assertEquals(sb.toString(), bottom.get()); + assertEquals(sb.toString(), top.get()); + } + + @Test + public void testInsert2() + { + // 0123456789012345678901234567890 + baseString = "This is a hyphen- ated sentence"; + bottom = new AlignedString(baseString); + top = new AlignedString(bottom); + + System.out.println("Delete word fragment"); + final String fragment = top.get(18,22); + top.delete(18,22); + System.out.println("Top : " + top.get() + " - " + top.dataSegmentsToString()); + System.out.println("Bottom : " + bottom.get() + " - " + bottom.dataSegmentsToString()); + + System.out.println("Insert word fragment to complete word"); + top.insert(16, fragment); + System.out.println("Top : " + top.get() + " - " + top.dataSegmentsToString()); + System.out.println("Bottom : " + bottom.get() + " - " + bottom.dataSegmentsToString()); + + System.out.println("Delete hyphen"); + top.delete(16 + fragment.length(), 18 + fragment.length()); + System.out.println("Top : " + top.get() + " - " + top.dataSegmentsToString()); + System.out.println("Bottom : " + bottom.get() + " - " + bottom.dataSegmentsToString()); + + ImmutableInterval uli = new ImmutableInterval(0, 18); + ImmutableInterval adi = top.inverseResolve(uli); + System.out.println("ADI : " + top.get(adi.getStart(), adi.getEnd())); + System.out.println("ULI : " + bottom.get(uli.getStart(), uli.getEnd())); + + assertEquals("This is a hyphenated", top.get(adi.getStart(), adi.getEnd())); + + uli = new ImmutableInterval(18, 31); + adi = top.inverseResolve(uli); + System.out.println("ADI : " + top.get(adi.getStart(), adi.getEnd())); + System.out.println("ULI : " + bottom.get(uli.getStart(), uli.getEnd())); + + assertEquals(" sentence", top.get(adi.getStart(), adi.getEnd())); + } + + /** + * This is how you would expect to do hypenation removal, but it's wrong - use method used in + * testInsert2. This here will not work, because AlignedString will try to interpolate the + * start position of the uli interval (18) within the replaced interval (16-22). + */ + @Test + @Ignore("Wrong method to do hypenation removal") + public void testInsert3() + { + // 0123456789012345678901234567890 + baseString = "This is a hyphen- ated sentence"; + bottom = new AlignedString(baseString); + top = new AlignedString(bottom); + + top.replace(16, 22, "ated"); + + ImmutableInterval uli = new ImmutableInterval(18, 31); + Interval adi = top.inverseResolve(uli); + System.out.println("ADI : " + top.get(adi.getStart(), adi.getEnd())); + System.out.println("ULI : " + bottom.get(uli.getStart(), uli.getEnd())); + + assertEquals(" sentence", top.get(adi.getStart(), adi.getEnd())); + } + + @Test + public void testDelete_1() + { + bottom.delete(2, 5); + top.delete(2, 4); + + final StringBuilder bottomRef = new StringBuilder(baseString); + bottomRef.delete(2, 5); + + final StringBuilder topRef = new StringBuilder(bottomRef); + topRef.delete(2, 4); + + assertEquals(bottomRef.toString(), bottom.get()); + assertEquals(topRef.toString(), top.get()); + } + + @Test + public void testDelete_2() + { + bottom.delete(2, 5); + top.insert(4, "new "); + + final StringBuilder bottomRef = new StringBuilder(baseString); + bottomRef.delete(2, 5); + + final StringBuilder topRef = new StringBuilder(bottomRef); + topRef.insert(4, "new "); + + assertEquals(bottomRef.toString(), bottom.get()); + assertEquals(topRef.toString(), top.get()); + } + + @Test + public void testDelete_3() + { + bottom.delete(7, 11); + bottom.delete(6, 7); + + final StringBuilder bottomRef = new StringBuilder(baseString); + bottomRef.delete(7, 11); + bottomRef.delete(6, 7); + + assertEquals(bottomRef.toString(), bottom.get()); + } + + @Test + public void testDelete_4() + { + final StringBuilder bottomRef = new StringBuilder(baseString); + bottomRef.delete(7, 12); + bottomRef.delete(6, 9); + + bottom.delete(7, 12); + bottom.delete(6, 7); + + assertEquals(bottomRef.toString(), bottom.get()); + } + + /** + * If we delete and then try to resolve a segment start ends at the start + * boundary of the deleted segment, we do not want the deleted segment to + * be included in the resolved interval. + */ + @Test + public void testResolve() + { + top.delete(4, 7); + + final ImmutableInterval ri = new ImmutableInterval(3, 4); + final Interval i = top.resolve(ri); + + assertEquals(1, i.getLength()); + } + + @Test + public void testResolve2() + { + top.delete(0, 5); + top.replace(0, 1, "I want a"); + + final ImmutableInterval ri = new ImmutableInterval(0, 8); + final Interval i = top.resolve(ri); + + assertEquals(5, i.getStart()); + assertEquals(6, i.getEnd()); + } + + @Test + public void testResolve3() + { + bottom = new AlignedString("11-08-adultsUser13"); + top = new AlignedString(bottom); + + top.replace(0, 47, " "); + after(); + top.replace(1, 19, "John"); + after(); + + ImmutableInterval ri = new ImmutableInterval(1, 5); + Interval i = top.resolve(ri); + + assertEquals(47, i.getStart()); + assertEquals(65, i.getEnd()); + + bottom = new AlignedString("11-08-adultsUser13"); + top = new AlignedString(bottom); + + top.replace(47, 65, "John"); + after(); + top.replace(0, 47, " "); + + ri = new ImmutableInterval(1, 5); + i = top.resolve(ri); + + assertEquals(47, i.getStart()); + assertEquals(65, i.getEnd()); + } + + @Test + public void testDeleteInsert() + { + bottom.delete(2, 5); + top.insert(4, "new "); + bottom.insert(8, ", man"); + + final StringBuilder bottomRef = new StringBuilder(baseString); + bottomRef.delete(2, 5); + bottomRef.insert(8, ", man"); + + final StringBuilder topRef = new StringBuilder(bottomRef); + topRef.insert(4, "new "); + + assertEquals(bottomRef.toString(), bottom.get()); + assertEquals(topRef.toString(), top.get()); + } + + @Test + public void testReplace() + { + top.replace(2, 4, "want"); + + final StringBuilder topRef = new StringBuilder(baseString); + topRef.replace(2,4,"want"); + + assertEquals(topRef.toString(), top.get()); + } + + @Test + public void testReplace2() + { + top.replace(2, 4, "want"); + top.replace(4, 8, "nnahave"); + + final StringBuilder topRef = new StringBuilder(baseString); + topRef.replace(2,4,"want"); + topRef.replace(4,8,"nnahave"); + + assertEquals(topRef.toString(), top.get()); + + final Interval i1 = top.resolve(new ImmutableInterval(2, 11)); + assertEquals(2, i1.getStart()); + assertEquals(6, i1.getEnd()); + + final Interval i2 = top.inverseResolve(new ImmutableInterval(i1.getStart(), i1.getEnd())); + final String replaced = top.get(i2.getStart(), i2.getEnd()); + + System.out.println("Inverse resolved: " + i2); + + assertEquals("wannahave", replaced); + assertEquals(i1.getStart(), i2.getStart()); + assertEquals(i2.getEnd(), i2.getEnd()); + } + + // @Ignore // FIXME http://code.google.com/p/dkpro-core-asl/issues/detail?id=50 + @Test + public void testReplace3() + { + top.replace(0, 1, "i"); + + final StringBuilder topRef = new StringBuilder(baseString); + topRef.replace(0, 1, "i"); + + assertEquals(topRef.toString(), top.get()); + } + + @Test + public void testReplace4() + { + top.replace(11, 12, "!"); + + final StringBuilder topRef = new StringBuilder(baseString); + topRef.replace(11, 12, "!"); + + assertEquals(topRef.toString(), top.get()); + } + + @Test + public void testReplace5() + { + baseString = ""; + bottom = new AlignedString(baseString); + top = new AlignedString(bottom); + + top.replace(0, 0, "Hello!"); + + final StringBuilder topRef = new StringBuilder(baseString); + topRef.replace(0, 0, "Hello!"); + + assertEquals(topRef.toString(), top.get()); + } + + @Test + public void testReplace6() + { + StringBuilder bottomRef = new StringBuilder(baseString); + StringBuilder topRef = new StringBuilder(bottomRef); + + top.delete(2, 5); + topRef.delete(2, 5); + + assertEquals(bottomRef.toString(), bottom.get()); + assertEquals(topRef.toString(), top.get()); + + top.insert(2, "was "); + topRef.insert(2, "was "); + + assertEquals(bottomRef.toString(), bottom.get()); + assertEquals(topRef.toString(), top.get()); + + System.out.println("Resolved: " + top.resolve(new ImmutableInterval(2, 5))); + System.out.println("Inv resolved: " + top.inverseResolve(new ImmutableInterval(2, 5))); + } + + @Test + public void testDirty() + { + final StringBuilder bottomRef = new StringBuilder(baseString); + final StringBuilder topRef = new StringBuilder(bottomRef); + + bottom.delete(2, 5); + bottomRef.delete(2, 5); + topRef.delete(2, 5); + + assertEquals(bottomRef.toString(), bottom.get()); + assertEquals(topRef.toString(), top.get()); + + bottom.insert(8, ", man"); + bottomRef.insert(8, ", man"); + topRef.insert(8, ", man"); + + assertEquals(bottomRef.toString(), bottom.get()); + assertEquals(topRef.toString(), top.get()); + } + + /** + * For the given interval on the underlying data, get the corresponding + * interval on this level. + * + * Example: + * 11 11 11 111 12 + * 012 34567 8901 23 45 678 90 + * AD |111|22ZZ2|3333|44|55|YYY|55| + * + * UL |111|XX|22|ZZ|2|XXXXX|3333|XX|44|XXXX|5555|XXXX| + * 012 34 56 78 9 11111 1111 12 22 2222 2223 3333 + * 01234 5678 90 12 3456 7890 1234 + * + * As you can see there is a YYY inserted in the AD. Otherwise some parts + * of the UL (marked "X") have been removed in the AD. Also an ZZ part has + * been added to UL + * + * Calling this method with getStart()=22 getEnd()=30 ("4XXXX555") should return + * [13, 20] ("455YYY5"). + * + * Generally: + * - if the getStart() is within a deleted region, then find the next oblique + * segment in AD to the right and return its getStart() position. + * - if the getEnd() is within a deleted region, then find the next oblique + * segment in AD to the left and return its getEnd() position. + * + * Anchors are always in UL. They are referenced from the ObliqueSegments + * in AD. + */ + @Test + public void testInverseResolve() + { + bottom = new AlignedString("111XX222XXXXX3333XX44XXXX5555XXXX"); + bottom.insert(7, "ZZ"); + assertEquals("111XX22ZZ2XXXXX3333XX44XXXX5555XXXX", bottom.get()); + + top = new AlignedString(bottom); + top.delete(31, 35); + assertEquals("111XX22ZZ2XXXXX3333XX44XXXX5555", top.get()); + top.delete(23, 27); + assertEquals("111XX22ZZ2XXXXX3333XX445555", top.get()); + top.delete(19, 21); + assertEquals("111XX22ZZ2XXXXX3333445555", top.get()); + top.delete(10, 15); + assertEquals("111XX22ZZ23333445555", top.get()); + top.delete(3, 5); + assertEquals("11122ZZ23333445555", top.get()); + top.insert(16, "YYY"); + assertEquals("11122ZZ233334455YYY55", top.get()); + + final ImmutableInterval uli = new ImmutableInterval(22, 30); + System.out.println("ULI : " + bottom.get(uli.getStart(), uli.getEnd())); + + final ImmutableInterval adi = top.inverseResolve(uli); + System.out.println("ADI : " + top.get(adi.getStart(), adi.getEnd())); + + assertEquals(new ImmutableInterval(13, 20), adi); + assertEquals("455YYY5", top.get(adi.getStart(), adi.getEnd())); + assertEquals("4XXXX555", bottom.get(uli.getStart(), uli.getEnd())); + } +} diff --git a/dkpro-core-api-transform-asl/suppressions.xml b/dkpro-core-api-transform-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-transform-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-api-xml-asl/.activate-run-jcasgen b/dkpro-core-api-xml-asl/.activate-run-jcasgen new file mode 100644 index 0000000000..fc486faafd --- /dev/null +++ b/dkpro-core-api-xml-asl/.activate-run-jcasgen @@ -0,0 +1 @@ +Marker to activate run-jcasgen profile. diff --git a/dkpro-core-io-fangorn-asl/LICENSE.txt b/dkpro-core-api-xml-asl/LICENSE.txt similarity index 100% rename from dkpro-core-io-fangorn-asl/LICENSE.txt rename to dkpro-core-api-xml-asl/LICENSE.txt diff --git a/dkpro-core-api-xml-asl/pom.xml b/dkpro-core-api-xml-asl/pom.xml new file mode 100644 index 0000000000..8fc70b1268 --- /dev/null +++ b/dkpro-core-api-xml-asl/pom.xml @@ -0,0 +1,44 @@ + + + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-api-xml-asl + jar + DKPro Core ASL - XML API + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.apache.commons + commons-lang3 + + + diff --git a/dkpro-core-api-xml-asl/src/main/java/org/dkpro/core/api/xml/Cas2SaxEvents.java b/dkpro-core-api-xml-asl/src/main/java/org/dkpro/core/api/xml/Cas2SaxEvents.java new file mode 100644 index 0000000000..1424cd7204 --- /dev/null +++ b/dkpro-core-api-xml-asl/src/main/java/org/dkpro/core/api/xml/Cas2SaxEvents.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.xml; + +import static org.apache.commons.lang3.StringUtils.defaultString; +import static org.apache.uima.fit.util.JCasUtil.selectSingle; + +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.xml.type.XmlAttribute; +import org.dkpro.core.api.xml.type.XmlDocument; +import org.dkpro.core.api.xml.type.XmlElement; +import org.dkpro.core.api.xml.type.XmlNode; +import org.dkpro.core.api.xml.type.XmlTextNode; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class Cas2SaxEvents +{ + private final ContentHandler handler; + + public Cas2SaxEvents(ContentHandler aHandler) + { + handler = aHandler; + } + + public void process(JCas aJCas) throws SAXException + { + XmlDocument doc = selectSingle(aJCas, XmlDocument.class); + + process(doc); + } + + public void process(XmlDocument aDoc) throws SAXException + { + handler.startDocument(); + + if (aDoc.getRoot() == null) { + throw new SAXException("Document has no root element"); + } + + process(aDoc.getRoot()); + + handler.endDocument(); + } + + public void process(XmlElement aElement) throws SAXException + { + AttributesImpl attributes = new AttributesImpl(); + + if (aElement.getAttributes() != null) { + for (XmlAttribute attr : aElement.getAttributes()) { + attributes.addAttribute(defaultString(attr.getUri()), + defaultString(attr.getLocalName()), defaultString(attr.getQName()), + defaultString(attr.getValueType(), "CDATA"), + defaultString(attr.getValue())); + } + } + + String uri = defaultString(aElement.getUri()); + String localName = defaultString(aElement.getLocalName()); + String qName = defaultString(aElement.getQName()); + + handler.startElement(uri, localName, qName, attributes); + + if (aElement.getChildren() != null) { + for (XmlNode child : aElement.getChildren()) { + if (child instanceof XmlElement) { + process((XmlElement) child); + } + else if (child instanceof XmlTextNode) { + process((XmlTextNode) child); + } + } + } + + handler.endElement(uri, localName, qName); + } + + private void process(XmlTextNode aChild) throws SAXException + { + char[] text; + + if (aChild.getCaptured()) { + text = aChild.getCoveredText().toCharArray(); + } + else { + text = aChild.getText().toCharArray(); + } + + handler.characters(text, 0, text.length); + } +} diff --git a/dkpro-core-api-xml-asl/src/main/java/org/dkpro/core/api/xml/CasXmlHandler.java b/dkpro-core-api-xml-asl/src/main/java/org/dkpro/core/api/xml/CasXmlHandler.java new file mode 100644 index 0000000000..7eb9874b62 --- /dev/null +++ b/dkpro-core-api-xml-asl/src/main/java/org/dkpro/core/api/xml/CasXmlHandler.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.api.xml; + +import static org.apache.commons.lang3.StringUtils.trimToNull; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Deque; +import java.util.List; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.dkpro.core.api.xml.type.XmlAttribute; +import org.dkpro.core.api.xml.type.XmlDocument; +import org.dkpro.core.api.xml.type.XmlElement; +import org.dkpro.core.api.xml.type.XmlNode; +import org.dkpro.core.api.xml.type.XmlTextNode; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class CasXmlHandler + extends DefaultHandler +{ + private final JCas jcas; + private final StringBuilder text; + private final Deque stack; + + private XmlDocument docNode; + private boolean captureText = true; + + public CasXmlHandler(JCas aJCas) + { + jcas = aJCas; + text = new StringBuilder(); + stack = new ArrayDeque<>(); + } + + @Override + public void startDocument() throws SAXException + { + if (docNode != null || !stack.isEmpty() || text.length() != 0) { + throw new SAXException( + "Illegal document start event when data has already been seen."); + } + + docNode = new XmlDocument(jcas); + docNode.setBegin(text.length()); + } + + @Override + public void endDocument() throws SAXException + { + docNode.setEnd(text.length()); + docNode.addToIndexes(); + + jcas.setDocumentText(text.toString()); + }; + + @Override + public void startElement(String aUri, String aLocalName, String aQName, Attributes aAttributes) + throws SAXException + { + if (docNode == null) { + throw new SAXException( + "Illegal element start event when document start has not been seen."); + } + + XmlElement element = new XmlElement(jcas); + element.setBegin(text.length()); + element.setUri(trimToNull(aUri)); + element.setLocalName(trimToNull(aLocalName)); + element.setQName(trimToNull(aQName)); + + if (aAttributes.getLength() > 0) { + FSArray attributes = new FSArray(jcas, aAttributes.getLength()); + for (int i = 0; i < aAttributes.getLength(); i++) { + XmlAttribute attribute = new XmlAttribute(jcas); + attribute.setUri(trimToNull(aAttributes.getURI(i))); + attribute.setLocalName(trimToNull(aAttributes.getLocalName(i))); + attribute.setQName(trimToNull(aAttributes.getQName(i))); + attribute.setValueType(trimToNull(aAttributes.getType(i))); + attribute.setValue(aAttributes.getValue(i)); + attributes.set(i, attribute); + } + element.setAttributes(attributes); + } + + attachToParent(element); + + boolean capture; + StackFrame parentFrame = stack.peek(); + if (parentFrame != null) { + capture = parentFrame.isCaptureText(); + } + else { + capture = captureText; + } + + stack.push(new StackFrame(element, capture)); + } + + @Override + public void endElement(String aUri, String aLocalName, String aQName) throws SAXException + { + StackFrame frame = stack.pop(); + + XmlElement element = frame.getElement(); + element.setEnd(text.length()); + + // Fill in children + if (!frame.getChildren().isEmpty()) { + FSArray children = new FSArray(jcas, frame.getChildren().size()); + for (int i = 0; i < frame.getChildren().size(); i++) { + children.set(i, frame.getChildren().get(i)); + } + element.setChildren(children); + } + + element.addToIndexes(); + } + + @Override + public void characters(char[] aCh, int aStart, int aLength) throws SAXException + { + XmlTextNode textNode = new XmlTextNode(jcas); + textNode.setBegin(text.length()); + + if (stack.peek().isCaptureText()) { + text.append(aCh, aStart, aLength); + textNode.setCaptured(true); + } + else { + textNode.setText(new String(aCh, aStart, aLength)); + textNode.setCaptured(false); + } + + textNode.setEnd(text.length()); + textNode.addToIndexes(); + + attachToParent(textNode); + } + + @Override + public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException + { + characters(aCh, aStart, aLength); + } + + private void attachToParent(XmlNode aNode) + { + StackFrame parentFrame = stack.peek(); + if (parentFrame != null) { + aNode.setParent(parentFrame.getElement()); + parentFrame.addChild(aNode); + } + else { + docNode.setRoot((XmlElement) aNode); + } + } + + public CharSequence getText() + { + return text; + } + + public Collection getStack() + { + return Collections.unmodifiableCollection(stack); + } + + public XmlElement getCurrentElement() + { + return stack.peek().getElement(); + } + + public void captureText(boolean aCapture) + { + if (stack.isEmpty()) { + captureText = aCapture; + } + else { + stack.peek().setCaptureText(aCapture); + } + } + + private static class StackFrame + { + private final XmlElement element; + private final List children = new ArrayList<>(); + private boolean captureText; + + public StackFrame(XmlElement aElement, boolean aCaptureText) + { + element = aElement; + captureText = aCaptureText; + } + + public XmlElement getElement() + { + return element; + } + + public void addChild(XmlNode aChild) + { + children.add(aChild); + } + + public List getChildren() + { + return children; + } + + public boolean isCaptureText() + { + return captureText; + } + + public void setCaptureText(boolean aCaptureText) + { + captureText = aCaptureText; + } + } +} diff --git a/dkpro-core-api-xml-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt b/dkpro-core-api-xml-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt new file mode 100644 index 0000000000..b6cfe83b00 --- /dev/null +++ b/dkpro-core-api-xml-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt @@ -0,0 +1 @@ +classpath*:desc/type/XmlStructure.xml diff --git a/dkpro-core-api-xml-asl/src/main/resources/desc/type/XmlStructure.xml b/dkpro-core-api-xml-asl/src/main/resources/desc/type/XmlStructure.xml new file mode 100644 index 0000000000..4b64184e08 --- /dev/null +++ b/dkpro-core-api-xml-asl/src/main/resources/desc/type/XmlStructure.xml @@ -0,0 +1,546 @@ + + + + + + + XML + + + + + + + + + + ${version} + + + + + Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt + + + + + + + + + + + + + + + org.dkpro.core.api.xml.type.XmlElement + + + + + XML element + + + + + org.dkpro.core.api.xml.type.XmlNode + + + + + + + + + + + + + + + uri + + + + + Namespace URI of the element. + + + + + uima.cas.String + + + + + + + + + + + + + + + localName + + + + + Local name of the XML element. + + + + + uima.cas.String + + + + + + + + + + + + + + + attributes + + + + + Array of attributes of the XML element. + + + + + uima.cas.FSArray + + + + + org.dkpro.core.api.xml.type.XmlAttribute + + + + + + + + + + + + + + + children + + + + + Children of this XML element. + + + + + uima.cas.FSArray + + + + + org.dkpro.core.api.xml.type.XmlNode + + + + + + + + + + + + + + + qName + + + + + + + + + + uima.cas.String + + + + + + + + + + + + + + + + + + + + + + + + + org.dkpro.core.api.xml.type.XmlAttribute + + + + + + + + + + uima.cas.TOP + + + + + + + + + + + + + + + uri + + + + + Namespace URI of the attribute. + + + + + uima.cas.String + + + + + + + + + + + + + + + localName + + + + + Local name of the attribute. + + + + + uima.cas.String + + + + + + + + + + + + + + + value + + + + + Value of the XML attribute. + + + + + uima.cas.String + + + + + + + + + + + + + + + qName + + + + + + + + + + uima.cas.String + + + + + + + + + + + + + + + valueType + + + + + + + + + + uima.cas.String + + + + + + + + + + + + + + + + + + + + + + + + + org.dkpro.core.api.xml.type.XmlNode + + + + + Supertype for XmlElements and XmlTextNodes. + + + + + uima.tcas.Annotation + + + + + + + + + + + + + + + parent + + + + + + + + + + org.dkpro.core.api.xml.type.XmlElement + + + + + + + + + + + + + + + + + + + + + + + + + org.dkpro.core.api.xml.type.XmlDocument + + + + + XML document + + + + + uima.tcas.Annotation + + + + + + + + + + + + + + + root + + + + + Root element of the XML document. + + + + + org.dkpro.core.api.xml.type.XmlElement + + + + + + + + + + + + + + + + + + + + + + + + + org.dkpro.core.api.xml.type.XmlTextNode + + + + + XML text node. + + + + + org.dkpro.core.api.xml.type.XmlNode + + + + + + + + + + + text + + + + + + uima.cas.String + + + + + + + + + captured + + + Whether the text node has been added to the document text. + + + uima.cas.Boolean + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-api-xml-asl/suppressions.xml b/dkpro-core-api-xml-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-api-xml-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-arktools-gpl/.license-header.txt b/dkpro-core-arktools-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-arktools-gpl/.license-header.txt +++ b/dkpro-core-arktools-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-arktools-gpl/LICENSE.txt b/dkpro-core-arktools-gpl/LICENSE.txt index 6e22a15c3c..99ace43661 100644 --- a/dkpro-core-arktools-gpl/LICENSE.txt +++ b/dkpro-core-arktools-gpl/LICENSE.txt @@ -654,7 +654,7 @@ the "copyright" line and a pointer to where the full notice is found. GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. Also add information on how to contact you by electronic and paper mail. diff --git a/dkpro-core-arktools-gpl/pom.xml b/dkpro-core-arktools-gpl/pom.xml index 5c637ce445..3056be8e76 100644 --- a/dkpro-core-arktools-gpl/pom.xml +++ b/dkpro-core-arktools-gpl/pom.xml @@ -1,6 +1,6 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-gpl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-gpl + 2.3.0-SNAPSHOT ../dkpro-core-gpl - de.tudarmstadt.ukp.dkpro.core.arktools-gpl + dkpro-core-arktools-gpl jar DKPro Core GPL - ARK (v ${arktweet.version}) + https://dkpro.github.io/dkpro-core/ 0.3.2 + + commons-io + commons-io + org.apache.uima uimaj-core @@ -47,47 +52,60 @@ ${arktweet.version} - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-resources-asl + + + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + + xml-apis + xml-apis + runtime junit junit test + + org.dkpro.core + dkpro-core-testing-asl + test + de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-default test de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-irc test de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-default + de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-ritter test - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-irc - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-ritter - test - @@ -96,16 +114,16 @@ de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-default 20120919.1 - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-irc - 20121211.1 - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-ritter - 20130723.1 - + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-irc + 20121211.1 + + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-ritter + 20130723.1 + @@ -116,6 +134,7 @@ maven-dependency-plugin + xml-apis:xml-apis de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-default de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.arktools-model-tagger-en-irc diff --git a/dkpro-core-arktools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetPosTagger.java b/dkpro-core-arktools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetPosTagger.java deleted file mode 100644 index 77b58df855..0000000000 --- a/dkpro-core-arktools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetPosTagger.java +++ /dev/null @@ -1,236 +0,0 @@ -package de.tudarmstadt.ukp.dkpro.core.arktools; - -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ - -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.CasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.CasUtil; -import org.apache.uima.resource.ResourceInitializationException; - -import cmu.arktweetnlp.Twokenize; -import cmu.arktweetnlp.impl.Model; -import cmu.arktweetnlp.impl.ModelSentence; -import cmu.arktweetnlp.impl.Sentence; -import cmu.arktweetnlp.impl.features.FeatureExtractor; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Wrapper for Twitter Tokenizer and POS Tagger. - * - * As described in: Olutobi Owoputi, Brendan O’Connor, Chris Dyer, Kevin Gimpel, Nathan Schneider - * and Noah A. Smith. Improved Part-of-Speech Tagging for Online Conversational Text with Word - * Clusters In Proceedings of NAACL 2013. - */ -@ResourceMetaData(name="ArkTweet POS-Tagger") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) -public class ArktweetPosTagger - extends CasAnnotator_ImplBase -{ - - /** - * Use this language instead of the document language to resolve the model and tag set mapping. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Variant of a model the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Location from which the model is read. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Location of the mapping file for part-of-speech tags to UIMA types. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - private Type tokenType; - private Feature featPos; - - private CasConfigurableProviderBase modelProvider; - private MappingProvider mappingProvider; - - /** - * Loads a model from a file. The tagger should be ready to tag after calling this. - */ - public class TweetTagger - { - Model model; - FeatureExtractor featureExtractor; - - public void loadModel(String modelFilename) - throws IOException - { - model = Model.loadModelFromText(modelFilename); - featureExtractor = new FeatureExtractor(model, false); - } - } - - /** - * One token and its tag. - **/ - public static class TaggedToken - { - public AnnotationFS token; - public String tag; - } - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - modelProvider = new ModelProviderBase() - { - { - setContextObject(ArktweetPosTagger.this); - - setDefault(ARTIFACT_ID, "${groupId}.arktools-model-tagger-${language}-${variant}"); - setDefault(LOCATION, - "classpath:/${package}/lib/tagger-${language}-${variant}.properties"); - setDefault(VARIANT, "default"); - - setOverride(LOCATION, modelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected TweetTagger produceResource(URL aUrl) - throws IOException - { - try { - TweetTagger model = new TweetTagger(); - model.loadModel(ResourceUtils.getUrlAsFile(aUrl, false).getAbsolutePath()); - - return model; - } - catch (Exception e) { - throw new IOException(e); - } - } - }; - - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - - } - - @Override - public void typeSystemInit(TypeSystem aTypeSystem) - throws AnalysisEngineProcessException - { - super.typeSystemInit(aTypeSystem); - - tokenType = aTypeSystem.getType(Token.class.getName()); - featPos = tokenType.getFeatureByBaseName("pos"); - } - - @Override - public void process(CAS cas) - throws AnalysisEngineProcessException - { - - modelProvider.configure(cas); - mappingProvider.configure(cas); - - List tokens = CasUtil.selectCovered(cas, tokenType, 0, cas.getDocumentText() - .length()); - List taggedTokens = tagTweetTokens(tokens, modelProvider.getResource()); - - for (TaggedToken taggedToken : taggedTokens) { - - Type posType = mappingProvider.getTagType(taggedToken.tag); - - AnnotationFS posAnno = cas.createAnnotation(posType, taggedToken.token.getBegin(), - taggedToken.token.getEnd()); - posAnno.setStringValue(posType.getFeatureByBaseName("PosValue"), taggedToken.tag); - cas.addFsToIndexes(posAnno); - - taggedToken.token.setFeatureValue(featPos, posAnno); - } - } - - private List tagTweetTokens(List annotatedTokens, - TweetTagger tweetTagModel) - { - - List tokens = new LinkedList(); - for (AnnotationFS a : annotatedTokens) { - String tokenText = a.getCoveredText(); - tokenText = Twokenize.normalizeTextForTagger(tokenText); - tokens.add(tokenText); - } - - Sentence sentence = new Sentence(); - sentence.tokens = tokens; - ModelSentence ms = new ModelSentence(sentence.T()); - tweetTagModel.featureExtractor.computeFeatures(sentence, ms); - tweetTagModel.model.greedyDecode(ms, false); - - ArrayList taggedTokens = new ArrayList(); - - for (int t = 0; t < sentence.T(); t++) { - TaggedToken tt = new TaggedToken(); - tt.token = annotatedTokens.get(t); - tt.tag = tweetTagModel.model.labelVocab.name(ms.labels[t]); - taggedTokens.add(tt); - } - return taggedTokens; - } -} diff --git a/dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetPosTagger.java b/dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetPosTagger.java new file mode 100644 index 0000000000..9ba64d16fe --- /dev/null +++ b/dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetPosTagger.java @@ -0,0 +1,260 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.arktools; + +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; + +import cmu.arktweetnlp.Twokenize; +import cmu.arktweetnlp.impl.Model; +import cmu.arktweetnlp.impl.ModelSentence; +import cmu.arktweetnlp.impl.Sentence; +import cmu.arktweetnlp.impl.features.FeatureExtractor; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + + +/** + * Wrapper for Twitter Tokenizer and POS Tagger. + * + * As described in: Olutobi Owoputi, Brendan O’Connor, Chris Dyer, Kevin Gimpel, Nathan Schneider + * and Noah A. Smith. Improved Part-of-Speech Tagging for Online Conversational Text with Word + * Clusters In Proceedings of NAACL 2013. + */ +@Component(OperationType.POS_TAGGING) +@ResourceMetaData(name = "ArkTweet POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) +public class ArktweetPosTagger + extends JCasAnnotator_ImplBase +{ + + /** + * Use this language instead of the document language to resolve the model and tag set mapping. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Variant of a model the model. Used to address a specific model if here are multiple models + * for one language. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Location from which the model is read. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_ARKTWEET_TAGGER) + protected String modelLocation; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + private CasConfigurableProviderBase modelProvider; + private MappingProvider mappingProvider; + + /** + * Loads a model from a file. The tagger should be ready to tag after calling this. + */ + public class TweetTagger + { + Model model; + FeatureExtractor featureExtractor; + + public void loadModel(String modelFilename) + throws IOException + { + model = Model.loadModelFromText(modelFilename); + featureExtractor = new FeatureExtractor(model, false); + } + } + + /** + * One token and its tag. + **/ + public static class TaggedToken + { + public Token token; + public String tag; + + private int getBegin() { + return token.getBegin(); + } + + private int getEnd() { + return token.getEnd(); + } + } + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + modelProvider = new ModelProviderBase() + { + { + setContextObject(ArktweetPosTagger.this); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(ARTIFACT_ID, "${groupId}.arktools-model-tagger-${language}-${variant}"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/arktools/lib/tagger-${language}-${variant}.properties"); + setDefault(VARIANT, "default"); + + setOverride(LOCATION, modelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected TweetTagger produceResource(URL aUrl) + throws IOException + { + try { + TweetTagger model = new TweetTagger(); + model.loadModel(ResourceUtils.getUrlAsFile(aUrl, false).getAbsolutePath()); + + return model; + } + catch (Exception e) { + throw new IOException(e); + } + } + }; + + mappingProvider = MappingProviderFactory.createPosMappingProvider(this, posMappingLocation, + language, modelProvider); + + } + + @Override + public void process(JCas jCas) + throws AnalysisEngineProcessException + { + + modelProvider.configure(jCas.getCas()); + mappingProvider.configure(jCas.getCas()); + + List tokens = selectCovered(jCas, Token.class, 0, jCas.getDocumentText().length()); + List taggedTokens = tagTweetTokens(tokens, modelProvider.getResource()); + + for (TaggedToken taggedToken : taggedTokens) { + + Type posType = mappingProvider.getTagType(taggedToken.tag); + + POS pos = (POS) jCas.getCas().createAnnotation(posType, taggedToken.getBegin(), + taggedToken.getEnd()); + pos.setPosValue(taggedToken.tag.intern()); + pos.addToIndexes(); + taggedToken.token.setPos(pos); + } + } + + private List tagTweetTokens(List annotatedTokens, + TweetTagger tweetTagModel) + { + + List tokens = new LinkedList(); + for (Token a : annotatedTokens) { + String tokenText = a.getText(); + tokenText = Twokenize.normalizeTextForTagger(tokenText); + tokens.add(tokenText); + } + + Sentence sentence = new Sentence(); + sentence.tokens = tokens; + ModelSentence ms = new ModelSentence(sentence.T()); + tweetTagModel.featureExtractor.computeFeatures(sentence, ms); + tweetTagModel.model.greedyDecode(ms, false); + + ArrayList taggedTokens = new ArrayList(); + + for (int t = 0; t < sentence.T(); t++) { + TaggedToken tt = new TaggedToken(); + tt.token = annotatedTokens.get(t); + tt.tag = tweetTagModel.model.labelVocab.name(ms.labels[t]); + taggedTokens.add(tt); + } + return taggedTokens; + } +} diff --git a/dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetPosTaggerTrainer.java b/dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetPosTaggerTrainer.java new file mode 100644 index 0000000000..815694f7f3 --- /dev/null +++ b/dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetPosTaggerTrainer.java @@ -0,0 +1,133 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.arktools; + +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasConsumer_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; + +import cmu.arktweetnlp.Train; +import cmu.arktweetnlp.impl.features.WordClusterPaths; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Trainer for ark-tweet POS tagger. + */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) +@MimeTypeCapability(MimeTypes.APPLICATION_X_ARKTWEET_TAGGER) +@ResourceMetaData(name = "ArkTweet POS-Tagger Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) +public class ArktweetPosTaggerTrainer extends JCasConsumer_ImplBase { + + /** + * Location to which the model is written. + */ + public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; + @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) + private File targetLocation; + + /** + * Classpath resource pointing to the the word cluster file calculated with + * brown clustering algorithm. + */ + public static final String PARAM_WORD_CLUSTER_FILE = "wordClusterFile"; + @ConfigurationParameter(name = PARAM_WORD_CLUSTER_FILE) + private String wordClusterFile; + + private File tempData; + private PrintWriter out; + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + if (tempData == null) { + try { + tempData = File.createTempFile("dkpro-arktweet-pos-trainer", ".tsv"); + out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(tempData), + StandardCharsets.UTF_8)); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + + Map> index = indexCovered(jCas, Sentence.class, Token.class); + for (Sentence sentence : select(jCas, Sentence.class)) { + Collection tokens = index.get(sentence); + for (Token token : tokens) { + out.printf("%s\t%s%n", token.getText(), token.getPos().getPosValue()); + } + out.println(); + } + } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException { + try { + if (wordClusterFile != null) { + getLogger().debug("Use word cluster file " + wordClusterFile); + WordClusterPaths.clusterResourceName = wordClusterFile.toString(); + } + + Train.main(new String[] { + tempData.toString(), + targetLocation.toString() + }); + } catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public void destroy() { + IOUtils.closeQuietly(out); + FileUtils.deleteQuietly(tempData); + super.destroy(); + } +} diff --git a/dkpro-core-arktools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTokenizer.java b/dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetTokenizer.java similarity index 89% rename from dkpro-core-arktools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTokenizer.java rename to dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetTokenizer.java index 968248e241..e4fe99d832 100644 --- a/dkpro-core-arktools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTokenizer.java +++ b/dkpro-core-arktools-gpl/src/main/java/org/dkpro/core/arktools/ArktweetTokenizer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.arktools; +package org.dkpro.core.arktools; import java.util.List; @@ -31,11 +31,13 @@ import cmu.arktweetnlp.Twokenize; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; /** * ArkTweet tokenizer. */ -@ResourceMetaData(name="ArkTweet Tokenizer") +@ResourceMetaData(name = "ArkTweet Tokenizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class ArktweetTokenizer extends CasAnnotator_ImplBase { diff --git a/dkpro-core-arktools-gpl/src/scripts/build.xml b/dkpro-core-arktools-gpl/src/scripts/build.xml index b80c46a5a3..dc0a573b89 100644 --- a/dkpro-core-arktools-gpl/src/scripts/build.xml +++ b/dkpro-core-arktools-gpl/src/scripts/build.xml @@ -1,6 +1,6 @@ diff --git a/dkpro-core-arktools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTaggerTest.java b/dkpro-core-arktools-gpl/src/test/java/og/dkpro/core/arktools/ArktweetTaggerTest.java similarity index 89% rename from dkpro-core-arktools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTaggerTest.java rename to dkpro-core-arktools-gpl/src/test/java/og/dkpro/core/arktools/ArktweetTaggerTest.java index b0d9498d5e..ed573131dc 100644 --- a/dkpro-core-arktools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTaggerTest.java +++ b/dkpro-core-arktools-gpl/src/test/java/og/dkpro/core/arktools/ArktweetTaggerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,21 +14,25 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.arktools; +package og.dkpro.core.arktools; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.arktools.ArktweetPosTagger; +import org.dkpro.core.arktools.ArktweetTokenizer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class ArktweetTaggerTest { @@ -52,8 +56,8 @@ public void arktweetTaggerTest() "Spending the day withhh mommma !", new String[] { "Spending", "the", "day", "withhh", "mommma", "!" }, new String[] { "V", "D", "N", "P", "N", "," }, - new String[] { "POS_VERB", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_PUNCT" } - ); + new String[] { "POS_VERB", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", + "POS_PUNCT" }); runTest("en", "default", "lmao ... s/o to the cool ass asian officer 4 #1 not runnin my license and #2 not takin dru boo to jail . Thank u God . #amen", @@ -65,24 +69,23 @@ public void arktweetTaggerTest() new String[] { "POS_INT", "POS_PUNCT", "POS_VERB", "POS_ADP", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADP", "POS_NUM", "POS_ADV", "POS_VERB", "POS_DET", "POS_NOUN", "POS_CONJ", "POS_NUM", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_PUNCT", "POS_VERB", "POS_PRON", "POS_PROPN", "POS_PUNCT", - "POS_HASH" } - ); + "POS_HASH" }); runTest("en", "default", "Different smiley styles :) :-) (^_^) ^o #smiley", new String[] { "Different", "smiley", "styles", ":)", ":-)", "(^_^)", "^o", "#smiley" }, new String[] { "A", "A", "N", "E", "E", "E", "E", "#" }, - new String[] { "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_EMO", "POS_EMO", "POS_EMO", "POS_EMO", "POS_HASH" } - ); + new String[] { "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_EMO", "POS_EMO", "POS_EMO", + "POS_EMO", "POS_HASH" }); runTest("en", "irc", "Different smiley styles :) :-) (^_^) ^o #smiley", new String[] { "Different", "smiley", "styles", ":)", ":-)", "(^_^)", "^o", "#smiley" }, new String[] { "JJ", "JJ", "NNS", "UH", "UH", "UH", "UH", "UH" }, - new String[] { "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_INTJ", "POS_INTJ", "POS_INTJ", "POS_INTJ", "POS_INTJ" } - ); + new String[] { "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_INTJ", "POS_INTJ", "POS_INTJ", + "POS_INTJ", "POS_INTJ" }); runTest("en", "irc", "@Gunservatively obozo will go nuts when PA elects a Republican Governor next Tue. Can you say redistricting?", @@ -101,9 +104,9 @@ public void arktweetTaggerTest() new String[] { "Different", "smiley", "styles", ":)", ":-)", "(^_^)", "^o", "#smiley" }, new String[] { "JJ", "JJ", "NNS", "UH", "UH", "UH", "UH", "HT" }, - new String[] { "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_INTJ", "POS_INTJ", "POS_INTJ", "POS_INTJ", "POS" } - ); - + new String[] { "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_INTJ", "POS_INTJ", "POS_INTJ", + "POS_INTJ", "POS" }); + runTest("en", "ritter", "@Gunservatively obozo will go nuts when PA elects a Republican Governor next Tue. Can you say redistricting?", new String[] { "@Gunservatively", "obozo", "will", "go", "nuts", "when", "PA", @@ -114,7 +117,7 @@ public void arktweetTaggerTest() new String[] { "POS", "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_ADV", "POS_PROPN", "POS_VERB", "POS_DET", "POS_PROPN", "POS_PROPN", "POS_ADJ", "POS_PROPN", "POS_PUNCT", "POS_VERB", "POS_PRON", "POS_VERB", "POS_NOUN", "POS_PUNCT" } - ); + ); } // // Test for issue 335 @@ -127,10 +130,10 @@ public void arktweetTaggerTest() // new String[] { "NN", "NN" } // ); // } - - private JCas runTest(String language, String variant, String testDocument, String[] tokens, String[] tags, - String[] tagClasses) - throws Exception + + private JCas runTest(String language, String variant, String testDocument, String[] tokens, + String[] tags, String[] tagClasses) + throws Exception { AnalysisEngine tokenizer = createEngine( ArktweetTokenizer.class diff --git a/dkpro-core-arktools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTokenizationTest.java b/dkpro-core-arktools-gpl/src/test/java/og/dkpro/core/arktools/ArktweetTokenizationTest.java similarity index 97% rename from dkpro-core-arktools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTokenizationTest.java rename to dkpro-core-arktools-gpl/src/test/java/og/dkpro/core/arktools/ArktweetTokenizationTest.java index 4e58cedf82..82ec66daab 100644 --- a/dkpro-core-arktools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/arktools/ArktweetTokenizationTest.java +++ b/dkpro-core-arktools-gpl/src/test/java/og/dkpro/core/arktools/ArktweetTokenizationTest.java @@ -1,7 +1,5 @@ -package de.tudarmstadt.ukp.dkpro.core.arktools; - -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -16,8 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ +package og.dkpro.core.arktools; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.junit.Assert.assertEquals; @@ -31,11 +30,11 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.arktools.ArktweetTokenizer; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.arktools.ArktweetTokenizer; public class ArktweetTokenizationTest { diff --git a/dkpro-core-asl/pom.xml b/dkpro-core-asl/pom.xml index ace46cc33b..83fbc9024e 100644 --- a/dkpro-core-asl/pom.xml +++ b/dkpro-core-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core + 2.3.0-SNAPSHOT .. - de.tudarmstadt.ukp.dkpro.core-asl + dkpro-core-asl pom DKPro Core ASL + https://dkpro.github.io/dkpro-core/ Apache License Version 2.0 @@ -36,474 +37,509 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.anomaly-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-anomaly-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.datasets-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-datasets-asl + 2.3.0-SNAPSHOT org.dkpro.core dkpro-core-api-embeddings-asl - 1.10.0-SNAPSHOT + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-api-featurepath-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-frequency-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.frequency-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-io-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-coref-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.coref-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-lexmorph-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-metadata-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-ner-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-parameter-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-phonetics-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.phonetics-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-resources-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-segmentation-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-semantics-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.semantics-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-sentiment-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.sentiment-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-structure-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.structure-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-syntax-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-transform-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.transform-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-api-xml-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.castransformation-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-castransformation-asl + 2.3.0-SNAPSHOT org.dkpro.core dkpro-core-cisstem-asl - 1.10.0-SNAPSHOT + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.clearnlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-clearnlp-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.commonscodec-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-commonscodec-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.decompounding-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-decompounding-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.dictionaryannotator-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-dictionaryannotator-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.eval-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-eval-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.flextag-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-flextag-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.frequency-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-frequency-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.gate-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-gate-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.hunpos-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-hunpos-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.fs.hdfs-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-fs-hdfs-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.gosen-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-gosen-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.icu-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-icu-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.aclanthology-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-aclanthology-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.ancora-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-ancora-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.bincas-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-bincas-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.bliki-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-bliki-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.bnc-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-bnc-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.brat-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-brat-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.combination-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-combination-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.conll-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-conll-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.ditop-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-ditop-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.imscwb-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-gigaword-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.json-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-html-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.jdbc-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-imscwb-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.jwpl-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-json-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.lcc-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-jdbc-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.lif-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-jwpl-asl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-io-lcc-asl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-io-lif-asl + 2.3.0-SNAPSHOT org.dkpro.core dkpro-core-io-lxf-asl - 1.10.0-SNAPSHOT + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.negra-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-negra-asl + 2.3.0-SNAPSHOT org.dkpro.core dkpro-core-io-nif-asl - 1.10.0-SNAPSHOT + 2.3.0-SNAPSHOT org.dkpro.core - dkpro-core-io-nyt-asl - 1.10.0-SNAPSHOT + dkpro-core-io-nitf-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.pdf-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-pdf-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.penntree-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-penntree-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.rtf-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-perseus-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.solr-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-pubannotation-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.tcf-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-rtf-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.tei-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-solr-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-tcf-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.tiger-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-tei-asl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-io-text-asl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-io-tiger-asl + 2.3.0-SNAPSHOT org.dkpro.core dkpro-core-io-tika-asl - 1.10.0-SNAPSHOT + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-io-webanno-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.tuebadz-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-tuebadz-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.tuepp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-tuepp-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.web1t-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-web1t-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.xmi-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-xmi-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.xml-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-xml-asl + 2.3.0-SNAPSHOT org.dkpro.core dkpro-core-io-xces-asl - 1.10.0-SNAPSHOT + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.ixa-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-ixa-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.jazzy-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-jazzy-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.jtok-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-jtok-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.languagetool-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-languagetool-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.langdetect-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-langdetect-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.ldweb1t-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-ldweb1t-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.mallet-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-mallet-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.maltparser-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-maltparser-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.morpha-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-morpha-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.mstparser-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-mstparser-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.mecab-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-mecab-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.ngrams-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-mystem-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.nlp4j-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-ngrams-asl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-nlp4j-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.norvig-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-norvig-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.performance-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-performance-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.posfilter-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-posfilter-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.readability-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-readability-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.rftagger-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-rftagger-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.snowball-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-snowball-asl + 2.3.0-SNAPSHOT org.dkpro.core - org-dkpro-core-lancaster-asl - 1.10.0-SNAPSHOT + org-dkpro-core-smile-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stopwordremover-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-testing-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-textcat-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.textcat-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-tokit-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.tokit-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-treetagger-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.treetagger-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-textnormalizer-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.textnormalizer-asl - 1.10.0-SNAPSHOT + org.dkpro.core + org-dkpro-core-udpipe-asl + 2.3.0-SNAPSHOT @@ -529,6 +565,7 @@ ../dkpro-core-api-transform-asl ../dkpro-core-api-ner-asl ../dkpro-core-api-frequency-asl + ../dkpro-core-api-xml-asl ../dkpro-core-fs-hdfs-asl @@ -541,19 +578,21 @@ ../dkpro-core-io-combination-asl ../dkpro-core-io-conll-asl ../dkpro-core-io-ditop-asl + ../dkpro-core-io-gigaword-asl ../dkpro-core-io-imscwb-asl ../dkpro-core-io-html-asl ../dkpro-core-io-json-asl ../dkpro-core-io-jdbc-asl ../dkpro-core-io-jwpl-asl - ../dkpro-core-io-lcc-asl ../dkpro-core-io-lif-asl ../dkpro-core-io-lxf-asl ../dkpro-core-io-negra-asl ../dkpro-core-io-nif-asl - ../dkpro-core-io-nyt-asl + ../dkpro-core-io-nitf-asl ../dkpro-core-io-pdf-asl ../dkpro-core-io-penntree-asl + ../dkpro-core-io-perseus-asl + ../dkpro-core-io-pubannotation-asl ../dkpro-core-io-reuters-asl ../dkpro-core-io-rtf-asl ../dkpro-core-io-solr-asl @@ -565,6 +604,7 @@ ../dkpro-core-io-tuepp-asl ../dkpro-core-io-tuebadz-asl ../dkpro-core-io-web1t-asl + ../dkpro-core-io-webanno-asl ../dkpro-core-io-xmi-asl ../dkpro-core-io-xml-asl ../dkpro-core-io-xces-asl @@ -576,7 +616,10 @@ ../dkpro-core-decompounding-asl ../dkpro-core-dictionaryannotator-asl ../dkpro-core-eval-asl + ../dkpro-core-frequency-asl ../dkpro-core-gate-asl ../dkpro-core-gosen-asl @@ -585,6 +628,7 @@ ../dkpro-core-ixa-asl ../dkpro-core-jazzy-asl ../dkpro-core-jtok-asl + ../dkpro-core-jieba-asl ../dkpro-core-languagetool-asl ../dkpro-core-langdetect-asl ../dkpro-core-ldweb1t-asl @@ -593,6 +637,7 @@ ../dkpro-core-mecab-asl ../dkpro-core-morpha-asl ../dkpro-core-mstparser-asl + ../dkpro-core-mystem-asl ../dkpro-core-ngrams-asl ../dkpro-core-nlp4j-asl ../dkpro-core-norvig-asl @@ -602,13 +647,13 @@ ../dkpro-core-readability-asl ../dkpro-core-rftagger-asl ../dkpro-core-snowball-asl - ../dkpro-core-lancaster-asl - ../dkpro-core-stopwordremover-asl + ../dkpro-core-smile-asl ../dkpro-core-testing-asl ../dkpro-core-textcat-asl ../dkpro-core-tokit-asl ../dkpro-core-treetagger-asl ../dkpro-core-textnormalizer-asl + ../dkpro-core-udpipe-asl @@ -624,29 +669,29 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.annis-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-annis-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.rdf-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-io-rdf-asl + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.cogroo-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-cogroo-asl + 2.3.0-SNAPSHOT org.dkpro.core dkpro-core-kuromoji-asl - 1.10.0-SNAPSHOT + 2.3.0-SNAPSHOT - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.gate-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-gate-asl + 2.3.0-SNAPSHOT @@ -685,51 +730,17 @@ false - - - anc-releases - http://www.anc.org:8080/nexus/content/repositories/releases/ - - true - - - false - - - - - - ../dkpro-core-io-fangorn-asl - ../dkpro-core-io-graf-asl - ../dkpro-core-lbj-asl - ../dkpro-core-udpipe-asl + ../dkpro-core-illinoisnlp-asl - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.fangorn-asl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.graf-asl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.lbj-asl - 1.10.0-SNAPSHOT - org.dkpro.core - org-dkpro-core-udpipe-asl - 1.10.0-SNAPSHOT + dkpro-core-illinoisnlp-asl + 2.3.0-SNAPSHOT @@ -754,9 +765,13 @@ check + true + .factorypath .activate-run-jcasgen .gitignore + .checkstyle + suppressions.xml src/filter/**/* diff --git a/dkpro-core-berkeleyparser-gpl/.license-header.txt b/dkpro-core-berkeleyparser-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-berkeleyparser-gpl/.license-header.txt +++ b/dkpro-core-berkeleyparser-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-berkeleyparser-gpl/LICENSE.txt b/dkpro-core-berkeleyparser-gpl/LICENSE.txt index 6e22a15c3c..99ace43661 100644 --- a/dkpro-core-berkeleyparser-gpl/LICENSE.txt +++ b/dkpro-core-berkeleyparser-gpl/LICENSE.txt @@ -654,7 +654,7 @@ the "copyright" line and a pointer to where the full notice is found. GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. Also add information on how to contact you by electronic and paper mail. diff --git a/dkpro-core-berkeleyparser-gpl/pom.xml b/dkpro-core-berkeleyparser-gpl/pom.xml index 81ba765fda..31547e92f7 100644 --- a/dkpro-core-berkeleyparser-gpl/pom.xml +++ b/dkpro-core-berkeleyparser-gpl/pom.xml @@ -1,6 +1,6 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-gpl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-gpl + 2.3.0-SNAPSHOT ../dkpro-core-gpl - de.tudarmstadt.ukp.dkpro.core.berkeleyparser-gpl + dkpro-core-berkeleyparser-gpl jar DKPro Core GPL - Berkeley Parser + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -38,10 +39,6 @@ org.apache.uima uimafit-core - - commons-io - commons-io - org.apache.commons commons-lang3 @@ -52,28 +49,32 @@ r32 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -81,13 +82,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl + org.dkpro.core + dkpro-core-opennlp-asl test @@ -159,9 +160,9 @@ 20090917.1 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT pom import diff --git a/dkpro-core-berkeleyparser-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/package-info.java b/dkpro-core-berkeleyparser-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/package-info.java deleted file mode 100644 index fbfc819bce..0000000000 --- a/dkpro-core-berkeleyparser-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -/** - * Integration of the Berkeley Parser. - * - * @since 1.5.0 - */ -package de.tudarmstadt.ukp.dkpro.core.berkeleyparser; diff --git a/dkpro-core-berkeleyparser-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/BerkeleyParser.java b/dkpro-core-berkeleyparser-gpl/src/main/java/org/dkpro/core/berkeleyparser/BerkeleyParser.java similarity index 79% rename from dkpro-core-berkeleyparser-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/BerkeleyParser.java rename to dkpro-core-berkeleyparser-gpl/src/main/java/org/dkpro/core/berkeleyparser/BerkeleyParser.java index 3eea2301c3..fa7bcea082 100644 --- a/dkpro-core-berkeleyparser-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/BerkeleyParser.java +++ b/dkpro-core-berkeleyparser-gpl/src/main/java/org/dkpro/core/berkeleyparser/BerkeleyParser.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,14 +14,15 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.berkeleyparser; +package org.dkpro.core.berkeleyparser; -import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createConstituentMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.IOException; import java.io.ObjectInputStream; @@ -47,15 +48,14 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; @@ -67,18 +67,26 @@ import edu.berkeley.nlp.PCFGLA.TreeAnnotations; import edu.berkeley.nlp.syntax.Tree; import edu.berkeley.nlp.util.Numberer; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** - * Berkeley Parser annotator . Requires {@link Sentence}s to be annotated before. + * Berkeley Parser annotator. Requires {@link Sentence}s to be annotated before. * * @see CoarseToFineMaxRuleParser */ -@ResourceMetaData(name="Berkeley Parser") +@Component(OperationType.CONSTITUENCY_PARSER) +@ResourceMetaData(name = "Berkeley Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @OperationalProperties(multipleDeploymentAllowed = false) -@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree" }) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree" }) public class BerkeleyParser extends JCasAnnotator_ImplBase { @@ -96,6 +104,20 @@ public class BerkeleyParser @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the model automatically. */ @@ -103,34 +125,32 @@ public class BerkeleyParser @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** * Location of the mapping file for constituent tags to UIMA types. */ - public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; + public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = + ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) protected String constituentMappingLocation; - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid spaming - * the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - /** * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") @@ -139,8 +159,6 @@ public class BerkeleyParser /** * Sets whether to use or not to use already existing POS tags from another annotator for the * parsing process. - *

- * Default: {@code false} */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") @@ -149,8 +167,6 @@ public class BerkeleyParser /** * Sets whether to create or not to create POS tags. The creation of constituent tags must be * turned on for this to work. - *

- * Default: {@code true} */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "false") @@ -159,8 +175,6 @@ public class BerkeleyParser /** * If this parameter is set to true, each sentence is annotated with a PennTree-Annotation, * containing the whole parse tree in Penn Treebank style format. - *

- * Default: {@code false} */ public static final String PARAM_WRITE_PENN_TREE = ComponentParameters.PARAM_WRITE_PENN_TREE; @ConfigurationParameter(name = PARAM_WRITE_PENN_TREE, mandatory = true, defaultValue = "false") @@ -168,8 +182,6 @@ public class BerkeleyParser /** * Compute Viterbi derivation instead of max-rule tree. - *

- * Default: {@code false} (max-rule) */ public static final String PARAM_VITERBI = "viterbi"; @ConfigurationParameter(name = PARAM_VITERBI, mandatory = true, defaultValue = "false") @@ -177,8 +189,6 @@ public class BerkeleyParser /** * Output sub-categories (only for binarized Viterbi trees). - *

- * Default: {@code false} */ public static final String PARAM_SUBSTATES = "substates"; @ConfigurationParameter(name = PARAM_SUBSTATES, mandatory = true, defaultValue = "false") @@ -186,17 +196,13 @@ public class BerkeleyParser /** * Output inside scores (only for binarized viterbi trees). - *

- * Default: {@code false} */ public static final String PARAM_SCORES = "scores"; @ConfigurationParameter(name = PARAM_SCORES, mandatory = true, defaultValue = "false") private boolean scores; /** - * Set thresholds for accuracy. - *

- * Default: {@code false} (set thresholds for efficiency) + * Set thresholds for accuracy instead of efficiency. */ public static final String PARAM_ACCURATE = "accurate"; @ConfigurationParameter(name = PARAM_ACCURATE, mandatory = true, defaultValue = "false") @@ -204,8 +210,6 @@ public class BerkeleyParser /** * Use variational rule score approximation instead of max-rule - *

- * Default: {@code false} */ public static final String PARAM_VARIATIONAL = "variational"; @ConfigurationParameter(name = PARAM_VARIATIONAL, mandatory = true, defaultValue = "false") @@ -213,8 +217,6 @@ public class BerkeleyParser /** * Retain predicted function labels. Model must have been trained with function labels. - *

- * Default: {@code false} */ public static final String PARAM_KEEP_FUNCTION_LABELS = "keepFunctionLabels"; @ConfigurationParameter(name = PARAM_KEEP_FUNCTION_LABELS, mandatory = true, defaultValue = "false") @@ -222,8 +224,6 @@ public class BerkeleyParser /** * Output binarized trees. - *

- * Default: {@code false} */ public static final String PARAM_BINARIZE = "binarize"; @ConfigurationParameter(name = PARAM_BINARIZE, mandatory = true, defaultValue = "false") @@ -241,10 +241,10 @@ public void initialize(UimaContext aContext) modelProvider = new BerkeleyParserModelProvider(); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); - constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider( + constituentMappingProvider = createConstituentMappingProvider(this, constituentMappingLocation, language, modelProvider); } @@ -260,7 +260,8 @@ public void process(JCas aJCas) for (Sentence sentence : select(aJCas, Sentence.class)) { List tokens = selectCovered(aJCas, Token.class, sentence); - List tokenText = tokens.stream().map(t -> t.getText()).collect(Collectors.toList()); + List tokenText = tokens.stream().map(t -> + t.getText()).collect(Collectors.toList()); List posTags = null; if (readPos) { @@ -283,7 +284,8 @@ public void process(JCas aJCas) parseOutput = TreeAnnotations.unAnnotateTree(parseOutput, keepFunctionLabels); } - createConstituentAnnotationFromTree(aJCas, parseOutput, null, tokens, new MutableInt(0)); + createConstituentAnnotationFromTree(aJCas, parseOutput, null, tokens, + new MutableInt(0)); if (writePennTree) { PennTree pTree = new PennTree(aJCas, sentence.getBegin(), sentence.getEnd()); @@ -319,7 +321,7 @@ private Annotation createConstituentAnnotationFromTree(JCas aJCas, Tree Type posTag = posMappingProvider.getTagType(typeName); POS posAnno = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - posAnno.setPosValue(internTags ? typeName.intern() : typeName); + posAnno.setPosValue(typeName != null ? typeName.intern() : null); POSUtils.assignCoarseValue(posAnno); posAnno.addToIndexes(); token.setPos(posAnno); @@ -376,8 +378,11 @@ private class BerkeleyParserModelProvider { setContextObject(BerkeleyParser.this); - setDefault(ARTIFACT_ID, "${groupId}.berkeleyparser-model-parser-${language}-${variant}"); - setDefault(LOCATION, "classpath:/${package}/lib/parser-${language}-${variant}.bin"); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(ARTIFACT_ID, + "${groupId}.berkeleyparser-model-parser-${language}-${variant}"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/lib/parser-${language}-${variant}.bin"); setDefaultVariantsLocation("${package}/lib/parser-default-variants.map"); setOverride(LOCATION, modelLocation); @@ -389,9 +394,8 @@ private class BerkeleyParserModelProvider protected CoarseToFineMaxRuleParser produceResource(URL aUrl) throws IOException { - ObjectInputStream is = null; - try { - is = new ObjectInputStream(new GZIPInputStream(aUrl.openStream())); + try (ObjectInputStream is = new ObjectInputStream( + new GZIPInputStream(aUrl.openStream()))) { ParserData pData = (ParserData) is.readObject(); Grammar grammar = pData.getGrammar(); @@ -436,9 +440,6 @@ else if ("ROOT".equals(tag)) { catch (ClassNotFoundException e) { throw new IOException(e); } - finally { - closeQuietly(is); - } } }; } diff --git a/dkpro-core-berkeleyparser-gpl/src/main/java/org/dkpro/core/berkeleyparser/package-info.java b/dkpro-core-berkeleyparser-gpl/src/main/java/org/dkpro/core/berkeleyparser/package-info.java new file mode 100644 index 0000000000..8cb88d3efc --- /dev/null +++ b/dkpro-core-berkeleyparser-gpl/src/main/java/org/dkpro/core/berkeleyparser/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +/** + * Integration of the Berkeley Parser. + * + * @since 1.5.0 + */ +package org.dkpro.core.berkeleyparser; diff --git a/dkpro-core-berkeleyparser-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/lib/parser-default-variants.map b/dkpro-core-berkeleyparser-gpl/src/main/resources/org/dkpro/core/berkeleyparser/lib/parser-default-variants.map similarity index 100% rename from dkpro-core-berkeleyparser-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/lib/parser-default-variants.map rename to dkpro-core-berkeleyparser-gpl/src/main/resources/org/dkpro/core/berkeleyparser/lib/parser-default-variants.map diff --git a/dkpro-core-berkeleyparser-gpl/src/scripts/build.xml b/dkpro-core-berkeleyparser-gpl/src/scripts/build.xml index 2b0ccab994..3adb0c2d67 100644 --- a/dkpro-core-berkeleyparser-gpl/src/scripts/build.xml +++ b/dkpro-core-berkeleyparser-gpl/src/scripts/build.xml @@ -1,6 +1,6 @@ diff --git a/dkpro-core-berkeleyparser-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/BerkeleyParserTest.java b/dkpro-core-berkeleyparser-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/BerkeleyParserTest.java deleted file mode 100644 index bbc71ffa1e..0000000000 --- a/dkpro-core-berkeleyparser-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/berkeleyparser/BerkeleyParserTest.java +++ /dev/null @@ -1,455 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.berkeleyparser; - -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertConstituents; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPennTree; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTagset; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectSingle; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang3.ArrayUtils; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class BerkeleyParserTest -{ - static final String documentEnglish = "We need a very complicated example sentence , which " + - "contains as many constituents and dependencies as possible ."; - - @Test - public void testArabic() - throws Exception - { - JCas jcas = runTest("ar", - "نحتاج مثالا معقدا جدا ل جملة تحتوي على أكبر قدر ممكن من العناصر و الروابط ."); - - String[] constituentMapped = { "ROOT 0,75", "X 0,75" }; - - String[] constituentOriginal = { "ROOT 0,75", "X 0,75" }; - - String[] dependencies = {}; - - String pennTree = "(ROOT (ROOT (X (PUNC نحتاج) (PUNC مثالا) (NN معقدا) (NN جدا) (NN ل) (NN جملة) " - + "(NN تحتوي) (NN على) (NN أكبر) (NN قدر) (NN ممكن) (NN من) (NN العناصر) (NN و) (NN الروابط) " - + "(PUNC .))))"; - - String[] posMapped = { "POS_PUNCT", "POS_PUNCT", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", - "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_PUNCT" }; - - String[] posOriginal = { "PUNC", "PUNC", "NN", "NN", "NN", "NN", "NN", "NN", "NN", "NN", - "NN", "NN", "NN", "NN", "NN", "PUNC" }; - - String[] posTags = { "CC", "CD", "DEM", "DT", "IN", "JJ", "NN", "NNP", "NNPS", "NNS", - "NOFUNC", "NUMCOMMA", "PRP", "PRP$", "PUNC", "RB", "RP", "UH", "VB", "VBD", "VBN", - "VBP", "VERB", "WP", "WRB" }; - - String[] constituentTags = { "ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", - "NX", "PP", "PRN", "PRT", "QP", "ROOT", "S", "SBAR", "SBARQ", "SINV", "SQ", "UCP", - "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X" }; - - String[] unmappedPos = { "DEM", "NOFUNC", "NUMCOMMA", "PRP$", "VERB" }; - - String[] unmappedConst = { "LST", "SINV" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "atb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "atb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "atb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "atb", unmappedConst, jcas); - } - - @Test - public void testBulgarian() - throws Exception - { - JCas jcas = runTest("bg", "Имаме нужда от един много сложен пример изречение , " + - "което съдържа най-много съставки и зависимости, колкото е възможно ."); - - String[] constituentMapped = { "ROOT 0,120", "X 0,118", "X 0,120", "X 0,5", "X 100,107", - "X 100,118", "X 108,109", "X 108,118", "X 110,118", "X 12,118", "X 12,14", - "X 15,118", "X 15,19", "X 15,39", "X 20,25", "X 20,32", "X 20,39", "X 26,32", - "X 33,39", "X 40,118", "X 40,49", "X 40,84", "X 50,84", "X 52,57", "X 52,84", - "X 58,65", "X 58,84", "X 6,11", "X 6,118", "X 66,75", "X 66,84", "X 76,84", - "X 85,86", "X 87,118", "X 87,99" }; - - String[] constituentOriginal = { "A 26,32", "APA 20,32", "Adv 100,107", "Adv 110,118", - "Adv 20,25", "Adv 66,75", "Adv 87,99", "AdvPA 87,118", "C 85,86", "CL 100,118", - "CLR 50,84", "Conj 85,86", "ConjArg 40,84", "ConjArg 87,118", "CoordP 40,118", - "M 15,19", "N 33,39", "N 40,49", "N 6,11", "N 76,84", "NPA 15,118", "NPA 15,39", - "NPA 20,39", "NPA 40,84", "NPA 6,118", "NPA 66,84", "PP 12,118", "Prep 12,14", - "Pron 52,57", "ROOT 0,120", "S 0,120", "V 0,5", "V 108,109", "V 58,65", - "VPA 100,118", "VPC 0,118", "VPC 108,118", "VPC 58,84", "VPS 52,84" }; - - String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", - "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS" }; - - String[] posOriginal = { "Vpitf", "Ncfsi", "R", "Mcmsi", "Md", "Amsi", "Ncmsi", "Ncnsi", - "pt", "Pre", "Vpitf", "Md", "Ncmpi", "Cp", "Dm", "Prq", "Vxitf", "Dd", "pt" }; - - String pennTree = "(ROOT (ROOT (S (VPC (V (Vpitf Имаме)) (NPA (N (Ncfsi нужда)) (PP " - + "(Prep (R от)) (NPA (NPA (M (Mcmsi един)) (NPA (APA (Adv (Md много)) (A " - + "(Amsi сложен))) (N (Ncmsi пример)))) (CoordP (ConjArg (NPA (N " - + "(Ncnsi изречение)) (CLR (pt ,) (VPS (Pron (Pre което)) (VPC (V " - + "(Vpitf съдържа)) (NPA (Adv (Md най-много)) (N (Ncmpi съставки)))))))) " - + "(Conj (C (Cp и))) (ConjArg (AdvPA (Adv (Dm зависимости,)) (CL (VPA (Adv " - + "(Prq колкото)) (VPC (V (Vxitf е)) (Adv (Dd възможно)))))))))))) (pt .))))"; - - String[] posTags = { "A", "Afsd", "Afsi", "Ams", "Amsf", "Amsh", "Amsi", - "Ansd", "Ansi", "Cc", "Cp", "Cr", "Cs", "Dd", "Dl", "Dm", "Dq", "Dt", "Hfsi", - "Hmsf", "I", "Mc", "Mcf", "Mcfpd", "Mcfpi", "Mcfsd", "Mcfsi", "Mcm", "Mcmpd", - "Mcmpi", "Mcmsf", "Mcmsi", "Mcn", "Mcnpd", "Mcnpi", "Mcnsd", "Mcnsi", "Md", "Mo", - "Mofsd", "Mofsi", "Momsf", "Momsh", "Momsi", "Monsd", "Monsi", "My", "Nc", "Ncfpd", - "Ncfpi", "Ncfs", "Ncfsd", "Ncfsi", "Ncmpd", "Ncmpi", "Ncms", "Ncmsd", "Ncmsf", - "Ncmsh", "Ncmsi", "Ncmt", "Ncnpd", "Ncnpi", "Ncnsd", "Ncnsi", "Npfsi", "Npnsi", - "Pca", "Pce", "Pcl", "Pcq", "Pct", "Pda", "Pde", "Pdl", "Pdm", "Pdq", "Pds", "Pdt", - "Pfa", "Pfe", "Pfl", "Pfm", "Pfp", "Pfq", "Pft", "Pfy", "Pia", "Pic", "Pie", "Pil", - "Pim", "Pip", "Piq", "Pit", "Pna", "Pne", "Pnl", "Pnm", "Pnp", "Pnt", "Ppe", - "Ppelap1", "Ppelap2", "Ppelap3", "Ppelas1", "Ppelas2", "Ppelas3f", "Ppelas3m", - "Ppelas3n", "Ppeldp1", "Ppelds1", "Ppelds2", "Ppelds3m", "Ppetap1", "Ppetap2", - "Ppetap3", "Ppetas1", "Ppetas2", "Ppetas3f", "Ppetas3m", "Ppetas3n", "Ppetdp1", - "Ppetdp2", "Ppetdp3", "Ppetds1", "Ppetds2", "Ppetds3f", "Ppetds3m", "Ppetds3n", - "Ppetsp1", "Ppetsp2", "Ppetsp3", "Ppetss1", "Ppetss2", "Ppetss3f", "Ppetss3m", - "Pph", "Pphlas2", "Pphtas2", "Pphtds2", "Pphtss2", "Ppxta", "Ppxtd", "Ppxts", - "Pra", "Pre", "Prl", "Prm", "Prp", "Prq", "Prs", "Prt", "Pshl", "Psht", "Psol", - "Psot", "Psxlop", "Psxlos", "Psxto", "Pszl", "Pszt", "R", "Ta", "Te", "Tg", "Ti", - "Tm", "Tn", "Tv", "Tx", "Viitf", "Vniicam", "Vniicao", "Vniif", "Vnitcam", - "Vnitcao", "Vnitf", "Vnpicao", "Vnpif", "Vnptcao", "Vnptf", "Vpiicam", "Vpiicao", - "Vpiicar", "Vpiif", "Vpiig", "Vpiiz", "Vpitcam", "Vpitcao", "Vpitcar", "Vpitcv", - "Vpitf", "Vpitg", "Vpitz", "Vppicam", "Vppicao", "Vppif", "Vppiz", "Vpptcam", - "Vpptcao", "Vpptcv", "Vpptf", "Vpptz", "Vxitcat", "Vxitf", "Vxitu", "Vyptf", - "Vyptz", "abbr", "foreign", "mw", "name", "pt", "w" }; - - String[] constituentTags = { "A", "APA", "APC", "Adv", "AdvPA", "AdvPC", "C", - "CL", "CLCHE", "CLDA", "CLQ", "CLR", "CLZADA", "Conj", "ConjArg", "CoordP", - "Gerund", "H", "M", "N", "NPA", "NPC", "PP", "Participle", "Prep", "Pron", "ROOT", - "S", "T", "V", "VPA", "VPC", "VPF", "VPS", "Verbalised" }; - - String[] unmappedConstituents = { "Conj", "ConjArg", "Verbalised" }; - - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - assertTagset(POS.class, "btb", posTags, jcas); - // FIXME assertTagsetMapping(POS.class, "btb", new String[] {}, jcas); - assertTagset(Constituent.class, "btb", constituentTags, jcas); - assertTagsetMapping(Constituent.class, "btb", unmappedConstituents, jcas); - } - - @Test - public void testChinese() - throws Exception - { - JCas jcas = runTest("zh", - "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); - - String[] constituentMapped = { "ADVP 20,22", "ADVP 9,11", "NP 0,2", "NP 17,19", "NP 23,25", - "NP 23,34", "NP 32,34", "NP 37,40", "NP 37,45", "NP 43,45", "NP 6,34", "NP 6,45", - "NP 6,8", "PARN 20,34", "QP 29,31", "ROOT 0,47", "VP 12,14", "VP 26,28", "VP 3,45", - "VP 9,14", "X 0,47", "X 23,28", "X 37,42", "X 6,14", "X 6,16" }; - - String[] constituentOriginal = { "ADVP 20,22", "ADVP 9,11", "CP 6,16", "DNP 37,42", - "IP 0,47", "IP 23,28", "IP 6,14", "NP 0,2", "NP 17,19", "NP 23,25", "NP 23,34", - "NP 32,34", "NP 37,40", "NP 37,45", "NP 43,45", "NP 6,34", "NP 6,45", "NP 6,8", - "PRN 20,34", "QP 29,31", "ROOT 0,47", "VP 12,14", "VP 26,28", "VP 3,45", "VP 9,14" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NOUN", "POS_ADJ", "POS_VERB", "POS_PART", "POS_NOUN", "POS_ADJ", "POS_NOUN", - "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_PART", "POS_NOUN", "POS_PUNCT" }; - - String[] posOriginal = { "PN", "VV", "NN", "AD", "VA", "DEC", "NN", "AD", "NN", "VV", "CD", - "NN", "CC", "NN", "DEG", "NN", "PU" }; - - String pennTree = "(ROOT (IP (NP (PN 我们)) (VP (VV 需要) (NP (NP (CP (IP (NP (NN 一个)) " - + "(VP (ADVP (AD 非常)) (VP (VA 复杂)))) (DEC 的)) (NP (NN 句子)) (PRN (ADVP " - + "(AD 例如)) (NP (IP (NP (NN 其中)) (VP (VV 包含))) (QP (CD 许多)) (NP " - + "(NN 成分))))) (CC 和) (NP (DNP (NP (NN 尽可能)) (DEG 的)) (NP (NN 依赖))))) " - + "(PU 。)))"; - - String[] posTags = { "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", "DEV", "DT", - "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NP", "NR", "NT", "OD", "P", - "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VP", "VV", "X" }; - - String[] constituentTags = { "ADJP", "ADVP", "CLP", "CP", "DNP", "DP", "DVP", "FRAG", - "INTJ", "IP", "LCP", "LST", "MSP", "NN", "NP", "PP", "PRN", "QP", "ROOT", "UCP", - "VCD", "VCP", "VNV", "VP", "VPT", "VRD", "VSB" }; - - String[] unmappedPos = { "NP", "VP" }; - - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - List trees = new ArrayList(select(jcas, PennTree.class)); - assertPennTree(pennTree, trees.get(0)); - assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - assertTagset(POS.class, "ctb", posTags, jcas); - assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); - assertTagset(Constituent.class, "ctb", constituentTags, jcas); - // FIXME assertTagsetMapping(Constituent.class, "ctb", new String[] {}, jcas); - } - - @Test - public void testEnglish() - throws Exception - { - JCas jcas = runTest("en", documentEnglish); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "ADJP 61,68", "NP 0,2", - "NP 61,98", "NP 8,110", "NP 8,43", "PP 99,110", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "ADJP 61,68", "NP 0,2", - "NP 61,98", "NP 8,110", "NP 8,43", "PP 99,110", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", - "WDT", "VBZ", "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + - "(JJ complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) (S " + - "(VP (VBZ contains) (NP (ADJP (IN as) (JJ many)) (NNS constituents) (CC and) " + - "(NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; - - String[] posTags = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "CC", - "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", - "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", - "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; - - String[] constituentTags = { "ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", - "NAC", "NP", "NX", "PP", "PRN", "PRT", "PRT|ADVP", "QP", "ROOT", "RRC", "S", "SBAR", - "SBARQ", "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X" }; - - String[] unmappedPos = {}; - - String[] unmappedConst = {}; - - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - assertTagset(POS.class, "ptb", posTags, jcas); - assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); - assertTagset(Constituent.class, "ptb", constituentTags, jcas); - // FIXME assertTagsetMapping(Constituent.class, "ptb", unmappedConst, jcas); - } - - @Test - public void testEnglishPreTagged() - throws Exception - { - JCas jcas = runTest("en", null, documentEnglish, true); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", - "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", - "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", - "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", - "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", - "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP " - + "(RB very) (JJ complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP " - + "(WDT which)) (S (VP (VBZ contains) (PP (IN as) (NP (NP (JJ many) " - + "(NNS constituents) (CC and) (NNS dependencies)) (PP (IN as) (ADJP " - + "(JJ possible)))))))))) (. .)))"; - - String[] posTags = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "CC", "CD", "DT", - "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", - "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", - "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; - - String[] constituentTags = { "ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", - "NX", "PP", "PRN", "PRT", "PRT|ADVP", "QP", "ROOT", "RRC", "S", "SBAR", "SBARQ", - "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X" }; - - String[] unmappedPos = {}; - - String[] unmappedConst = {}; - - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - assertTagset(POS.class, "ptb", posTags, jcas); - assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); - assertTagset(Constituent.class, "ptb", constituentTags, jcas); - // FIXME assertTagsetMapping(Constituent.class, "ptb", unmappedConst, - // jcas); - } - - @Test - public void testGerman() - throws Exception - { - JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches " + - "möglichst viele Konstituenten und Dependenzen beinhaltet ."); - - String[] constituentMapped = { "ADJP 17,35", "Constituent 0,113", "NP 13,111", "NP 55,100", - "NP 71,100", "ROOT 0,113", "S 0,111", "S 47,111" }; - - String[] constituentOriginal = { "AP 17,35", "CNP 71,100", "NP 13,111", "NP 55,100", - "PSEUDO 0,113", "ROOT 0,113", "S 0,111", "S 47,111" }; - - String[] posOriginal = { "PPER", "VVFIN", "ART", "ADV", "ADJA", "NN", "$,", "PRELS", "ADV", - "PIDAT", "NN", "KON", "NN", "VVFIN", "$." }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_PUNCT", "POS_PRON", "POS_ADV", - "POS_PRON", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_VERB", "POS_PUNCT" }; - - String pennTree = "(ROOT (PSEUDO (S (PPER Wir) (VVFIN brauchen) (NP (ART ein) (AP " + - "(ADV sehr) (ADJA kompliziertes)) (NN Beispiel) ($, ,) (S (PRELS welches) (NP " + - "(ADV möglichst) (PIDAT viele) (CNP (NN Konstituenten) (KON und) " + - "(NN Dependenzen))) (VVFIN beinhaltet)))) ($. .)))"; - - String[] posTags = { "$*LRB*", "$,", "$.", "*T1*", "*T2*", "*T3*", "*T4*", - "*T5*", "*T6*", "*T7*", "*T8*", "--", "ADJA", "ADJD", "ADV", "APPO", "APPR", - "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS", - "NE", "NN", "PDAT", "PDS", "PIAT", "PIDAT", "PIS", "PPER", "PPOSAT", "PPOSS", - "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", - "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", - "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; - - String[] constituentTags = { "---CJ", "AA", "AP", "AVP", "CAC", "CAP", "CAVP", - "CCP", "CH", "CNP", "CO", "CPP", "CS", "CVP", "CVZ", "DL", "ISU", "MPN", "MTA", - "NM", "NP", "PP", "PSEUDO", "QL", "ROOT", "S", "VP", "VZ" }; - - String[] unmappedPos = { "$*LRB*", "*T1*", "*T2*", "*T3*", "*T4*", "*T5*", - "*T6*", "*T7*", "*T8*", "--" }; - - String[] unmappedConst = { "---CJ", "PSEUDO" }; - - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - assertTagset(POS.class, "stts", posTags, jcas); - assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); - assertTagset(Constituent.class, "negra", constituentTags, jcas); - assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas); - } - - @Test - public void testFrench() - throws Exception - { - JCas jcas = runTest("fr", "Nous avons besoin d' une phrase par exemple très " + - "compliqué , qui contient des constituants que de nombreuses dépendances et que " + - "possible ."); - - String[] constituentMapped = { "ADJP 44,58", "NP 21,90", "NP 36,43", "NP 61,64", - "NP 74,90", "NP 95,120", "PP 18,90", "PP 32,43", "ROOT 0,138", "S 0,138", - "SBAR 124,136", "SBAR 61,90", "SBAR 91,120", "VP 0,17", "VP 65,73" }; - - String[] constituentOriginal = { "AP 44,58", "NP 21,90", "NP 36,43", "NP 61,64", - "NP 74,90", "NP 95,120", "PP 18,90", "PP 32,43", "ROOT 0,138", "SENT 0,138", - "Srel 61,90", "Ssub 124,136", "Ssub 91,120", "VN 0,17", "VN 65,73" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_VERB", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADV", - "POS_ADJ", "POS_PUNCT", "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_CONJ", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_CONJ", - "POS_CONJ", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "CL", "V", "V", "P", "D", "N", "P", "N", "ADV", "A", - ",", "PRO", "V", "D", "N", "C", "D", "A", "N", "C", "C", "A", "." }; - - String pennTree = "(ROOT (ROOT (SENT (VN (CL Nous) (V avons) (V besoin)) (PP (P d') (NP " - + "(D une) (N phrase) (PP (P par) (NP (N exemple))) (AP (ADV très) (A compliqué)) " - + "(, ,) (Srel (NP (PRO qui)) (VN (V contient)) (NP (D des) (N constituants))))) " - + "(Ssub (C que) (NP (D de) (A nombreuses) (N dépendances))) (C et) (Ssub (C que) " - + "(A possible)) (. .))))"; - - String[] posTags = { "\"", ",", "-LRB-", "-RRB-", ".", ":", "A", "ADV", - "ADVP", "Afs", "C", "CC", "CL", "CS", "D", "Dmp", "ET", "I", "N", "ND", "P", "PC", - "PREF", "PRO", "S", "V", "X", "_unknown_", "p", "près" }; - - String[] constituentTags = { "AP", "AdP", "NP", "PP", "ROOT", "SENT", "Sint", - "Srel", "Ssub", "VN", "VPinf", "VPpart" }; - - String[] unmappedPos = { "\"", "-LRB-", "-RRB-", "ADVP", "Afs", "CC", - "CS", "Dmp", "ND", "PC", "S", "X", "_unknown_", "p", "près" }; - - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - assertTagset(POS.class, "ftb", posTags, jcas); - assertTagsetMapping(POS.class, "ftb", unmappedPos, jcas); - assertTagset(Constituent.class, "ftb", constituentTags, jcas); - assertTagsetMapping(Constituent.class, "ftb", new String[] {}, jcas); - } - - /** - * Setup CAS to test parser for the English language (is only called once if - * an English test is run) - */ - private JCas runTest(String aLanguage, String aText) - throws Exception - { - return runTest(aLanguage, null, aText, false); - } - - - private JCas runTest(String aLanguage, String aVariant, String aText, boolean aGoldPos, - Object... aExtraParams) - throws Exception - { - AggregateBuilder aggregate = new AggregateBuilder(); - - if (aGoldPos) { - aggregate.add(createEngineDescription(OpenNlpPosTagger.class)); - } - - Object[] params = new Object[] { - BerkeleyParser.PARAM_VARIANT, aVariant, - BerkeleyParser.PARAM_PRINT_TAGSET, true, - BerkeleyParser.PARAM_WRITE_PENN_TREE, true, - BerkeleyParser.PARAM_WRITE_POS, !aGoldPos, - BerkeleyParser.PARAM_READ_POS, aGoldPos}; - params = ArrayUtils.addAll(params, aExtraParams); - aggregate.add(createEngineDescription(BerkeleyParser.class, params)); - - return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-berkeleyparser-gpl/src/test/java/org/dkpro/core/berkeleyparser/BerkeleyParserTest.java b/dkpro-core-berkeleyparser-gpl/src/test/java/org/dkpro/core/berkeleyparser/BerkeleyParserTest.java new file mode 100644 index 0000000000..7584a10dd1 --- /dev/null +++ b/dkpro-core-berkeleyparser-gpl/src/test/java/org/dkpro/core/berkeleyparser/BerkeleyParserTest.java @@ -0,0 +1,461 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.berkeleyparser; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import static org.dkpro.core.testing.AssertAnnotations.assertConstituents; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertPennTree; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.ArrayUtils; +import org.apache.uima.fit.factory.AggregateBuilder; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.berkeleyparser.BerkeleyParser; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; + +public class BerkeleyParserTest +{ + static final String documentEnglish = "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."; + + @Test + public void testArabic() + throws Exception + { + JCas jcas = runTest("ar", + "نحتاج مثالا معقدا جدا ل جملة تحتوي على أكبر قدر ممكن من العناصر و الروابط ."); + + String[] constituentMapped = { "ROOT 0,75", "X 0,75" }; + + String[] constituentOriginal = { "ROOT 0,75", "X 0,75" }; + + String[] dependencies = {}; + + String pennTree = "(ROOT (ROOT (X (PUNC نحتاج) (PUNC مثالا) (NN معقدا) (NN جدا) (NN ل) (NN جملة) " + + "(NN تحتوي) (NN على) (NN أكبر) (NN قدر) (NN ممكن) (NN من) (NN العناصر) (NN و) (NN الروابط) " + + "(PUNC .))))"; + + String[] posMapped = { "POS_PUNCT", "POS_PUNCT", "POS_NOUN", "POS_NOUN", "POS_NOUN", + "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_NOUN", + "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_PUNCT" }; + + String[] posOriginal = { "PUNC", "PUNC", "NN", "NN", "NN", "NN", "NN", "NN", "NN", "NN", + "NN", "NN", "NN", "NN", "NN", "PUNC" }; + + String[] posTags = { "CC", "CD", "DEM", "DT", "IN", "JJ", "NN", "NNP", "NNPS", "NNS", + "NOFUNC", "NUMCOMMA", "PRP", "PRP$", "PUNC", "RB", "RP", "UH", "VB", "VBD", "VBN", + "VBP", "VERB", "WP", "WRB" }; + + String[] constituentTags = { "ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", + "NX", "PP", "PRN", "PRT", "QP", "ROOT", "S", "SBAR", "SBARQ", "SINV", "SQ", "UCP", + "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X" }; + + String[] unmappedPos = { "DEM", "NOFUNC", "NUMCOMMA", "PRP$", "VERB" }; + + String[] unmappedConst = { "LST", "SINV" }; + + AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); + AssertAnnotations.assertTagset(POS.class, "atb", posTags, jcas); + AssertAnnotations.assertTagsetMapping(POS.class, "atb", unmappedPos, jcas); + AssertAnnotations.assertTagset(Constituent.class, "atb", constituentTags, jcas); + AssertAnnotations.assertTagsetMapping(Constituent.class, "atb", unmappedConst, jcas); + } + + @Test + public void testBulgarian() + throws Exception + { + JCas jcas = runTest("bg", "Имаме нужда от един много сложен пример изречение , " + + "което съдържа най-много съставки и зависимости, колкото е възможно ."); + + String[] constituentMapped = { "ROOT 0,120", "X 0,118", "X 0,120", "X 0,5", "X 100,107", + "X 100,118", "X 108,109", "X 108,118", "X 110,118", "X 12,118", "X 12,14", + "X 15,118", "X 15,19", "X 15,39", "X 20,25", "X 20,32", "X 20,39", "X 26,32", + "X 33,39", "X 40,118", "X 40,49", "X 40,84", "X 50,84", "X 52,57", "X 52,84", + "X 58,65", "X 58,84", "X 6,11", "X 6,118", "X 66,75", "X 66,84", "X 76,84", + "X 85,86", "X 87,118", "X 87,99" }; + + String[] constituentOriginal = { "A 26,32", "APA 20,32", "Adv 100,107", "Adv 110,118", + "Adv 20,25", "Adv 66,75", "Adv 87,99", "AdvPA 87,118", "C 85,86", "CL 100,118", + "CLR 50,84", "Conj 85,86", "ConjArg 40,84", "ConjArg 87,118", "CoordP 40,118", + "M 15,19", "N 33,39", "N 40,49", "N 6,11", "N 76,84", "NPA 15,118", "NPA 15,39", + "NPA 20,39", "NPA 40,84", "NPA 6,118", "NPA 66,84", "PP 12,118", "Prep 12,14", + "Pron 52,57", "ROOT 0,120", "S 0,120", "V 0,5", "V 108,109", "V 58,65", + "VPA 100,118", "VPC 0,118", "VPC 108,118", "VPC 58,84", "VPS 52,84" }; + + String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", + "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS" }; + + String[] posOriginal = { "Vpitf", "Ncfsi", "R", "Mcmsi", "Md", "Amsi", "Ncmsi", "Ncnsi", + "pt", "Pre", "Vpitf", "Md", "Ncmpi", "Cp", "Dm", "Prq", "Vxitf", "Dd", "pt" }; + + String pennTree = "(ROOT (ROOT (S (VPC (V (Vpitf Имаме)) (NPA (N (Ncfsi нужда)) (PP " + + "(Prep (R от)) (NPA (NPA (M (Mcmsi един)) (NPA (APA (Adv (Md много)) (A " + + "(Amsi сложен))) (N (Ncmsi пример)))) (CoordP (ConjArg (NPA (N " + + "(Ncnsi изречение)) (CLR (pt ,) (VPS (Pron (Pre което)) (VPC (V " + + "(Vpitf съдържа)) (NPA (Adv (Md най-много)) (N (Ncmpi съставки)))))))) " + + "(Conj (C (Cp и))) (ConjArg (AdvPA (Adv (Dm зависимости,)) (CL (VPA (Adv " + + "(Prq колкото)) (VPC (V (Vxitf е)) (Adv (Dd възможно)))))))))))) (pt .))))"; + + String[] posTags = { "A", "Afsd", "Afsi", "Ams", "Amsf", "Amsh", "Amsi", + "Ansd", "Ansi", "Cc", "Cp", "Cr", "Cs", "Dd", "Dl", "Dm", "Dq", "Dt", "Hfsi", + "Hmsf", "I", "Mc", "Mcf", "Mcfpd", "Mcfpi", "Mcfsd", "Mcfsi", "Mcm", "Mcmpd", + "Mcmpi", "Mcmsf", "Mcmsi", "Mcn", "Mcnpd", "Mcnpi", "Mcnsd", "Mcnsi", "Md", "Mo", + "Mofsd", "Mofsi", "Momsf", "Momsh", "Momsi", "Monsd", "Monsi", "My", "Nc", "Ncfpd", + "Ncfpi", "Ncfs", "Ncfsd", "Ncfsi", "Ncmpd", "Ncmpi", "Ncms", "Ncmsd", "Ncmsf", + "Ncmsh", "Ncmsi", "Ncmt", "Ncnpd", "Ncnpi", "Ncnsd", "Ncnsi", "Npfsi", "Npnsi", + "Pca", "Pce", "Pcl", "Pcq", "Pct", "Pda", "Pde", "Pdl", "Pdm", "Pdq", "Pds", "Pdt", + "Pfa", "Pfe", "Pfl", "Pfm", "Pfp", "Pfq", "Pft", "Pfy", "Pia", "Pic", "Pie", "Pil", + "Pim", "Pip", "Piq", "Pit", "Pna", "Pne", "Pnl", "Pnm", "Pnp", "Pnt", "Ppe", + "Ppelap1", "Ppelap2", "Ppelap3", "Ppelas1", "Ppelas2", "Ppelas3f", "Ppelas3m", + "Ppelas3n", "Ppeldp1", "Ppelds1", "Ppelds2", "Ppelds3m", "Ppetap1", "Ppetap2", + "Ppetap3", "Ppetas1", "Ppetas2", "Ppetas3f", "Ppetas3m", "Ppetas3n", "Ppetdp1", + "Ppetdp2", "Ppetdp3", "Ppetds1", "Ppetds2", "Ppetds3f", "Ppetds3m", "Ppetds3n", + "Ppetsp1", "Ppetsp2", "Ppetsp3", "Ppetss1", "Ppetss2", "Ppetss3f", "Ppetss3m", + "Pph", "Pphlas2", "Pphtas2", "Pphtds2", "Pphtss2", "Ppxta", "Ppxtd", "Ppxts", + "Pra", "Pre", "Prl", "Prm", "Prp", "Prq", "Prs", "Prt", "Pshl", "Psht", "Psol", + "Psot", "Psxlop", "Psxlos", "Psxto", "Pszl", "Pszt", "R", "Ta", "Te", "Tg", "Ti", + "Tm", "Tn", "Tv", "Tx", "Viitf", "Vniicam", "Vniicao", "Vniif", "Vnitcam", + "Vnitcao", "Vnitf", "Vnpicao", "Vnpif", "Vnptcao", "Vnptf", "Vpiicam", "Vpiicao", + "Vpiicar", "Vpiif", "Vpiig", "Vpiiz", "Vpitcam", "Vpitcao", "Vpitcar", "Vpitcv", + "Vpitf", "Vpitg", "Vpitz", "Vppicam", "Vppicao", "Vppif", "Vppiz", "Vpptcam", + "Vpptcao", "Vpptcv", "Vpptf", "Vpptz", "Vxitcat", "Vxitf", "Vxitu", "Vyptf", + "Vyptz", "abbr", "foreign", "mw", "name", "pt", "w" }; + + String[] constituentTags = { "A", "APA", "APC", "Adv", "AdvPA", "AdvPC", "C", + "CL", "CLCHE", "CLDA", "CLQ", "CLR", "CLZADA", "Conj", "ConjArg", "CoordP", + "Gerund", "H", "M", "N", "NPA", "NPC", "PP", "Participle", "Prep", "Pron", "ROOT", + "S", "T", "V", "VPA", "VPC", "VPF", "VPS", "Verbalised" }; + + String[] unmappedConstituents = { "Conj", "ConjArg", "Verbalised" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); + assertTagset(POS.class, "btb", posTags, jcas); + // FIXME assertTagsetMapping(POS.class, "btb", new String[] {}, jcas); + assertTagset(Constituent.class, "btb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "btb", unmappedConstituents, jcas); + } + + @Test + public void testChinese() + throws Exception + { + JCas jcas = runTest("zh", + "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); + + String[] constituentMapped = { "ADVP 20,22", "ADVP 9,11", "NP 0,2", "NP 17,19", "NP 23,25", + "NP 23,34", "NP 32,34", "NP 37,40", "NP 37,45", "NP 43,45", "NP 6,34", "NP 6,45", + "NP 6,8", "PARN 20,34", "QP 29,31", "ROOT 0,47", "VP 12,14", "VP 26,28", "VP 3,45", + "VP 9,14", "X 0,47", "X 23,28", "X 37,42", "X 6,14", "X 6,16" }; + + String[] constituentOriginal = { "ADVP 20,22", "ADVP 9,11", "CP 6,16", "DNP 37,42", + "IP 0,47", "IP 23,28", "IP 6,14", "NP 0,2", "NP 17,19", "NP 23,25", "NP 23,34", + "NP 32,34", "NP 37,40", "NP 37,45", "NP 43,45", "NP 6,34", "NP 6,45", "NP 6,8", + "PRN 20,34", "QP 29,31", "ROOT 0,47", "VP 12,14", "VP 26,28", "VP 3,45", + "VP 9,14" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NOUN", "POS_ADJ", "POS_VERB", + "POS_PART", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_VERB", "POS_NUM", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_PART", "POS_NOUN", "POS_PUNCT" }; + + String[] posOriginal = { "PN", "VV", "NN", "AD", "VA", "DEC", "NN", "AD", "NN", "VV", "CD", + "NN", "CC", "NN", "DEG", "NN", "PU" }; + + String pennTree = "(ROOT (IP (NP (PN 我们)) (VP (VV 需要) (NP (NP (CP (IP (NP (NN 一个)) " + + "(VP (ADVP (AD 非常)) (VP (VA 复杂)))) (DEC 的)) (NP (NN 句子)) (PRN (ADVP " + + "(AD 例如)) (NP (IP (NP (NN 其中)) (VP (VV 包含))) (QP (CD 许多)) (NP " + + "(NN 成分))))) (CC 和) (NP (DNP (NP (NN 尽可能)) (DEG 的)) (NP (NN 依赖))))) " + + "(PU 。)))"; + + String[] posTags = { "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", "DEV", "DT", + "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NP", "NR", "NT", "OD", "P", + "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VP", "VV", "X" }; + + String[] constituentTags = { "ADJP", "ADVP", "CLP", "CP", "DNP", "DP", "DVP", "FRAG", + "INTJ", "IP", "LCP", "LST", "MSP", "NN", "NP", "PP", "PRN", "QP", "ROOT", "UCP", + "VCD", "VCP", "VNV", "VP", "VPT", "VRD", "VSB" }; + + String[] unmappedPos = { "NP", "VP" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + List trees = new ArrayList(select(jcas, PennTree.class)); + assertPennTree(pennTree, trees.get(0)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); + assertTagset(POS.class, "ctb", posTags, jcas); + assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); + assertTagset(Constituent.class, "ctb", constituentTags, jcas); + // FIXME assertTagsetMapping(Constituent.class, "ctb", new String[] {}, jcas); + } + + @Test + public void testEnglish() + throws Exception + { + JCas jcas = runTest("en", documentEnglish); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "ADJP 61,68", "NP 0,2", + "NP 61,98", "NP 8,110", "NP 8,43", "PP 99,110", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "ADJP 61,68", "NP 0,2", + "NP 61,98", "NP 8,110", "NP 8,43", "PP 99,110", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", + "WDT", "VBZ", "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + + "(JJ complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) (S " + + "(VP (VBZ contains) (NP (ADJP (IN as) (JJ many)) (NNS constituents) (CC and) " + + "(NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; + + String[] posTags = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "CC", + "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", + "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", + "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; + + String[] constituentTags = { "ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", + "NAC", "NP", "NX", "PP", "PRN", "PRT", "PRT|ADVP", "QP", "ROOT", "RRC", "S", "SBAR", + "SBARQ", "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X" }; + + String[] unmappedPos = {}; + + String[] unmappedConst = {}; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); + assertTagset(POS.class, "ptb", posTags, jcas); + assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); + assertTagset(Constituent.class, "ptb", constituentTags, jcas); + // FIXME assertTagsetMapping(Constituent.class, "ptb", unmappedConst, jcas); + } + + @Test + public void testEnglishPreTagged() + throws Exception + { + JCas jcas = runTest("en", null, documentEnglish, true); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", + "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", + "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", + "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", + "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", + "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP " + + "(RB very) (JJ complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP " + + "(WDT which)) (S (VP (VBZ contains) (PP (IN as) (NP (NP (JJ many) " + + "(NNS constituents) (CC and) (NNS dependencies)) (PP (IN as) (ADJP " + + "(JJ possible)))))))))) (. .)))"; + + String[] posTags = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "CC", "CD", "DT", + "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", + "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", + "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; + + String[] constituentTags = { "ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", + "NX", "PP", "PRN", "PRT", "PRT|ADVP", "QP", "ROOT", "RRC", "S", "SBAR", "SBARQ", + "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X" }; + + String[] unmappedPos = {}; + + String[] unmappedConst = {}; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertTagset(POS.class, "ptb", posTags, jcas); + assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); + assertTagset(Constituent.class, "ptb", constituentTags, jcas); + // FIXME assertTagsetMapping(Constituent.class, "ptb", unmappedConst, + // jcas); + } + + @Test + public void testGerman() + throws Exception + { + JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches " + + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); + + String[] constituentMapped = { "ADJP 17,35", "Constituent 0,113", "NP 13,111", "NP 55,100", + "NP 71,100", "ROOT 0,113", "S 0,111", "S 47,111" }; + + String[] constituentOriginal = { "AP 17,35", "CNP 71,100", "NP 13,111", "NP 55,100", + "PSEUDO 0,113", "ROOT 0,113", "S 0,111", "S 47,111" }; + + String[] posOriginal = { "PPER", "VVFIN", "ART", "ADV", "ADJA", "NN", "$,", "PRELS", "ADV", + "PIDAT", "NN", "KON", "NN", "VVFIN", "$." }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_PUNCT", "POS_PRON", "POS_ADV", + "POS_PRON", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_VERB", "POS_PUNCT" }; + + String pennTree = "(ROOT (PSEUDO (S (PPER Wir) (VVFIN brauchen) (NP (ART ein) (AP " + + "(ADV sehr) (ADJA kompliziertes)) (NN Beispiel) ($, ,) (S (PRELS welches) (NP " + + "(ADV möglichst) (PIDAT viele) (CNP (NN Konstituenten) (KON und) " + + "(NN Dependenzen))) (VVFIN beinhaltet)))) ($. .)))"; + + String[] posTags = { "$*LRB*", "$,", "$.", "*T1*", "*T2*", "*T3*", "*T4*", + "*T5*", "*T6*", "*T7*", "*T8*", "--", "ADJA", "ADJD", "ADV", "APPO", "APPR", + "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS", + "NE", "NN", "PDAT", "PDS", "PIAT", "PIDAT", "PIS", "PPER", "PPOSAT", "PPOSS", + "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", + "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", + "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; + + String[] constituentTags = { "---CJ", "AA", "AP", "AVP", "CAC", "CAP", "CAVP", + "CCP", "CH", "CNP", "CO", "CPP", "CS", "CVP", "CVZ", "DL", "ISU", "MPN", "MTA", + "NM", "NP", "PP", "PSEUDO", "QL", "ROOT", "S", "VP", "VZ" }; + + String[] unmappedPos = { "$*LRB*", "*T1*", "*T2*", "*T3*", "*T4*", "*T5*", + "*T6*", "*T7*", "*T8*", "--" }; + + String[] unmappedConst = { "---CJ", "PSEUDO" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); + assertTagset(POS.class, "stts", posTags, jcas); + assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); + assertTagset(Constituent.class, "negra", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas); + } + + @Test + public void testFrench() + throws Exception + { + JCas jcas = runTest("fr", "Nous avons besoin d' une phrase par exemple très " + + "compliqué , qui contient des constituants que de nombreuses dépendances et que " + + "possible ."); + + String[] constituentMapped = { "ADJP 44,58", "NP 21,90", "NP 36,43", "NP 61,64", + "NP 74,90", "NP 95,120", "PP 18,90", "PP 32,43", "ROOT 0,138", "S 0,138", + "SBAR 124,136", "SBAR 61,90", "SBAR 91,120", "VP 0,17", "VP 65,73" }; + + String[] constituentOriginal = { "AP 44,58", "NP 21,90", "NP 36,43", "NP 61,64", + "NP 74,90", "NP 95,120", "PP 18,90", "PP 32,43", "ROOT 0,138", "SENT 0,138", + "Srel 61,90", "Ssub 124,136", "Ssub 91,120", "VN 0,17", "VN 65,73" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_VERB", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADV", + "POS_ADJ", "POS_PUNCT", "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_CONJ", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_CONJ", + "POS_CONJ", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "CL", "V", "V", "P", "D", "N", "P", "N", "ADV", "A", + ",", "PRO", "V", "D", "N", "C", "D", "A", "N", "C", "C", "A", "." }; + + String pennTree = "(ROOT (ROOT (SENT (VN (CL Nous) (V avons) (V besoin)) (PP (P d') (NP " + + "(D une) (N phrase) (PP (P par) (NP (N exemple))) (AP (ADV très) (A compliqué)) " + + "(, ,) (Srel (NP (PRO qui)) (VN (V contient)) (NP (D des) (N constituants))))) " + + "(Ssub (C que) (NP (D de) (A nombreuses) (N dépendances))) (C et) (Ssub (C que) " + + "(A possible)) (. .))))"; + + String[] posTags = { "\"", ",", "-LRB-", "-RRB-", ".", ":", "A", "ADV", + "ADVP", "Afs", "C", "CC", "CL", "CS", "D", "Dmp", "ET", "I", "N", "ND", "P", "PC", + "PREF", "PRO", "S", "V", "X", "_unknown_", "p", "près" }; + + String[] constituentTags = { "AP", "AdP", "NP", "PP", "ROOT", "SENT", "Sint", + "Srel", "Ssub", "VN", "VPinf", "VPpart" }; + + String[] unmappedPos = { "\"", "-LRB-", "-RRB-", "ADVP", "Afs", "CC", + "CS", "Dmp", "ND", "PC", "S", "X", "_unknown_", "p", "près" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); + assertTagset(POS.class, "ftb", posTags, jcas); + assertTagsetMapping(POS.class, "ftb", unmappedPos, jcas); + assertTagset(Constituent.class, "ftb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ftb", new String[] {}, jcas); + } + + /** + * Setup CAS to test parser for the English language (is only called once if + * an English test is run) + */ + private JCas runTest(String aLanguage, String aText) + throws Exception + { + return runTest(aLanguage, null, aText, false); + } + + + private JCas runTest(String aLanguage, String aVariant, String aText, boolean aGoldPos, + Object... aExtraParams) + throws Exception + { + AggregateBuilder aggregate = new AggregateBuilder(); + + if (aGoldPos) { + aggregate.add(createEngineDescription(OpenNlpPosTagger.class)); + } + + Object[] params = new Object[] { + BerkeleyParser.PARAM_VARIANT, aVariant, + BerkeleyParser.PARAM_PRINT_TAGSET, true, + BerkeleyParser.PARAM_WRITE_PENN_TREE, true, + BerkeleyParser.PARAM_WRITE_POS, !aGoldPos, + BerkeleyParser.PARAM_READ_POS, aGoldPos}; + params = ArrayUtils.addAll(params, aExtraParams); + aggregate.add(createEngineDescription(BerkeleyParser.class, params)); + + return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-berkeleyparser-gpl/src/test/resources/log4j.properties b/dkpro-core-berkeleyparser-gpl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-berkeleyparser-gpl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-berkeleyparser-gpl/src/test/resources/log4j2.xml b/dkpro-core-berkeleyparser-gpl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-berkeleyparser-gpl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-build/LICENSE.txt b/dkpro-core-build/LICENSE.txt new file mode 100644 index 0000000000..9ea00377fd --- /dev/null +++ b/dkpro-core-build/LICENSE.txt @@ -0,0 +1,268 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +=== brat === + +Copyright (C) 2010-2012 The brat contributors, all rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +=== JQuery SVG === + +Copyright 2007 - 2014 Keith Wood + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software +and associated documentation files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +=== JQuery JSON === + +Copyright 2009-2011 Brantley Harris +Copyright 2010–2014 Timo Tijhof + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + \ No newline at end of file diff --git a/dkpro-core-build/pom.xml b/dkpro-core-build/pom.xml new file mode 100644 index 0000000000..e8753d985e --- /dev/null +++ b/dkpro-core-build/pom.xml @@ -0,0 +1,30 @@ + + + 4.0.0 + + org.dkpro.core + dkpro-core + 2.3.0-SNAPSHOT + .. + + dkpro-core-build + DKPro Core - Build resources + https://dkpro.github.io/dkpro-core/ + jar + \ No newline at end of file diff --git a/dkpro-core-build/src/main/resources/dkpro-core/checkstyle.xml b/dkpro-core-build/src/main/resources/dkpro-core/checkstyle.xml new file mode 100644 index 0000000000..e5e6fd4483 --- /dev/null +++ b/dkpro-core-build/src/main/resources/dkpro-core/checkstyle.xml @@ -0,0 +1,114 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-build/src/main/resources/dkpro-core/version-rules.xml b/dkpro-core-build/src/main/resources/dkpro-core/version-rules.xml new file mode 100644 index 0000000000..35158d1a83 --- /dev/null +++ b/dkpro-core-build/src/main/resources/dkpro-core/version-rules.xml @@ -0,0 +1,112 @@ + + + .*-alpha[0-9]* + .*-Alpha[0-9]* + .*-ALPHA[0-9]* + .*-beta[0-9]* + .*\.rc[0-9]* + .*-M[0-9]* + .*-b[0-9]+ + .*-b[0-9]+\.[0-9]+ + .*-atlassian.* + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + 20040902.021138 + + + + + + + ^5.* + + + + + + + .* + + + + + + + ^4.* + + + + + + + ^[4-9].* + + + + + + + .* + + + + + + + .* + + + + + + + 2\.[0-9]{2} + + + + + + + 20140113.0 + + + + \ No newline at end of file diff --git a/dkpro-core-castransformation-asl/pom.xml b/dkpro-core-castransformation-asl/pom.xml index 0728aa3099..af0cc671e9 100644 --- a/dkpro-core-castransformation-asl/pom.xml +++ b/dkpro-core-castransformation-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.castransformation-asl + dkpro-core-castransformation-asl jar DKPro Core ASL - CAS Transformation (ASL) + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -36,12 +37,16 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.transform-asl + org.dkpro.core + dkpro-core-api-transform-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -54,23 +59,23 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl + org.dkpro.core + dkpro-core-io-text-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.xmi-asl + org.dkpro.core + dkpro-core-io-xmi-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.tokit-asl + org.dkpro.core + dkpro-core-tokit-asl test diff --git a/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/ApplyChangesAnnotator.java b/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/ApplyChangesAnnotator.java deleted file mode 100644 index 7ca5b92942..0000000000 --- a/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/ApplyChangesAnnotator.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.castransformation; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CASException; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.FSIterator; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; -import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; -import de.tudarmstadt.ukp.dkpro.core.castransformation.internal.AlignmentStorage; - -/** - * Applies changes annotated using a {@link SofaChangeAnnotation}. - * - * @since 1.1.0 - * @see Backmapper - */ -@ResourceMetaData(name="CAS Transformation - Apply") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}) - -public class ApplyChangesAnnotator - extends JCasAnnotator_ImplBase -{ - public static final String VIEW_SOURCE = "source"; - public static final String VIEW_TARGET = "target"; - - public static final String OP_INSERT = "insert"; - public static final String OP_REPLACE = "replace"; - public static final String OP_DELETE = "delete"; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - try { - JCas sourceView = aJCas.getView(VIEW_SOURCE); - JCas targetView = aJCas.createView(VIEW_TARGET); - DocumentMetaData.copy(sourceView, targetView); - applyChanges(sourceView, targetView); - } - catch (CASException e) { - throw new AnalysisEngineProcessException(e); - } - } - - protected void applyChanges(JCas aSourceView, JCas aTargetView) - { - FSIndex idx = aSourceView.getAnnotationIndex(SofaChangeAnnotation.type); - - getLogger().info("Found " + idx.size() + " changes"); - - // Apply all the changes - AlignedString as = new AlignedString(aSourceView.getDocumentText()); - - // Collect all those edits that are going to be executed. - // - // | A | C1 C2 R - // BBBBBB + - - - // BBBBBBBBBB + + + - // BBBBBBBBBBBBBBBBB + + + - // BBBBBBB - + - - // BBBBBBBBBBBBB - + - - // BBBBBBBB - + - - // - if (idx.size() > 0) { - List edits = new ArrayList(); - { - // Get an iterator over all the change annotations. Per UIMA default - // this iterator is sorted first by begin and then by end offsets. - // We will make use of this fact here to skip change annotations that - // are covered by others. The earliest longest change wins - this means - // the one with the smallest begin offset and the largest end offset. - FSIterator it = idx.iterator(); - - SofaChangeAnnotation top = (SofaChangeAnnotation) it.get(); - edits.add(top); - it.moveToNext(); - while (it.isValid()) { - SofaChangeAnnotation b = (SofaChangeAnnotation) it.get(); - if (((top.getBegin() <= b.getBegin()) && // C1 - (top.getEnd() > b.getBegin()) // C2 - ) - || ((top.getBegin() == b.getBegin()) && (top.getEnd() == b.getEnd()))) { - // Found annotation covering current annotation. Skipping - // current annotation. - } - else { - top = b; - edits.add(top); - } - it.moveToNext(); - } - } - - // If we remove or add stuff all offsets right of the change location - // will change and thus the offsets in the change annotation are no - // longer valid. If we move from right to left it works better because - // the left offsets remain stable. - Collections.reverse(edits); - for (SofaChangeAnnotation a : edits) { - if (OP_INSERT.equals(a.getOperation())) { -// getLogger().debug("Performing insert[" + a.getBegin() + "-" + a.getEnd() + "]: [" -// + a.getCoveredText() + "]"); - as.insert(a.getBegin(), a.getValue()); - } - if (OP_DELETE.equals(a.getOperation())) { -// getLogger().debug("Performing delete[" + a.getBegin() + "-" + a.getEnd() + "]: [" -// + a.getCoveredText() + "]"); - as.delete(a.getBegin(), a.getEnd()); - } - if (OP_REPLACE.equals(a.getOperation())) { -// getLogger().debug("Performing replace[" + a.getBegin() + "-" + a.getEnd() + "]: [" -// + a.getCoveredText() + "]"); - as.replace(a.getBegin(), a.getEnd(), a.getValue()); - } - } - } - - // Set the text of the new Sofa - aTargetView.setDocumentText(as.get()); - - // Set document language - aTargetView.setDocumentLanguage(aSourceView.getDocumentLanguage()); - - // Optionally we may want to remember the AlignedString for the backmapper. - AlignmentStorage.getInstance().put(aSourceView.getCasImpl().getBaseCAS(), - aSourceView.getViewName(), aTargetView.getViewName(), as); - } -} diff --git a/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/Backmapper.java b/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/Backmapper.java deleted file mode 100644 index a1dae4d7fa..0000000000 --- a/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/Backmapper.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.castransformation; - -import static org.apache.uima.fit.util.CasUtil.selectAllFS; - -import java.util.LinkedList; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.AnnotationBaseFS; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.internal.util.IntListIterator; -import org.apache.uima.internal.util.PositiveIntSet; -import org.apache.uima.internal.util.PositiveIntSet_impl; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; -import org.apache.uima.util.CasCopier; - -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.ImmutableInterval; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.Interval; -import de.tudarmstadt.ukp.dkpro.core.castransformation.internal.AlignmentStorage; - -/** - * After processing a file with the {@code ApplyChangesAnnotator} this annotator - * can be used to map the annotations created in the cleaned view back to the - * original view. - * - * @see ApplyChangesAnnotator - */ -@ResourceMetaData(name="CAS Transformation - Map back") -public class Backmapper - extends JCasAnnotator_ImplBase -{ - /** - * Chain of views for backmapping. This should be the reverse of the chain of views that the - * {@code ApplyChangesAnnotator} has used. - * - * For example, if view A has been mapped to B using {@code ApplyChangesAnnotator}, then this - * parameter should be set using an array containing [B, A]. - */ - public static final String PARAM_CHAIN = "Chain"; - - @ConfigurationParameter(name = PARAM_CHAIN, mandatory = false, defaultValue = {ApplyChangesAnnotator.VIEW_SOURCE, - ApplyChangesAnnotator.VIEW_TARGET}) - protected LinkedList sofaChain = new LinkedList<>(); - - @Override - public void process(final JCas aJCas) - throws AnalysisEngineProcessException - { - try { - // Now we can copy the complete CAS while mapping back the offsets. - // We first use the CAS copier and then update the offsets. - getLogger().info("Copying annotations from [" + sofaChain.getFirst() + "] to [" - + sofaChain.getLast() + "]"); - - // Copy the annotations - CAS sourceView = aJCas.getCas().getView(sofaChain.getFirst()); - CAS targetView = aJCas.getCas().getView(sofaChain.getLast()); - Feature mDestSofaFeature = targetView.getTypeSystem() - .getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA); - CasCopier cc = new CasCopier(sourceView, targetView); - int docAnno = sourceView.getLowLevelCAS().ll_getFSRef(sourceView.getDocumentAnnotation()); - final PositiveIntSet copiedFs = new PositiveIntSet_impl(); - for (FeatureStructure fs : selectAllFS(sourceView)) { - int ref = sourceView.getLowLevelCAS().ll_getFSRef(fs); - if (ref == docAnno) { - // Skip document annotation - continue; - } - - // This returns either a new copy -- or -- if an FS has been copied as a - // transitively referenced feature of another FS, it will return an existing copy - FeatureStructure fsCopy = cc.copyFs(fs); - - // Make sure that the sofa annotation in the copy is set - if (fs instanceof AnnotationBaseFS) { - FeatureStructure sofa = fsCopy.getFeatureValue(mDestSofaFeature); - if (sofa == null) { - fsCopy.setFeatureValue(mDestSofaFeature, targetView.getSofa()); - } - } - - // We will still update the offsets, so we do not index the copy just yet - copiedFs.add(targetView.getLowLevelCAS().ll_getFSRef(fsCopy)); - } - - // Get the final target view - JCas targetViewJCas = aJCas.getView(sofaChain.getLast()); - - LinkedList workChain = new LinkedList<>(sofaChain); - String target = workChain.poll(); - String source = null; - - do { - source = target; - target = workChain.poll(); - - // Ok, so now we update the offsets. - String realSource = aJCas.getCas().getView(source).getViewName(); - String realTarget = aJCas.getCas().getView(target).getViewName(); - - AlignedString as = getAlignedString(aJCas, realSource, realTarget); - - updateOffsets(sourceView, targetViewJCas, as, copiedFs); - } - while (!workChain.isEmpty()); - - // Now we index the copied FSes again - IntListIterator it = copiedFs.iterator(); - while (it.hasNext()) { - FeatureStructure fs = targetView.getLowLevelCAS().ll_getFSForRef(it.next()); - targetView.addFsToIndexes(fs); - } - } - catch (UIMAException e) { - throw new AnalysisEngineProcessException(e); - } - } - - private AlignedString getAlignedString(JCas aSomeCase, String from, String to) - throws AnalysisEngineProcessException - { - CAS baseCas = aSomeCase.getCasImpl().getBaseCAS(); - - // Try to get the AlignedString for the current JCas. - AlignmentStorage asstore = AlignmentStorage.getInstance(); - AlignedString as = asstore.get(baseCas, to, from); - - // If there is none we have to fail. - if (as == null) { - throw new AnalysisEngineProcessException(new IllegalStateException( - "No mapping found from [" + from + "] to [" + to + "] on [" - + baseCas.hashCode() + "]")); - } - - return as; - } - - private void updateOffsets(CAS sourceView, JCas targetView, AlignedString as, - PositiveIntSet aCopiedFs) - throws CASException, AnalysisEngineProcessException - { - // We only update annotations that were copied, nothing that was already there. - IntListIterator it = aCopiedFs.iterator(); - - while (it.hasNext()) { - FeatureStructure fs = targetView.getLowLevelCas().ll_getFSForRef(it.next()); - if(fs instanceof Annotation) { - - // Now we update the offsets - Annotation a = (Annotation) fs; -// System.out.printf("Orig %s %3d %3d : %s%n", a.getType().getShortName(), -// a.getBegin(), a.getEnd(), -// sourceView.getDocumentText().substring(a.getBegin(), a.getEnd())); -// System.out.printf("Before %s %3d %3d : %s%n", a.getType().getShortName(), -// a.getBegin(), a.getEnd(), a.getCoveredText()); - Interval resolved = as.resolve(new ImmutableInterval(a.getBegin(), a.getEnd())); - a.setBegin(resolved.getStart()); - a.setEnd(resolved.getEnd()); -// System.out.printf("After %s %3d %3d : %s%n", a.getType().getShortName(), -// a.getBegin(), a.getEnd(), a.getCoveredText()); - } - } - } -} diff --git a/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/internal/AlignmentStorage.java b/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/internal/AlignmentStorage.java deleted file mode 100644 index b9bb2b018f..0000000000 --- a/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/internal/AlignmentStorage.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.castransformation.internal; - -import java.util.HashMap; -import java.util.Map; -import java.util.WeakHashMap; - -import org.apache.uima.cas.CAS; - -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; -import de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator; -import de.tudarmstadt.ukp.dkpro.core.castransformation.Backmapper; - -/** - * Use to smuggle the alignment state from the {@link ApplyChangesAnnotator} to the - * {@link Backmapper}. - * - * @since 1.1.0 - */ -public class AlignmentStorage -{ - private static AlignmentStorage instance; - - private Map> mmap; - - { - mmap = new WeakHashMap>(); - } - - public static synchronized AlignmentStorage getInstance() - { - if (instance == null) { - instance = new AlignmentStorage(); - } - return instance; - } - - public AlignedString get(final CAS aCas, final String from, final String to) - { - Map map = mmap.get(aCas); - if (map == null) { - return null; - } - return map.get(new Key(from, to)); - } - - public void put(final CAS aCas, final String from, final String to, final AlignedString aAs) - { - Map map = mmap.get(aCas); - if (map == null) { - map = new HashMap(); - mmap.put(aCas, map); - } - - System.out.println("Adding from [" + from + "] to [" + to + "] on [" + aCas.hashCode() - + "]"); - map.put(new Key(from, to), aAs); - } - - private static class Key - { - final String from; - final String to; - - public Key(final String aFrom, final String aTo) - { - from = aFrom; - to = aTo; - } - - @Override - public int hashCode() - { - final int prime = 31; - int result = 1; - result = prime * result + ((from == null) ? 0 : from.hashCode()); - result = prime * result + ((to == null) ? 0 : to.hashCode()); - return result; - } - - @Override - public boolean equals(Object obj) - { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - Key other = (Key) obj; - if (from == null) { - if (other.from != null) { - return false; - } - } - else if (!from.equals(other.from)) { - return false; - } - if (to == null) { - if (other.to != null) { - return false; - } - } - else if (!to.equals(other.to)) { - return false; - } - return true; - } - } -} diff --git a/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/package-info.java b/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/package-info.java deleted file mode 100644 index 57f90aa676..0000000000 --- a/dkpro-core-castransformation-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/castransformation/package-info.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Components for working with texts that require to be corrected or otherwise changed in order to be - * analyzed further. Modifications are tracked and analysises on the modified text can be related to - * the original texts. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.castransformation; diff --git a/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/ApplyChangesAnnotator.java b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/ApplyChangesAnnotator.java new file mode 100644 index 0000000000..f8eb6c34ef --- /dev/null +++ b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/ApplyChangesAnnotator.java @@ -0,0 +1,89 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.castransformation; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CASException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.alignment.AlignedString; +import org.dkpro.core.castransformation.internal.AlignmentFactory; +import org.dkpro.core.castransformation.internal.AlignmentStorage; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Applies changes annotated using a {@link SofaChangeAnnotation}. + * + * @since 1.1.0 + * @see Backmapper + */ +@ResourceMetaData(name = "CAS Transformation - Apply") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}) + +public class ApplyChangesAnnotator + extends JCasAnnotator_ImplBase +{ + public static final String VIEW_SOURCE = "source"; + public static final String VIEW_TARGET = "target"; + + public static final String OP_INSERT = "insert"; + public static final String OP_REPLACE = "replace"; + public static final String OP_DELETE = "delete"; + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + try { + JCas sourceView = aJCas.getView(VIEW_SOURCE); + JCas targetView = aJCas.createView(VIEW_TARGET); + DocumentMetaData.copy(sourceView, targetView); + applyChanges(sourceView, targetView); + } + catch (CASException e) { + throw new AnalysisEngineProcessException(e); + } + } + + protected void applyChanges(JCas aSourceView, JCas aTargetView) + { + AlignedString as = AlignmentFactory.createAlignmentsFor(aSourceView); + + // Set the text of the new Sofa + aTargetView.setDocumentText(as.get()); + + // Set document language + aTargetView.setDocumentLanguage(aSourceView.getDocumentLanguage()); + + // Optionally we may want to remember the AlignedString for the backmapper. + AlignmentStorage.getInstance().put(aSourceView.getCasImpl().getBaseCAS(), + aSourceView.getViewName(), aTargetView.getViewName(), as); + } +} diff --git a/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/Backmapper.java b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/Backmapper.java new file mode 100644 index 0000000000..5d83257329 --- /dev/null +++ b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/Backmapper.java @@ -0,0 +1,211 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.castransformation; + +import static org.apache.uima.fit.util.CasUtil.selectAllFS; + +import java.util.LinkedList; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AnnotationBaseFS; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.internal.util.IntListIterator; +import org.apache.uima.internal.util.PositiveIntSet; +import org.apache.uima.internal.util.PositiveIntSet_impl; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.util.CasCopier; +import org.apache.uima.util.Logger; +import org.dkpro.core.api.transform.alignment.AlignedString; +import org.dkpro.core.api.transform.alignment.ImmutableInterval; +import org.dkpro.core.api.transform.alignment.Interval; +import org.dkpro.core.castransformation.internal.AlignmentFactory; +import org.dkpro.core.castransformation.internal.AlignmentStorage; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * After processing a file with the {@code ApplyChangesAnnotator} this annotator + * can be used to map the annotations created in the cleaned view back to the + * original view. + *

+ * This annotator is able to resume the mapping after a CAS restore from any point after the cleaned + * view has been created, as long as no changes were made to SofaChangeAnnotations in the original + * view. + * @see ApplyChangesAnnotator + */ +@ResourceMetaData(name = "CAS Transformation - Map back") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class Backmapper + extends JCasAnnotator_ImplBase +{ + /** + * Chain of views for backmapping. This should be the reverse of the chain of views that the + * {@code ApplyChangesAnnotator} has used. + * + * For example, if view A has been mapped to B using {@code ApplyChangesAnnotator}, then this + * parameter should be set using an array containing [B, A]. + */ + public static final String PARAM_CHAIN = "Chain"; + + @ConfigurationParameter(name = PARAM_CHAIN, mandatory = false, defaultValue = { + ApplyChangesAnnotator.VIEW_SOURCE, ApplyChangesAnnotator.VIEW_TARGET}) + protected LinkedList sofaChain = new LinkedList<>(); + + @Override + public void process(final JCas aJCas) + throws AnalysisEngineProcessException + { + try { + // Now we can copy the complete CAS while mapping back the offsets. + // We first use the CAS copier and then update the offsets. + getLogger().info("Copying annotations from [" + sofaChain.getFirst() + + "] to [" + sofaChain.getLast() + "]"); + + // Copy the annotations + CAS sourceView = aJCas.getCas().getView(sofaChain.getFirst()); + CAS targetView = aJCas.getCas().getView(sofaChain.getLast()); + Feature mDestSofaFeature = targetView.getTypeSystem() + .getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA); + CasCopier cc = new CasCopier(sourceView, targetView); + int docAnno = sourceView.getLowLevelCAS() + .ll_getFSRef(sourceView.getDocumentAnnotation()); + final PositiveIntSet copiedFs = new PositiveIntSet_impl(); + for (FeatureStructure fs : selectAllFS(sourceView)) { + int ref = sourceView.getLowLevelCAS().ll_getFSRef(fs); + if (ref == docAnno) { + // Skip document annotation + continue; + } + + // This returns either a new copy -- or -- if an FS has been copied as a + // transitively referenced feature of another FS, it will return an existing copy + FeatureStructure fsCopy = cc.copyFs(fs); + + // Make sure that the sofa annotation in the copy is set + if (fs instanceof AnnotationBaseFS) { + FeatureStructure sofa = fsCopy.getFeatureValue(mDestSofaFeature); + if (sofa == null) { + fsCopy.setFeatureValue(mDestSofaFeature, targetView.getSofa()); + } + } + + // We will still update the offsets, so we do not index the copy just yet + copiedFs.add(targetView.getLowLevelCAS().ll_getFSRef(fsCopy)); + } + + // Get the final target view + JCas targetViewJCas = aJCas.getView(sofaChain.getLast()); + + LinkedList workChain = new LinkedList<>(sofaChain); + String target = workChain.poll(); + String source = null; + + do { + source = target; + target = workChain.poll(); + + // Ok, so now we update the offsets. + String realSource = aJCas.getCas().getView(source).getViewName(); + String realTarget = aJCas.getCas().getView(target).getViewName(); + + AlignedString as = getAlignedString(aJCas, realSource, realTarget); + + updateOffsets(sourceView, targetViewJCas, as, copiedFs); + } + while (!workChain.isEmpty()); + + // Now we index the copied FSes again + IntListIterator it = copiedFs.iterator(); + while (it.hasNext()) { + FeatureStructure fs = targetView.getLowLevelCAS().ll_getFSForRef(it.next()); + targetView.addFsToIndexes(fs); + } + } + catch (UIMAException e) { + throw new AnalysisEngineProcessException(e); + } + } + + private AlignedString getAlignedString(JCas aSomeCase, String from, String to) + throws AnalysisEngineProcessException, CASException { + CAS baseCas = aSomeCase.getCasImpl().getBaseCAS(); + + // Try to get the AlignedString for the current JCas. + AlignmentStorage asstore = AlignmentStorage.getInstance(); + AlignedString as = asstore.get(baseCas, to, from); + + if (as == null) { + // Attempt to reconstruct the alignment from the SofaChangeAnnotations. + // This only works when they have not been altered in the mean time. + Logger logger = getLogger(); + if (logger.isInfoEnabled()) { + logger.info("No mapping found from [" + from + "] to [" + to + "] on [" + + baseCas.hashCode() + "]. " + + "Restoring mapping from SofaChangeAnnotation found in [" + to + "]." + ); + } + JCas view = aSomeCase.getCas().getView(to).getJCas(); + as = AlignmentFactory.createAlignmentsFor(view); + } + + // If there is none we have to fail. Practically this should never happen + // when the alignment state is reconstructed in the previous step. + if (as == null) { + throw new AnalysisEngineProcessException(new IllegalStateException( + "No mapping found from [" + from + "] to [" + to + "] on [" + + baseCas.hashCode() + "]")); + } + + return as; + } + + private void updateOffsets(CAS sourceView, JCas targetView, AlignedString as, + PositiveIntSet aCopiedFs) + throws CASException, AnalysisEngineProcessException + { + // We only update annotations that were copied, nothing that was already there. + IntListIterator it = aCopiedFs.iterator(); + + while (it.hasNext()) { + FeatureStructure fs = targetView.getLowLevelCas().ll_getFSForRef(it.next()); + if (fs instanceof Annotation) { + + // Now we update the offsets + Annotation a = (Annotation) fs; +// System.out.printf("Orig %s %3d %3d : %s%n", a.getType().getShortName(), +// a.getBegin(), a.getEnd(), +// sourceView.getDocumentText().substring(a.getBegin(), a.getEnd())); +// System.out.printf("Before %s %3d %3d : %s%n", a.getType().getShortName(), +// a.getBegin(), a.getEnd(), a.getCoveredText()); + Interval resolved = as.resolve(new ImmutableInterval(a.getBegin(), a.getEnd())); + a.setBegin(resolved.getStart()); + a.setEnd(resolved.getEnd()); +// System.out.printf("After %s %3d %3d : %s%n", a.getType().getShortName(), +// a.getBegin(), a.getEnd(), a.getCoveredText()); + } + } + } +} diff --git a/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/internal/AlignmentFactory.java b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/internal/AlignmentFactory.java new file mode 100644 index 0000000000..1bdf52dcfd --- /dev/null +++ b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/internal/AlignmentFactory.java @@ -0,0 +1,111 @@ +/* + * Copyright 2018 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.castransformation.internal; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.uima.cas.FSIndex; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.transform.alignment.AlignedString; +import org.dkpro.core.castransformation.ApplyChangesAnnotator; +import org.dkpro.core.castransformation.Backmapper; + +import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; + +/** + * Creates alignment state for the {@link ApplyChangesAnnotator} and the {@link Backmapper} using + * {@link de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation} found in the view + * that was mapped. + * + * @since 1.9.3 + */ + +public class AlignmentFactory { + + public static AlignedString createAlignmentsFor(JCas aSourceView) { + + FSIndex idx = aSourceView.getAnnotationIndex(SofaChangeAnnotation.type); + + // Apply all the changes + AlignedString alignmentState = new AlignedString(aSourceView.getDocumentText()); + + // Collect all those edits that are going to be executed. + // + // | A | C1 C2 R + // BBBBBB + - - + // BBBBBBBBBB + + + + // BBBBBBBBBBBBBBBBB + + + + // BBBBBBB - + - + // BBBBBBBBBBBBB - + - + // BBBBBBBB - + - + // + if (idx.size() > 0) { + List edits = new ArrayList(); + { + // Get an iterator over all the change annotations. Per UIMA default + // this iterator is sorted first by begin and then by end offsets. + // We will make use of this fact here to skip change annotations that + // are covered by others. The earliest longest change wins - this means + // the one with the smallest begin offset and the largest end offset. + FSIterator it = idx.iterator(); + + SofaChangeAnnotation top = (SofaChangeAnnotation) it.get(); + edits.add(top); + it.moveToNext(); + while (it.isValid()) { + SofaChangeAnnotation b = (SofaChangeAnnotation) it.get(); + if (((top.getBegin() <= b.getBegin()) && // C1 + (top.getEnd() > b.getBegin()) // C2 + ) + || ((top.getBegin() == b.getBegin()) && (top.getEnd() == b.getEnd()))) { + // Found annotation covering current annotation. Skipping + // current annotation. + } + else { + top = b; + edits.add(top); + } + it.moveToNext(); + } + } + + // If we remove or add stuff all offsets right of the change location + // will change and thus the offsets in the change annotation are no + // longer valid. If we move from right to left it works better because + // the left offsets remain stable. + Collections.reverse(edits); + for (SofaChangeAnnotation a : edits) { + if (ApplyChangesAnnotator.OP_INSERT.equals(a.getOperation())) { + alignmentState.insert(a.getBegin(), a.getValue()); + } + if (ApplyChangesAnnotator.OP_DELETE.equals(a.getOperation())) { + alignmentState.delete(a.getBegin(), a.getEnd()); + } + if (ApplyChangesAnnotator.OP_REPLACE.equals(a.getOperation())) { + alignmentState.replace(a.getBegin(), a.getEnd(), a.getValue()); + } + } + } + + return alignmentState; + } +} diff --git a/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/internal/AlignmentStorage.java b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/internal/AlignmentStorage.java new file mode 100644 index 0000000000..98c4d3993a --- /dev/null +++ b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/internal/AlignmentStorage.java @@ -0,0 +1,130 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.castransformation.internal; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.WeakHashMap; + +import org.apache.uima.cas.CAS; +import org.dkpro.core.api.transform.alignment.AlignedString; +import org.dkpro.core.castransformation.ApplyChangesAnnotator; +import org.dkpro.core.castransformation.Backmapper; + +/** + * Use to smuggle the alignment state from the {@link ApplyChangesAnnotator} to the + * {@link Backmapper}. + * + * @since 1.1.0 + */ +public class AlignmentStorage +{ + private static AlignmentStorage instance; + + private Map> mmap; + + { + // WeakHashMap is not threadsafe, so we need to wrap it, because it will likely be + // invoked by many concurrent pipeline threads. + mmap = Collections.synchronizedMap(new WeakHashMap>()); + } + + public static synchronized AlignmentStorage getInstance() + { + if (instance == null) { + instance = new AlignmentStorage(); + } + return instance; + } + + public AlignedString get(final CAS aCas, final String from, final String to) + { + Map map = mmap.get(aCas); + if (map == null) { + return null; + } + return map.get(new Key(from, to)); + } + + public void put(final CAS aCas, final String from, final String to, final AlignedString aAs) + { + Map map = mmap.computeIfAbsent(aCas, k -> new HashMap<>()); + // No reason to keep the internal map synchronized because it's specific to the CAS, and + // it is assumed that two different pipeline threads in any execution environment never + // manipulate the same CAS instance simultaneously. + //System.out.println("Adding from [" + from + "] to [" + to + "] on [" + aCas.hashCode() + // + "]"); + map.put(new Key(from, to), aAs); + + } + + private static class Key + { + final String from; + final String to; + + public Key(final String aFrom, final String aTo) + { + from = aFrom; + to = aTo; + } + + @Override + public int hashCode() + { + final int prime = 31; + int result = 1; + result = prime * result + ((from == null) ? 0 : from.hashCode()); + result = prime * result + ((to == null) ? 0 : to.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + Key other = (Key) obj; + if (from == null) { + if (other.from != null) { + return false; + } + } + else if (!from.equals(other.from)) { + return false; + } + if (to == null) { + if (other.to != null) { + return false; + } + } + else if (!to.equals(other.to)) { + return false; + } + return true; + } + } +} diff --git a/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/package-info.java b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/package-info.java new file mode 100644 index 0000000000..d6af9553f4 --- /dev/null +++ b/dkpro-core-castransformation-asl/src/main/java/org/dkpro/core/castransformation/package-info.java @@ -0,0 +1,26 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Components for working with texts that require to be corrected or otherwise changed in order to + * be analyzed further. Modifications are tracked and analyses on the modified text can be related + * to the original texts. + * + * @since 1.1.0 + */ +package org.dkpro.core.castransformation; diff --git a/dkpro-core-castransformation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/castransformation/ApplyChangesBackmapperTest.java b/dkpro-core-castransformation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/castransformation/ApplyChangesBackmapperTest.java deleted file mode 100644 index 232a19fc50..0000000000 --- a/dkpro-core-castransformation-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/castransformation/ApplyChangesBackmapperTest.java +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.castransformation; - -import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.EOLUtils; -import de.tudarmstadt.ukp.dkpro.core.testing.dumper.CasDumpWriter; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; -import org.apache.commons.io.FileUtils; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.AnnotationBase; -import org.junit.Rule; -import org.junit.Test; - -import java.io.File; -import java.io.FileWriter; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; - -public class ApplyChangesBackmapperTest -{ - public static final String TARGET_VIEW = "TargetView"; - - @Test - public void test() - throws Exception - { - File output = testContext.getTestOutputFolder(); - File inputFile = new File("src/test/resources/input.txt"); - File dumpFile = new File(output, "output.txt"); - String pipelineFilePath = new File(output, "pipeline.xml").getPath(); - - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, inputFile, TextReader.PARAM_LANGUAGE, "en"); - - AnalysisEngineDescription deletes = createEngineDescription(SofaDeleteAnnotator.class); - - AnalysisEngineDescription applyChanges = createEngineDescription( - ApplyChangesAnnotator.class); - - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription backMapper = createEngineDescription(Backmapper.class, - Backmapper.PARAM_CHAIN, new String[] { TARGET_VIEW, CAS.NAME_DEFAULT_SOFA }); - - AnalysisEngineDescription xmiWriter = createEngineDescription(XmiWriter.class, - XmiWriter.PARAM_TARGET_LOCATION, output); - - AnalysisEngineDescription dumpWriter = createEngineDescription(CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, dumpFile); - - AggregateBuilder builder = new AggregateBuilder(); - builder.add(deletes); // Removing some lines to make sure to confuse the backmapper - builder.add(applyChanges, ApplyChangesAnnotator.VIEW_TARGET, TARGET_VIEW, - ApplyChangesAnnotator.VIEW_SOURCE, CAS.NAME_DEFAULT_SOFA); - builder.add(segmenter, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); - builder.add(backMapper); - builder.add(xmiWriter, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); - builder.add(dumpWriter, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); - AnalysisEngineDescription pipeline = builder.createAggregateDescription(); - - try (FileWriter writer = new FileWriter(pipelineFilePath)) { - pipeline.toXML(writer); - } - - SimplePipeline.runPipeline(reader, pipeline); - - String expected = FileUtils.readFileToString(new File("src/test/resources/output.txt"), - "UTF-8"); - String actual = FileUtils.readFileToString(dumpFile, "UTF-8"); - - expected = EOLUtils.normalizeLineEndings(expected); - actual = EOLUtils.normalizeLineEndings(actual); - assertEquals(expected, actual); - } - - public static class SofaDeleteAnnotator - extends JCasAnnotator_ImplBase - { - @Override - public void process(JCas jCas) - throws AnalysisEngineProcessException - { - try { - // Removes some "sentences" in a deterministic way. Assumes there are at least 5 - // sentences though :-) - String text = jCas.getDocumentText(); - int previousPunctuation = -1; - int sentenceCount = 0; - for (int i = 0; i < text.length(); i++) { - if (text.charAt(i) == '.') { - if (sentenceCount % 5 == 0) { - SofaChangeAnnotation delete = new SofaChangeAnnotation(jCas); - delete.setOperation("delete"); - delete.setBegin(previousPunctuation + 1); - delete.setEnd(i + 1); - delete.addToIndexes(); - } - previousPunctuation = i; - sentenceCount++; - } - } - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); - - @Test - public void testBackMappingOfGeneralFeatureStructures() - throws Exception - { - - File inputFile = new File("src/test/resources/input.txt"); - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, inputFile, TextReader.PARAM_LANGUAGE, "en"); - - AnalysisEngineDescription applyChanges = createEngineDescription( - ApplyChangesAnnotator.class); - - AnalysisEngineDescription fsCreator = createEngineDescription(CreateFeatureStructure.class); - - AnalysisEngineDescription backMapper = createEngineDescription(Backmapper.class, - Backmapper.PARAM_CHAIN, new String[] { TARGET_VIEW, CAS.NAME_DEFAULT_SOFA }); - - AnalysisEngineDescription assertNotYetMappedBack = createEngineDescription( - AssertFeatureStructureCount.class, AssertFeatureStructureCount.PARAM_EXPECTED_COUNT, - 0); - - AnalysisEngineDescription assertMappedBack = createEngineDescription( - AssertFeatureStructureCount.class, AssertFeatureStructureCount.PARAM_EXPECTED_COUNT, - 1); - - AggregateBuilder builder = new AggregateBuilder(); - builder.add(applyChanges, ApplyChangesAnnotator.VIEW_TARGET, TARGET_VIEW, - ApplyChangesAnnotator.VIEW_SOURCE, CAS.NAME_DEFAULT_SOFA); - builder.add(fsCreator, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); - builder.add(assertNotYetMappedBack); // Should only exist in target view - builder.add(backMapper); - builder.add(assertMappedBack); // Should now be present in initial view - - AnalysisEngineDescription pipeline = builder.createAggregateDescription(); - - SimplePipeline.runPipeline(reader, pipeline); - } - - public static class CreateFeatureStructure - extends JCasAnnotator_ImplBase - { - - @Override - public void process(JCas jCas) - throws AnalysisEngineProcessException - { - new AnnotationBase(jCas).addToIndexes(); - } - } - - public static class AssertFeatureStructureCount - extends JCasAnnotator_ImplBase - { - - public static final String PARAM_EXPECTED_COUNT = "expectedCount"; - - @ConfigurationParameter(name = PARAM_EXPECTED_COUNT, mandatory = true) - private int expectedCount; - - @Override - public void process(JCas jCas) - throws AnalysisEngineProcessException - { - int fsCount = (int) JCasUtil.select(jCas, AnnotationBase.class).stream() - .filter(t -> t.getClass().equals(AnnotationBase.class)).count(); - - assertEquals(fsCount, expectedCount); - } - } -} diff --git a/dkpro-core-castransformation-asl/src/test/java/org/dkpro/core/castransformation/ApplyChangesBackmapperTest.java b/dkpro-core-castransformation-asl/src/test/java/org/dkpro/core/castransformation/ApplyChangesBackmapperTest.java new file mode 100644 index 0000000000..27f859dc77 --- /dev/null +++ b/dkpro-core-castransformation-asl/src/test/java/org/dkpro/core/castransformation/ApplyChangesBackmapperTest.java @@ -0,0 +1,270 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.castransformation; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.FileWriter; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.factory.AggregateBuilder; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.AnnotationBase; +import org.dkpro.core.castransformation.internal.AlignmentStorage; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.io.xmi.XmiWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.EOLUtils; +import org.dkpro.core.testing.dumper.CasDumpWriter; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; + +public class ApplyChangesBackmapperTest +{ + public static final String TARGET_VIEW = "TargetView"; + + + @Test + public void testBackMappingWithCachedAlignmentStateWhenRemovingTextFromTarget() + throws Exception + { + boolean clearAlignmentCache = false; + testBackMappingWhenRemovingTextFromTarget(clearAlignmentCache); + } + + @Test + public void testBackMappingWithoutCachedAlignmentStateWhenRemovingTextFromTarget() + throws Exception + { + boolean clearAlignmentCache = true; + testBackMappingWhenRemovingTextFromTarget(clearAlignmentCache); + } + + private void testBackMappingWhenRemovingTextFromTarget(boolean clearAlignmentState) + throws Exception + { + File output = testContext.getTestOutputFolder(); + File inputFile = new File("src/test/resources/input.txt"); + File dumpFile = new File(output, "output.txt"); + String pipelineFilePath = new File(output, "pipeline.xml").getPath(); + + CollectionReaderDescription reader = createReaderDescription( + TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, + inputFile, + TextReader.PARAM_LANGUAGE, "en" + ); + + AnalysisEngineDescription deletes = createEngineDescription(SofaDeleteAnnotator.class); + + AnalysisEngineDescription applyChanges = createEngineDescription( + ApplyChangesAnnotator.class); + + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription clearAlignmentCache = + createEngineDescription(ClearAlignmentCache.class); + + AnalysisEngineDescription backMapper = createEngineDescription(Backmapper.class, + Backmapper.PARAM_CHAIN, new String[] { TARGET_VIEW, CAS.NAME_DEFAULT_SOFA }); + + AnalysisEngineDescription xmiWriter = createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, output); + + AnalysisEngineDescription dumpWriter = createEngineDescription(CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, dumpFile); + + AggregateBuilder builder = new AggregateBuilder(); + builder.add(deletes); // Removing some lines to make sure to confuse the backmapper + builder.add(applyChanges, ApplyChangesAnnotator.VIEW_TARGET, TARGET_VIEW, + ApplyChangesAnnotator.VIEW_SOURCE, CAS.NAME_DEFAULT_SOFA); + builder.add(segmenter, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); + if (clearAlignmentState) { + builder.add(clearAlignmentCache, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); + } + builder.add(backMapper); + builder.add(xmiWriter, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); + builder.add(dumpWriter, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); + AnalysisEngineDescription pipeline = builder.createAggregateDescription(); + + try (FileWriter writer = new FileWriter(pipelineFilePath)) { + pipeline.toXML(writer); + } + + SimplePipeline.runPipeline(reader, pipeline); + + String expected = FileUtils.readFileToString( + new File("src/test/resources/output.txt"), + "UTF-8" + ); + String actual = FileUtils.readFileToString(dumpFile, "UTF-8"); + + expected = EOLUtils.normalizeLineEndings(expected); + actual = EOLUtils.normalizeLineEndings(actual); + assertEquals(expected, actual); + } + + public static class SofaDeleteAnnotator + extends JCasAnnotator_ImplBase + { + @Override + public void process(JCas jCas) + throws AnalysisEngineProcessException + { + try { + // Removes some "sentences" in a deterministic way. Assumes there are at least 5 + // sentences though :-) + String text = jCas.getDocumentText(); + int previousPunctuation = -1; + int sentenceCount = 0; + for (int i = 0; i < text.length(); i++) { + if (text.charAt(i) == '.') { + if (sentenceCount % 5 == 0) { + SofaChangeAnnotation delete = new SofaChangeAnnotation(jCas); + delete.setOperation("delete"); + delete.setBegin(previousPunctuation + 1); + delete.setEnd(i + 1); + delete.addToIndexes(); + } + previousPunctuation = i; + sentenceCount++; + } + } + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + } + + public static class ClearAlignmentCache extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + // Simulates a CAS restore before backmapping where the alignment cache has been + // cleared, so that the fallback to reconstructing alignment state works in the + // Backmapper. This is somewhat hacked, since it depends very much on inner mechanics of + // the Backmapper and the alignment store involved, but it is much simpler compared + // to building pipelines that store the CAS at the point before backmapping, and then + // restore it to resume processing from that point with a complete process restart + // between store and restore, since the alignment store is a singleton that will + // otherwise persist and not be cleared. + AlignmentStorage.getInstance().put( + jCas.getCasImpl().getBaseCAS(), + CAS.NAME_DEFAULT_SOFA, TARGET_VIEW, + null + ); + } + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); + + @Test + public void testBackMappingOfGeneralFeatureStructures() + throws Exception + { + + File inputFile = new File("src/test/resources/input.txt"); + CollectionReaderDescription reader = createReaderDescription( + TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, + inputFile, + TextReader.PARAM_LANGUAGE, + "en" + ); + + AnalysisEngineDescription applyChanges = createEngineDescription( + ApplyChangesAnnotator.class); + + AnalysisEngineDescription fsCreator = createEngineDescription(CreateFeatureStructure.class); + + AnalysisEngineDescription backMapper = createEngineDescription(Backmapper.class, + Backmapper.PARAM_CHAIN, new String[] { TARGET_VIEW, CAS.NAME_DEFAULT_SOFA }); + + AnalysisEngineDescription assertNotYetMappedBack = createEngineDescription( + AssertFeatureStructureCount.class, + AssertFeatureStructureCount.PARAM_EXPECTED_COUNT, + 0 + ); + + AnalysisEngineDescription assertMappedBack = createEngineDescription( + AssertFeatureStructureCount.class, + AssertFeatureStructureCount.PARAM_EXPECTED_COUNT, + 1 + ); + + AggregateBuilder builder = new AggregateBuilder(); + builder.add(applyChanges, ApplyChangesAnnotator.VIEW_TARGET, TARGET_VIEW, + ApplyChangesAnnotator.VIEW_SOURCE, CAS.NAME_DEFAULT_SOFA); + builder.add(fsCreator, CAS.NAME_DEFAULT_SOFA, TARGET_VIEW); + builder.add(assertNotYetMappedBack); // Should only exist in target view + builder.add(backMapper); + builder.add(assertMappedBack); // Should now be present in initial view + + AnalysisEngineDescription pipeline = builder.createAggregateDescription(); + + SimplePipeline.runPipeline(reader, pipeline); + } + + public static class CreateFeatureStructure + extends JCasAnnotator_ImplBase + { + + @Override + public void process(JCas jCas) + throws AnalysisEngineProcessException + { + new AnnotationBase(jCas).addToIndexes(); + } + } + + public static class AssertFeatureStructureCount + extends JCasAnnotator_ImplBase + { + + public static final String PARAM_EXPECTED_COUNT = "expectedCount"; + + @ConfigurationParameter(name = PARAM_EXPECTED_COUNT, mandatory = true) + private int expectedCount; + + @Override + public void process(JCas jCas) + throws AnalysisEngineProcessException + { + int fsCount = (int) JCasUtil.select(jCas, AnnotationBase.class).stream() + .filter(t -> t.getClass().equals(AnnotationBase.class)).count(); + + assertEquals(fsCount, expectedCount); + } + } +} diff --git a/dkpro-core-castransformation-asl/src/test/resources/log4j.properties b/dkpro-core-castransformation-asl/src/test/resources/log4j.properties deleted file mode 100644 index ca0c5b7a89..0000000000 --- a/dkpro-core-castransformation-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator = TRACE diff --git a/dkpro-core-castransformation-asl/src/test/resources/log4j2.xml b/dkpro-core-castransformation-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-castransformation-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-castransformation-asl/src/test/resources/output.txt b/dkpro-core-castransformation-asl/src/test/resources/output.txt index 34b822c769..f3ffc8a02f 100644 --- a/dkpro-core-castransformation-asl/src/test/resources/output.txt +++ b/dkpro-core-castransformation-asl/src/test/resources/output.txt @@ -31,31 +31,37 @@ Token sofa: _InitialView begin: 27 end: 33 + order: 0 [humba] Token sofa: _InitialView begin: 34 end: 39 + order: 0 [doopi] Token sofa: _InitialView begin: 40 end: 45 + order: 0 [jamba] Token sofa: _InitialView begin: 46 end: 51 + order: 0 [Too] Token sofa: _InitialView begin: 52 end: 55 + order: 0 [.] Token sofa: _InitialView begin: 55 end: 56 + order: 0 [Mekoman jalla hembembel!] Sentence sofa: _InitialView @@ -66,21 +72,25 @@ Token sofa: _InitialView begin: 57 end: 64 + order: 0 [jalla] Token sofa: _InitialView begin: 65 end: 70 + order: 0 [hembembel] Token sofa: _InitialView begin: 71 end: 80 + order: 0 [!] Token sofa: _InitialView begin: 80 end: 81 + order: 0 [Labumbu jakka lembi nabi.] Sentence @@ -92,26 +102,31 @@ Token sofa: _InitialView begin: 82 end: 89 + order: 0 [jakka] Token sofa: _InitialView begin: 90 end: 95 + order: 0 [lembi] Token sofa: _InitialView begin: 96 end: 101 + order: 0 [nabi] Token sofa: _InitialView begin: 102 end: 106 + order: 0 [.] Token sofa: _InitialView begin: 106 end: 107 + order: 0 [Humba jumbaba tumba Jalla Jamba.] Sentence sofa: _InitialView @@ -122,31 +137,37 @@ Token sofa: _InitialView begin: 108 end: 113 + order: 0 [jumbaba] Token sofa: _InitialView begin: 114 end: 121 + order: 0 [tumba] Token sofa: _InitialView begin: 122 end: 127 + order: 0 [Jalla] Token sofa: _InitialView begin: 128 end: 133 + order: 0 [Jamba] Token sofa: _InitialView begin: 134 end: 139 + order: 0 [.] Token sofa: _InitialView begin: 139 end: 140 + order: 0 -------- View _InitialView end ---------------------------------- -------- View TargetView begin ---------------------------------- @@ -174,31 +195,37 @@ Token sofa: TargetView begin: 1 end: 7 + order: 0 [humba] Token sofa: TargetView begin: 8 end: 13 + order: 0 [doopi] Token sofa: TargetView begin: 14 end: 19 + order: 0 [jamba] Token sofa: TargetView begin: 20 end: 25 + order: 0 [Too] Token sofa: TargetView begin: 26 end: 29 + order: 0 [.] Token sofa: TargetView begin: 29 end: 30 + order: 0 [Mekoman jalla hembembel!] Sentence sofa: TargetView @@ -209,21 +236,25 @@ Token sofa: TargetView begin: 31 end: 38 + order: 0 [jalla] Token sofa: TargetView begin: 39 end: 44 + order: 0 [hembembel] Token sofa: TargetView begin: 45 end: 54 + order: 0 [!] Token sofa: TargetView begin: 54 end: 55 + order: 0 [Labumbu jakka lembi nabi.] Sentence @@ -235,26 +266,31 @@ Token sofa: TargetView begin: 56 end: 63 + order: 0 [jakka] Token sofa: TargetView begin: 64 end: 69 + order: 0 [lembi] Token sofa: TargetView begin: 70 end: 75 + order: 0 [nabi] Token sofa: TargetView begin: 76 end: 80 + order: 0 [.] Token sofa: TargetView begin: 80 end: 81 + order: 0 [Humba jumbaba tumba Jalla Jamba.] Sentence sofa: TargetView @@ -265,31 +301,37 @@ Token sofa: TargetView begin: 82 end: 87 + order: 0 [jumbaba] Token sofa: TargetView begin: 88 end: 95 + order: 0 [tumba] Token sofa: TargetView begin: 96 end: 101 + order: 0 [Jalla] Token sofa: TargetView begin: 102 end: 107 + order: 0 [Jamba] Token sofa: TargetView begin: 108 end: 113 + order: 0 [.] Token sofa: TargetView begin: 113 end: 114 + order: 0 -------- View TargetView end ---------------------------------- ======== CAS 0 end ================================== diff --git a/dkpro-core-cisstem-asl/pom.xml b/dkpro-core-cisstem-asl/pom.xml index 91c50dc83b..3407f23975 100644 --- a/dkpro-core-cisstem-asl/pom.xml +++ b/dkpro-core-cisstem-asl/pom.xml @@ -18,15 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - org.dkpro.core dkpro-core-cisstem-asl jar DKPro Core ASL - CISSTEM (German Stemmer) + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -41,13 +41,17 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl + org.dkpro.core + dkpro-core-api-featurepath-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -60,8 +64,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-cisstem-asl/src/main/java/org/dkpro/core/cisstem/CisStemmer.java b/dkpro-core-cisstem-asl/src/main/java/org/dkpro/core/cisstem/CisStemmer.java index 8a39d6efc0..12540fd835 100644 --- a/dkpro-core-cisstem-asl/src/main/java/org/dkpro/core/cisstem/CisStemmer.java +++ b/dkpro-core-cisstem-asl/src/main/java/org/dkpro/core/cisstem/CisStemmer.java @@ -34,99 +34,107 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; +import org.dkpro.core.api.featurepath.FeaturePathException; import org.dkpro.core.cisstem.util.CisStem; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** - *

UIMA wrapper for the CISTEM algorithm. - * CISTEM is a stemming algorithm for the German language, developed by Leonie Weißweiler and Alexander Fraser. - * Annotation types to be stemmed can be configured by a {@link FeaturePath}.

+ * UIMA wrapper for the CISTEM algorithm. * - *

If you use this component in a pipeline which uses stop word removal, make sure that it - * runs after the stop word removal step, so only words that are no stop words are stemmed.

+ *

+ * CISTEM is a stemming algorithm for the German language, developed by Leonie Weißweiler and + * Alexander Fraser. Annotation types to be stemmed can be configured by a {@link FeaturePath}. + *

+ * + *

+ * If you use this component in a pipeline which uses stop word removal, make sure that it runs + * after the stop word removal step, so only words that are no stop words are stemmed. + *

* * @see CISSTEM homepage - * @see FeaturePathAnnotatorBase - * @since 1.1.0 */ -@ResourceMetaData(name="CIS Stemmer") +@Component(OperationType.STEMMER) +@ResourceMetaData(name = "CIS Stemmer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @LanguageCapability({ "de" }) @TypeCapability( - outputs={ + outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem"}) public class CisStemmer - extends FeaturePathAnnotatorBase + extends FeaturePathAnnotatorBase { - /** - * Per default the stemmer runs in case-sensitive mode. - * If this parameter is enabled, tokens are lower-cased before being passed to the stemmer. - */ - public static final String PARAM_LOWER_CASE = "lowerCase"; - @ConfigurationParameter(name = PARAM_LOWER_CASE, mandatory = false, defaultValue="false") - protected boolean lowerCase; - - @Override - protected Set getDefaultPaths() - { - return Collections.singleton(Token.class.getName()); - } - - @Override - protected void generateAnnotations(JCas jcas) throws FeaturePathException, - AnalysisEngineProcessException - { - // CAS is necessary to retrieve values - CAS currCAS = jcas.getCas(); - - for (String path : paths) { - - // Separate Typename and featurepath - String[] segments = path.split("/", 2); - String typeName = segments[0]; - - // Try to get the type from the typesystem of the CAS - Type t = currCAS.getTypeSystem().getType(typeName); - if (t == null) { - throw new IllegalStateException("Type [" + typeName + "] not found in type system"); - } - - // get an fpi object and initialize it - // initialize the FeaturePathInfo with the corresponding part - initializeFeaturePathInfoFrom(fp, segments); - - // get the annotations - AnnotationIndex idx = currCAS.getAnnotationIndex(t); - FSIterator iterator = idx.iterator(); - - while (iterator.hasNext()) { - AnnotationFS fs = (AnnotationFS) iterator.next(); - - try { - if (this.filterFeaturePath != null) { - // check annotation filter condition - if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { - createStemAnnotation(jcas, fs); - } - } - else { // no annotation filter specified - createStemAnnotation(jcas, fs); - } - } - catch (AnalysisEngineProcessException e) { - throw new IllegalStateException( - "error occured while creating a stem annotation", e); - } - } - } - } - - /** + /** + * Per default the stemmer runs in case-sensitive mode. If this parameter is enabled, tokens are + * lower-cased before being passed to the stemmer. + */ + public static final String PARAM_LOWER_CASE = "lowerCase"; + @ConfigurationParameter(name = PARAM_LOWER_CASE, mandatory = false, defaultValue = "false") + protected boolean lowerCase; + + @Override + protected Set getDefaultPaths() + { + return Collections.singleton(Token.class.getName()); + } + + @Override + protected void generateAnnotations(JCas jcas) + throws FeaturePathException, AnalysisEngineProcessException + { + // CAS is necessary to retrieve values + CAS currCAS = jcas.getCas(); + + for (String path : paths) { + + // Separate Typename and featurepath + String[] segments = path.split("/", 2); + String typeName = segments[0]; + + // Try to get the type from the typesystem of the CAS + Type t = currCAS.getTypeSystem().getType(typeName); + if (t == null) { + throw new IllegalStateException("Type [" + typeName + "] not found in type system"); + } + + // get an fpi object and initialize it + // initialize the FeaturePathInfo with the corresponding part + initializeFeaturePathInfoFrom(fp, segments); + + // get the annotations + AnnotationIndex idx = currCAS.getAnnotationIndex(t); + FSIterator iterator = idx.iterator(); + + while (iterator.hasNext()) { + AnnotationFS fs = (AnnotationFS) iterator.next(); + + try { + if (this.filterFeaturePath != null) { + // check annotation filter condition + if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { + createStemAnnotation(jcas, fs); + } + } + else { // no annotation filter specified + createStemAnnotation(jcas, fs); + } + } + catch (AnalysisEngineProcessException e) { + throw new IllegalStateException( + "error occured while creating a stem annotation", e); + } + } + } + } + + /** * Creates a Stem annotation with same begin and end as the AnnotationFS fs, the value is the * stemmed value derived by applying the feature path. * @@ -139,31 +147,31 @@ protected void generateAnnotations(JCas jcas) throws FeaturePathException, * @throws AnalysisEngineProcessException * if the {@code stem} method from the stemmer cannot be invoked. */ - private void createStemAnnotation(JCas jcas, AnnotationFS fs) - throws AnalysisEngineProcessException - { - // Check for blank text, it makes no sense to add a stem then (and raised an exception) - String word = fp.getValue(fs); - - boolean isUppercase = Character.isUpperCase(word.charAt(0)); - - if (!StringUtils.isBlank(word)) { - - String stemValue = CisStem.stem(word, lowerCase); - if (isUppercase && !lowerCase) { - stemValue = stemValue.substring(0, 1).toUpperCase() + stemValue.substring(1); - } - - Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); - stemAnnot.setValue(stemValue); - stemAnnot.addToIndexes(jcas); - - // Try setting the "stem" feature on Tokens. - Feature feat = fs.getType().getFeatureByBaseName("stem"); - if (feat != null && feat.getRange() != null - && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { - fs.setFeatureValue(feat, stemAnnot); - } - } - } + private void createStemAnnotation(JCas jcas, AnnotationFS fs) + throws AnalysisEngineProcessException + { + // Check for blank text, it makes no sense to add a stem then (and raised an exception) + String word = fp.getValue(fs); + + boolean isUppercase = Character.isUpperCase(word.charAt(0)); + + if (!StringUtils.isBlank(word)) { + + String stemValue = CisStem.stem(word, lowerCase); + if (isUppercase && !lowerCase) { + stemValue = stemValue.substring(0, 1).toUpperCase() + stemValue.substring(1); + } + + Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); + stemAnnot.setValue(stemValue); + stemAnnot.addToIndexes(jcas); + + // Try setting the "stem" feature on Tokens. + Feature feat = fs.getType().getFeatureByBaseName("stem"); + if (feat != null && feat.getRange() != null + && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { + fs.setFeatureValue(feat, stemAnnot); + } + } + } } diff --git a/dkpro-core-cisstem-asl/src/main/java/org/dkpro/core/cisstem/util/CisStem.java b/dkpro-core-cisstem-asl/src/main/java/org/dkpro/core/cisstem/util/CisStem.java index 04045e4412..39228e338d 100644 --- a/dkpro-core-cisstem-asl/src/main/java/org/dkpro/core/cisstem/util/CisStem.java +++ b/dkpro-core-cisstem-asl/src/main/java/org/dkpro/core/cisstem/util/CisStem.java @@ -23,148 +23,156 @@ */ package org.dkpro.core.cisstem.util; -public class CisStem { - - public static String stem(String word) { - return stem(word, false); - } - - public static String stem(String word, boolean case_insensitive) { - if (word.length() == 0) - return word; - - // TODO use compiled pattern for performance? - word = word.replace("Ü", "U"); - word = word.replace("Ö", "O"); - word = word.replace("Ä", "A"); - word = word.replace("ü", "u"); - word = word.replace("ö", "o"); - word = word.replace("ä", "a"); - - boolean uppercase = Character.isUpperCase(word.charAt(0)); - - word = word.toLowerCase(); - - word = word.replace("ß", "ss"); - word = word.replaceAll("^ge(.{4,})", "$1"); - word = word.replace("sch", "$"); - word = word.replace("ei", "%"); - word = word.replace("ie", "&"); - - word = word.replaceAll("(.)\\1", "$1*"); - - while (word.length() > 3) { - if (word.length() > 5) { - String newWord = word.replaceAll("e[mr]$", ""); - if (!word.equals(newWord)) { - word = newWord; - continue; - } - - newWord = word.replaceAll("nd$", ""); - if (!word.equals(newWord)) { - word = newWord; - continue; - } - } - - if (!uppercase || case_insensitive) { - String newWord = word.replaceAll("t$", ""); - if (!word.equals(newWord)) { - word = newWord; - continue; - } - } - - String newWord = word.replaceAll("[esn]$", ""); - if (!word.equals(newWord)) { - word = newWord; - continue; - } else { - break; - } - } - - word = word.replaceAll("(.)\\*", "$1$1"); - word = word.replace("&", "ie"); - word = word.replace("%", "ei"); - word = word.replace("$", "sch"); - - return word; - } - - public static String[] segment(String word) { - return segment(word, false); - } - - public static String[] segment(String word, boolean case_insensitive) { - if (word.length() == 0) { - String[] result = new String[2]; - result[0] = ""; - result[1] = ""; - return result; - } - - int restLength = 0; - boolean uppercase = Character.isUpperCase(word.charAt(0)); - word = word.toLowerCase(); - String original = new String(word); - - word = word.replace("sch", "$"); - word = word.replace("ei", "%"); - word = word.replace("ie", "&"); - - word = word.replaceAll("(.)\\1", "$1*"); - - while (word.length() > 3) { - if (word.length() > 5) { - String newWord = word.replaceAll("e[mr]$", ""); - if (!word.equals(newWord)) { - restLength += 2; - word = newWord; - continue; - } - - newWord = word.replaceAll("nd$", ""); - if (!word.equals(newWord)) { - restLength += 2; - word = newWord; - continue; - } - } - - if (!uppercase || case_insensitive) { - String newWord = word.replaceAll("t$", ""); - if (!word.equals(newWord)) { - restLength += 1; - word = newWord; - continue; - } - } - - String newWord = word.replaceAll("[esn]$", ""); - if (!word.equals(newWord)) { - restLength += 1; - word = newWord; - continue; - } else { - break; - } - } - - word = word.replaceAll("(.)\\*", "$1$1"); - word = word.replace("&", "ie"); - word = word.replace("%", "ei"); - word = word.replace("$", "sch"); - - String rest = ""; - if (restLength != 0) { - rest = original.substring(original.length() - restLength); - } - - String[] result = new String[2]; - result[0] = word; - result[1] = rest; - return result; - } +public class CisStem +{ + + public static String stem(String word) + { + return stem(word, false); + } + + public static String stem(String word, boolean case_insensitive) + { + if (word.length() == 0) { + return word; + } + + // TODO use compiled pattern for performance? + word = word.replace("Ü", "U"); + word = word.replace("Ö", "O"); + word = word.replace("Ä", "A"); + word = word.replace("ü", "u"); + word = word.replace("ö", "o"); + word = word.replace("ä", "a"); + + boolean uppercase = Character.isUpperCase(word.charAt(0)); + + word = word.toLowerCase(); + + word = word.replace("ß", "ss"); + word = word.replaceAll("^ge(.{4,})", "$1"); + word = word.replace("sch", "$"); + word = word.replace("ei", "%"); + word = word.replace("ie", "&"); + + word = word.replaceAll("(.)\\1", "$1*"); + + while (word.length() > 3) { + if (word.length() > 5) { + String newWord = word.replaceAll("e[mr]$", ""); + if (!word.equals(newWord)) { + word = newWord; + continue; + } + + newWord = word.replaceAll("nd$", ""); + if (!word.equals(newWord)) { + word = newWord; + continue; + } + } + + if (!uppercase || case_insensitive) { + String newWord = word.replaceAll("t$", ""); + if (!word.equals(newWord)) { + word = newWord; + continue; + } + } + + String newWord = word.replaceAll("[esn]$", ""); + if (!word.equals(newWord)) { + word = newWord; + continue; + } + else { + break; + } + } + + word = word.replaceAll("(.)\\*", "$1$1"); + word = word.replace("&", "ie"); + word = word.replace("%", "ei"); + word = word.replace("$", "sch"); + + return word; + } + + public static String[] segment(String word) + { + return segment(word, false); + } + + public static String[] segment(String word, boolean case_insensitive) + { + if (word.length() == 0) { + String[] result = new String[2]; + result[0] = ""; + result[1] = ""; + return result; + } + + int restLength = 0; + boolean uppercase = Character.isUpperCase(word.charAt(0)); + word = word.toLowerCase(); + String original = new String(word); + + word = word.replace("sch", "$"); + word = word.replace("ei", "%"); + word = word.replace("ie", "&"); + + word = word.replaceAll("(.)\\1", "$1*"); + + while (word.length() > 3) { + if (word.length() > 5) { + String newWord = word.replaceAll("e[mr]$", ""); + if (!word.equals(newWord)) { + restLength += 2; + word = newWord; + continue; + } + + newWord = word.replaceAll("nd$", ""); + if (!word.equals(newWord)) { + restLength += 2; + word = newWord; + continue; + } + } + + if (!uppercase || case_insensitive) { + String newWord = word.replaceAll("t$", ""); + if (!word.equals(newWord)) { + restLength += 1; + word = newWord; + continue; + } + } + + String newWord = word.replaceAll("[esn]$", ""); + if (!word.equals(newWord)) { + restLength += 1; + word = newWord; + continue; + } + else { + break; + } + } + + word = word.replaceAll("(.)\\*", "$1$1"); + word = word.replace("&", "ie"); + word = word.replace("%", "ei"); + word = word.replace("$", "sch"); + + String rest = ""; + if (restLength != 0) { + rest = original.substring(original.length() - restLength); + } + + String[] result = new String[2]; + result[0] = word; + result[1] = rest; + return result; + } } diff --git a/dkpro-core-cisstem-asl/src/test/java/org/dkpro/core/cisstem/CisStemmerTest.java b/dkpro-core-cisstem-asl/src/test/java/org/dkpro/core/cisstem/CisStemmerTest.java index 381f43370f..24bb79a0a7 100644 --- a/dkpro-core-cisstem-asl/src/test/java/org/dkpro/core/cisstem/CisStemmerTest.java +++ b/dkpro-core-cisstem-asl/src/test/java/org/dkpro/core/cisstem/CisStemmerTest.java @@ -21,12 +21,11 @@ import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.jcas.JCas; -import org.dkpro.core.cisstem.CisStemmer; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.TestRunner; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class CisStemmerTest { diff --git a/dkpro-core-cisstem-asl/src/test/java/org/dkpro/core/cisstem/PerformanceTest.java b/dkpro-core-cisstem-asl/src/test/java/org/dkpro/core/cisstem/PerformanceTest.java index 79da0dd48f..d3f87371e4 100644 --- a/dkpro-core-cisstem-asl/src/test/java/org/dkpro/core/cisstem/PerformanceTest.java +++ b/dkpro-core-cisstem-asl/src/test/java/org/dkpro/core/cisstem/PerformanceTest.java @@ -24,16 +24,16 @@ import org.junit.Ignore; import org.junit.Test; -public class PerformanceTest { - - @Ignore +public class PerformanceTest +{ + @Ignore @Test - public void testGerman() - throws Exception + public void testGerman() throws Exception { - long startTime = System.currentTimeMillis(); - for (String line : FileUtils.readLines(new File("src/test/resources/wordlist/wortliste-deutsch.txt"), "UTF-8")) { - CisStem.stem(line); + long startTime = System.currentTimeMillis(); + for (String line : FileUtils.readLines( + new File("src/test/resources/wordlist/wortliste-deutsch.txt"), "UTF-8")) { + CisStem.stem(line); } long endTime = System.currentTimeMillis(); diff --git a/dkpro-core-clearnlp-asl/pom.xml b/dkpro-core-clearnlp-asl/pom.xml index a1e146559e..5b7d9fba7a 100644 --- a/dkpro-core-clearnlp-asl/pom.xml +++ b/dkpro-core-clearnlp-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.clearnlp-asl + dkpro-core-clearnlp-asl jar DKPro Core ASL - Clear NLP (v ${clearnlp.version}) + https://dkpro.github.io/dkpro-core/ 2.0.2 @@ -40,32 +41,36 @@ uimafit-core
- de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.semantics-asl + org.dkpro.core + dkpro-core-api-semantics-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -78,13 +83,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl + org.dkpro.core + dkpro-core-opennlp-asl test @@ -169,9 +174,9 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT pom import diff --git a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpLemmatizer.java b/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpLemmatizer.java deleted file mode 100644 index 94352b4518..0000000000 --- a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpLemmatizer.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.InputStream; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import com.clearnlp.component.AbstractComponent; -import com.clearnlp.component.morph.DefaultMPAnalyzer; -import com.clearnlp.component.morph.EnglishMPAnalyzer; -import com.clearnlp.dependency.DEPNode; -import com.clearnlp.dependency.DEPTree; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Lemmatizer using Clear NLP. - * - */ -@ResourceMetaData(name="ClearNLP Lemmatizer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" } -) -public class ClearNlpLemmatizer - extends JCasAnnotator_ImplBase -{ - - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false, defaultValue="en") - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - private CasConfigurableProviderBase modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase(this, "clearnlp", "lemma") - { - @Override - protected AbstractComponent produceResource(InputStream aStream) - throws Exception - { - String lang = getAggregatedProperties().getProperty(LANGUAGE); - AbstractComponent lemmatizer; - if(lang.equals("en")){ - lemmatizer = new EnglishMPAnalyzer(aStream); - }else{ - lemmatizer = new DefaultMPAnalyzer(); - } - return lemmatizer; - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - - modelProvider.configure(aJCas.getCas()); - AbstractComponent analyzer = modelProvider.getResource(); - - // Iterate over all sentences - for (Sentence sentence : select(aJCas, Sentence.class)) { - List tokens = selectCovered(aJCas, Token.class, sentence); - - DEPTree tree = new DEPTree(); - - // Generate input format required by analyzer - for (int i = 0; i < tokens.size(); i++) { - Token t = tokens.get(i); - DEPNode node = new DEPNode(i+1, tokens.get(i).getText()); - node.pos = t.getPos().getPosValue(); - tree.add(node); - } - - analyzer.process(tree); - - int i = 0; - for (Token t : tokens) { - DEPNode node = tree.get(i+1); - String lemmaString = node.lemma; - if (lemmaString == null) { - lemmaString = t.getText(); - } - Lemma l = new Lemma(aJCas, t.getBegin(), t.getEnd()); - l.setValue(lemmaString); - l.addToIndexes(); - - t.setLemma(l); - i++; - } - } - } -} diff --git a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSegmenter.java b/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSegmenter.java deleted file mode 100644 index 815608dcf7..0000000000 --- a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSegmenter.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; - -import java.io.BufferedReader; -import java.io.InputStream; -import java.io.StringReader; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import com.clearnlp.segmentation.AbstractSegmenter; -import com.clearnlp.segmentation.EnglishSegmenter; -import com.clearnlp.tokenization.EnglishTokenizer; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; - -/** - * Tokenizer using Clear NLP. - */ -@ResourceMetaData(name="ClearNLP Segmenter") -@LanguageCapability(value = "en") -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class ClearNlpSegmenter - extends SegmenterBase -{ - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - private CasConfigurableProviderBase modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase(this, "segmenter") - { - @Override - protected AbstractSegmenter produceResource(InputStream aStream) - throws Exception - { - String lang = getAggregatedProperties().getProperty(LANGUAGE); - AbstractSegmenter segmenter; - if (lang.equals("en")) { - segmenter = new EnglishSegmenter(new EnglishTokenizer(aStream)); - } - else { - throw new ResourceInitializationException( - new Throwable("ClearNLP segmenter supports only English")); - } - return segmenter; - } - }; - } - - @Override - protected void process(JCas aJCas, String aText, int aZoneBegin) - throws AnalysisEngineProcessException - { - modelProvider.configure(aJCas.getCas()); - AbstractSegmenter segmenter = modelProvider.getResource(); - - List> sentences = segmenter.getSentences(new BufferedReader(new StringReader(aText))); - - int sBegin = 0; - int sEnd = 0; - int tBegin = 0; - int tEnd = 0; - - for (List sentence : sentences) { - sBegin = -1; - - for (String token : sentence) { - tBegin = aText.indexOf(token, tEnd); - tEnd = tBegin + token.length(); - - if (sBegin == -1) { - sBegin = tBegin; - } - - createToken(aJCas, aZoneBegin + tBegin, aZoneBegin + tEnd); - } - sEnd = tEnd; - - createSentence(aJCas, aZoneBegin + sBegin, aZoneBegin + sEnd); - } - } -} diff --git a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSemanticRoleLabeler.java b/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSemanticRoleLabeler.java deleted file mode 100644 index fe60b87d30..0000000000 --- a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSemanticRoleLabeler.java +++ /dev/null @@ -1,437 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; - -import static java.util.Arrays.asList; -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.zip.GZIPInputStream; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.FSCollectionFactory; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import com.clearnlp.classification.model.StringModel; -import com.clearnlp.component.AbstractComponent; -import com.clearnlp.component.AbstractStatisticalComponent; -import com.clearnlp.dependency.DEPArc; -import com.clearnlp.dependency.DEPLib; -import com.clearnlp.dependency.DEPNode; -import com.clearnlp.dependency.DEPTree; -import com.clearnlp.nlp.NLPGetter; -import com.clearnlp.nlp.NLPMode; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableStreamProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; -import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; -import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; - -/** - * ClearNLP semantic role labeller. - */ -@ResourceMetaData(name="ClearNLP Semantic Role Labeler") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", - "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg"} - ) -public class ClearNlpSemanticRoleLabeler - extends JCasAnnotator_ImplBase -{ - /** - * Write the tag set(s) to the log when a model is loaded. - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Variant of a model the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Location from which the predicate identifier model is read. - */ - public static final String PARAM_PRED_MODEL_LOCATION = "predModelLocation"; - @ConfigurationParameter(name = PARAM_PRED_MODEL_LOCATION, mandatory = false) - protected String predModelLocation; - - /** - * Location from which the roleset classification model is read. - */ - public static final String PARAM_ROLE_MODEL_LOCATION = "roleModelLocation"; - @ConfigurationParameter(name = PARAM_ROLE_MODEL_LOCATION, mandatory = false) - protected String roleModelLocation; - - /** - * Location from which the semantic role labeling model is read. - */ - public static final String PARAM_SRL_MODEL_LOCATION = "srlModelLocation"; - @ConfigurationParameter(name = PARAM_SRL_MODEL_LOCATION, mandatory = false) - protected String srlModelLocation; - - /** - *

Normally the arguments point only to the head words of arguments in the dependency tree. - * With this option enabled, they are expanded to the text covered by the minimal and maximal - * token offsets of all descendants (or self) of the head word.

- * - *

Warning: this parameter should be used with caution! For one, if the descentants of a - * head word cover a non-continuous region of the text, this information is lost. The arguments - * will appear to be spanning a continuous region. For another, the arguments may overlap with - * each other. E.g. if a sentence contains a relative clause with a verb, the subject of the - * main clause may be recognized as a dependent of the verb and may cause the whole main - * clause to be recorded in the argument.

- */ - public static final String PARAM_EXPAND_ARGUMENTS = "expandArguments"; - @ConfigurationParameter(name = PARAM_EXPAND_ARGUMENTS, mandatory = true, defaultValue="false") - protected boolean expandArguments; - - - private CasConfigurableProviderBase predicateFinder; - - private CasConfigurableProviderBase roleSetClassifier; - - private CasConfigurableProviderBase roleLabeller; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - predicateFinder = new CasConfigurableStreamProviderBase() - { - { - setContextObject(ClearNlpSemanticRoleLabeler.this); - - setDefault(ARTIFACT_ID, "${groupId}.clearnlp-model-pred-${language}-${variant}"); - setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/" - + "pred-${language}-${variant}.properties"); - setDefault(VARIANT, "ontonotes"); - - setOverride(LOCATION, predModelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected AbstractComponent produceResource(InputStream aStream) - throws Exception - { - BufferedInputStream bis = null; - ObjectInputStream ois = null; - GZIPInputStream gis = null; - try{ - gis = new GZIPInputStream(aStream); - bis = new BufferedInputStream(gis); - ois = new ObjectInputStream(bis); - AbstractComponent component = NLPGetter.getComponent(ois, - getAggregatedProperties().getProperty(LANGUAGE), NLPMode.MODE_PRED); - printTags(NLPMode.MODE_PRED, component); - return component; - } - catch (Exception e) { - throw new IOException(e); - } - finally { - closeQuietly(ois); - closeQuietly(bis); - closeQuietly(gis); - } - } - }; - - roleSetClassifier = new CasConfigurableStreamProviderBase() - { - { - setContextObject(ClearNlpSemanticRoleLabeler.this); - - setDefault(ARTIFACT_ID, "${groupId}.clearnlp-model-role-${language}-${variant}"); - setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/" - + "role-${language}-${variant}.properties"); - setDefault(VARIANT, "ontonotes"); - - setOverride(LOCATION, roleModelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected AbstractComponent produceResource(InputStream aStream) - throws Exception - { - BufferedInputStream bis = null; - ObjectInputStream ois = null; - GZIPInputStream gis = null; - try{ - gis = new GZIPInputStream(aStream); - bis = new BufferedInputStream(gis); - ois = new ObjectInputStream(bis); - AbstractComponent component = NLPGetter.getComponent(ois, - getAggregatedProperties().getProperty(LANGUAGE), NLPMode.MODE_ROLE); - - printTags(NLPMode.MODE_ROLE, component); - return component; - } - catch (Exception e) { - throw new IOException(e); - } - finally { - closeQuietly(ois); - closeQuietly(bis); - closeQuietly(gis); - } - } - }; - - roleLabeller = new CasConfigurableStreamProviderBase() - { - { - setContextObject(ClearNlpSemanticRoleLabeler.this); - - setDefault(ARTIFACT_ID, "${groupId}.clearnlp-model-srl-${language}-${variant}"); - setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/" - + "srl-${language}-${variant}.properties"); - setDefault(VARIANT, "ontonotes"); - - setOverride(LOCATION, srlModelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected AbstractComponent produceResource(InputStream aStream) - throws Exception - { - BufferedInputStream bis = null; - ObjectInputStream ois = null; - GZIPInputStream gis = null; - try{ - gis = new GZIPInputStream(aStream); - bis = new BufferedInputStream(gis); - ois = new ObjectInputStream(bis); - AbstractComponent component = NLPGetter.getComponent(ois, - getAggregatedProperties().getProperty(LANGUAGE), NLPMode.MODE_SRL); - printTags(NLPMode.MODE_SRL, component); - return component; - } - catch (Exception e) { - throw new IOException(e); - } - finally { - closeQuietly(ois); - closeQuietly(bis); - closeQuietly(gis); - } - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - predicateFinder.configure(aJCas.getCas()); - roleSetClassifier.configure(aJCas.getCas()); - roleLabeller.configure(aJCas.getCas()); - - // Iterate over all sentences - for (Sentence sentence : select(aJCas, Sentence.class)) { - List tokens = selectCovered(aJCas, Token.class, sentence); - DEPTree tree = new DEPTree(); - - // Generate: - // - DEPNode - // - pos tags - // - lemma - for (int i = 0; i < tokens.size(); i++) { - Token t = tokens.get(i); - DEPNode node = new DEPNode(i + 1, tokens.get(i).getText()); - node.pos = t.getPos().getPosValue(); - node.lemma = t.getLemma().getValue(); - tree.add(node); - } - - // Generate: - // Dependency relations - for (Dependency dep : selectCovered(Dependency.class, sentence)) { - if (dep instanceof ROOT) { - // #736 ClearNlpSemanticRoleLabelerTest gets caught in infinite loop - // ClearNLP parser creates roots that do not have a head. We have to replicate - // this here to avoid running into an endless loop. - continue; - } - - int headIndex = tokens.indexOf(dep.getGovernor()); - int tokenIndex = tokens.indexOf(dep.getDependent()); - - DEPNode token = tree.get(tokenIndex + 1); - DEPNode head = tree.get(headIndex + 1); - - token.setHead(head, dep.getDependencyType()); - } - - // For the root node - for (int i = 0; i < tokens.size(); i++) { - DEPNode parserNode = tree.get(i + 1); - if(parserNode.getLabel() == null){ - int headIndex = tokens.indexOf(null); - DEPNode head = tree.get(headIndex + 1); - parserNode.setHead(head, "root"); - } - } - - // Do the SRL - predicateFinder.getResource().process(tree); - roleSetClassifier.getResource().process(tree); - roleLabeller.getResource().process(tree); - - // Convert the results into UIMA annotations - Map predicates = new HashMap<>(); - Map> predArgs = new HashMap<>(); - - for (int i = 0; i < tokens.size(); i++) { - DEPNode parserNode = tree.get(i + 1); - Token argumentToken = tokens.get(i); - - for (DEPArc argPredArc : parserNode.getSHeads()) { - Token predToken = tokens.get(argPredArc.getNode().id - 1); - - // Instantiate the semantic predicate annotation if it hasn't been done yet - SemPred pred = predicates.get(predToken); - if (pred == null) { - // Create the semantic predicate annotation itself - pred = new SemPred(aJCas, predToken.getBegin(), predToken.getEnd()); - pred.setCategory(argPredArc.getNode().getFeat(DEPLib.FEAT_PB)); - pred.addToIndexes(); - predicates.put(predToken, pred); - - // Prepare a list to store its arguments - predArgs.put(pred, new ArrayList<>()); - } - - // Instantiate the semantic argument annotation - SemArg arg = new SemArg(aJCas); - - if (expandArguments) { - List descendents = parserNode.getDescendents(Integer.MAX_VALUE) - .stream() - .map(arc -> arc.getNode()) - .collect(Collectors.toList()); - descendents.add(parserNode); - List descTokens = descendents.stream() - .map(node -> tokens.get(node.id - 1)) - .collect(Collectors.toList()); - int begin = descTokens.stream().mapToInt(t -> t.getBegin()).min().getAsInt(); - int end = descTokens.stream().mapToInt(t -> t.getEnd()).max().getAsInt(); - arg.setBegin(begin); - arg.setEnd(end); - } - else { - arg.setBegin(argumentToken.getBegin()); - arg.setEnd(argumentToken.getEnd()); - } - - arg.addToIndexes(); - - SemArgLink link = new SemArgLink(aJCas); - link.setRole(argPredArc.getLabel()); - link.setTarget(arg); - - // Remember to which predicate this argument belongs - predArgs.get(pred).add(link); - } - } - - for (Entry> e : predArgs.entrySet()) { - e.getKey().setArguments(FSCollectionFactory.createFSArray(aJCas, e.getValue())); - } - } - } - - private void printTags(String aType, AbstractComponent aComponent) - { - if (printTagSet && (aComponent instanceof AbstractStatisticalComponent)) { - AbstractStatisticalComponent component = (AbstractStatisticalComponent) aComponent; - - Set tagSet = new HashSet(); - - for (StringModel model : component.getModels()) { - tagSet.addAll(asList(model.getLabels())); - } - - List tagList = new ArrayList(tagSet); - Collections.sort(tagList); - - StringBuilder sb = new StringBuilder(); - sb.append("Model of " + aType + " contains [").append(tagList.size()) - .append("] tags: "); - - for (String tag : tagList) { - sb.append(tag); - sb.append(" "); - } - getContext().getLogger().log(INFO, sb.toString()); - } - } -} diff --git a/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpLemmatizer.java b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpLemmatizer.java new file mode 100644 index 0000000000..729a65df63 --- /dev/null +++ b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpLemmatizer.java @@ -0,0 +1,177 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.clearnlp; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.InputStream; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; + +import com.clearnlp.component.AbstractComponent; +import com.clearnlp.component.morph.DefaultMPAnalyzer; +import com.clearnlp.component.morph.EnglishMPAnalyzer; +import com.clearnlp.dependency.DEPNode; +import com.clearnlp.dependency.DEPTree; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Lemmatizer using Clear NLP. + */ +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "ClearNLP Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" } +) +public class ClearNlpLemmatizer + extends JCasAnnotator_ImplBase +{ + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false, defaultValue = "en") + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + private CasConfigurableProviderBase modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase(this, "clearnlp", "lemma") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/lemma-${language}-${variant}.properties"); + } + + @Override + protected AbstractComponent produceResource(InputStream aStream) + throws Exception + { + String lang = getAggregatedProperties().getProperty(LANGUAGE); + AbstractComponent lemmatizer; + if (lang.equals("en")) { + lemmatizer = new EnglishMPAnalyzer(aStream); + } + else { + lemmatizer = new DefaultMPAnalyzer(); + } + return lemmatizer; + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + + modelProvider.configure(aJCas.getCas()); + AbstractComponent analyzer = modelProvider.getResource(); + + // Iterate over all sentences + for (Sentence sentence : select(aJCas, Sentence.class)) { + List tokens = selectCovered(aJCas, Token.class, sentence); + + DEPTree tree = new DEPTree(); + + // Generate input format required by analyzer + for (int i = 0; i < tokens.size(); i++) { + Token t = tokens.get(i); + DEPNode node = new DEPNode(i + 1, tokens.get(i).getText()); + node.pos = t.getPos().getPosValue(); + tree.add(node); + } + + analyzer.process(tree); + + int i = 0; + for (Token t : tokens) { + DEPNode node = tree.get(i + 1); + String lemmaString = node.lemma; + if (lemmaString == null) { + lemmaString = t.getText(); + } + Lemma l = new Lemma(aJCas, t.getBegin(), t.getEnd()); + l.setValue(lemmaString); + l.addToIndexes(); + + t.setLemma(l); + i++; + } + } + } +} diff --git a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpParser.java b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpParser.java similarity index 84% rename from dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpParser.java rename to dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpParser.java index ef649eac19..aaf120252a 100644 --- a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpParser.java +++ b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpParser.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; +package org.dkpro.core.clearnlp; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; @@ -43,6 +43,10 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; import com.clearnlp.classification.model.StringModel; import com.clearnlp.component.dep.AbstractDEPParser; @@ -50,20 +54,21 @@ import com.clearnlp.dependency.DEPTree; import com.clearnlp.nlp.NLPGetter; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * CLEAR parser annotator. */ -@ResourceMetaData(name="ClearNLP Parser") +@Component(OperationType.DEPENDENCY_PARSER) +@ResourceMetaData(name = "ClearNLP Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -97,6 +102,20 @@ public class ClearNlpParser @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -116,6 +135,12 @@ public void initialize(UimaContext context) parserProvider = new ModelProviderBase(this, "clearnlp", "parser") { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/parser-${language}-${variant}.properties"); + } + @Override protected AbstractDEPParser produceResource(URL aUrl) throws IOException diff --git a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpPosTagger.java b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpPosTagger.java similarity index 76% rename from dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpPosTagger.java rename to dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpPosTagger.java index ec1df133b7..82cb81450d 100644 --- a/dkpro-core-clearnlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpPosTagger.java +++ b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpPosTagger.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; +package org.dkpro.core.clearnlp; import static java.util.Arrays.asList; import static org.apache.commons.io.IOUtils.closeQuietly; @@ -44,6 +44,14 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.CasConfigurableStreamProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; import com.clearnlp.classification.model.StringModel; import com.clearnlp.component.AbstractComponent; @@ -54,23 +62,19 @@ import com.clearnlp.dependency.DEPTree; import com.clearnlp.nlp.NLPGetter; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableStreamProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Part-of-Speech annotator using Clear NLP. Requires {@link Sentence}s to be annotated before. - * */ -@ResourceMetaData(name="ClearNLP POS-Tagger") +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "ClearNLP POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, @@ -107,6 +111,20 @@ public class ClearNlpPosTagger @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String posVariant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the pos-tagging model automatically. */ @@ -114,22 +132,23 @@ public class ClearNlpPosTagger @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String posModelLocation; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating the * mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid spaming - * the heap with thousands of strings representing only a few different tags. - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - /** * Log the tag set(s) when a model is loaded. */ @@ -152,12 +171,14 @@ public void initialize(UimaContext aContext) { setContextObject(ClearNlpPosTagger.this); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "${groupId}.clearnlp-model-dictionary-${language}-${variant}"); setDefault(LOCATION, - "classpath:/${package}/lib/dictionary-${language}-${variant}.properties"); + "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/dictionary-${language}-${variant}.properties"); setDefaultVariantsLocation("${package}/lib/dictionary-default-variants.map"); setDefault(VARIANT, "default"); + setOverride(ARTIFACT_URI, modelArtifactUri); setOverride(LOCATION, dictLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, dictVariant); @@ -178,7 +199,10 @@ protected InputStream produceResource(InputStream aStream) posTaggingModelProvider = new ModelProviderBase(this, "clearnlp", "tagger") { { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(VARIANT, "ontonotes"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/tagger-${language}-${variant}.properties"); } @Override @@ -191,16 +215,16 @@ protected AbstractPOSTagger produceResource(InputStream aStream) GZIPInputStream gis = null; try { - gis = new GZIPInputStream(aStream); bis = new BufferedInputStream(gis); ois = new ObjectInputStream(bis); String language = getAggregatedProperties().getProperty(LANGUAGE); AbstractPOSTagger tagger; - if(language.equals("en")){ + if (language.equals("en")) { tagger = new DkproPosTagger(ois); - }else{ + } + else { tagger = new DefaultPOSTagger(ois); } @@ -230,8 +254,8 @@ protected AbstractPOSTagger produceResource(InputStream aStream) }; - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, posTaggingModelProvider); + posMappingProvider = MappingProviderFactory.createPosMappingProvider(this, + posMappingLocation, language, posTaggingModelProvider); } @Override @@ -256,8 +280,8 @@ public void process(JCas aJCas) int i = 0; for (Token t : tokens) { - String tag = internTags ? posTags[i + 1].intern() : posTags[i + 1]; - Type posTag = posMappingProvider.getTagType(tag); + String tag = posTags[i + 1]; + Type posTag = posMappingProvider.getTagType(tag != null ? tag.intern() : null); POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); posAnno.setPosValue(tag); POSUtils.assignCoarseValue(posAnno); @@ -268,8 +292,9 @@ public void process(JCas aJCas) } } - private class DkproPosTagger extends EnglishPOSTagger{ - + private class DkproPosTagger + extends EnglishPOSTagger + { public DkproPosTagger(ObjectInputStream in) { super(in); @@ -280,6 +305,5 @@ protected void initMorphologicalAnalyzer() { mp_analyzer = new EnglishMPAnalyzer(dictModelProvider.getResource()); } - } } diff --git a/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpSegmenter.java b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpSegmenter.java new file mode 100644 index 0000000000..2575a6baf9 --- /dev/null +++ b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpSegmenter.java @@ -0,0 +1,152 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.clearnlp; + +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.StringReader; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import com.clearnlp.segmentation.AbstractSegmenter; +import com.clearnlp.segmentation.EnglishSegmenter; +import com.clearnlp.tokenization.EnglishTokenizer; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Tokenizer using Clear NLP. + */ +@ResourceMetaData(name = "ClearNLP Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability(value = "en") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) +public class ClearNlpSegmenter + extends SegmenterBase +{ + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + private CasConfigurableProviderBase modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase(this, "segmenter") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/segmenter-${language}-${variant}.properties"); + } + + @Override + protected AbstractSegmenter produceResource(InputStream aStream) + throws Exception + { + String lang = getAggregatedProperties().getProperty(LANGUAGE); + AbstractSegmenter segmenter; + if (lang.equals("en")) { + segmenter = new EnglishSegmenter(new EnglishTokenizer(aStream)); + } + else { + throw new ResourceInitializationException( + new Throwable("ClearNLP segmenter supports only English")); + } + return segmenter; + } + }; + } + + @Override + protected void process(JCas aJCas, String aText, int aZoneBegin) + throws AnalysisEngineProcessException + { + modelProvider.configure(aJCas.getCas()); + AbstractSegmenter segmenter = modelProvider.getResource(); + + List> sentences = segmenter + .getSentences(new BufferedReader(new StringReader(aText))); + + int sBegin = 0; + int sEnd = 0; + int tBegin = 0; + int tEnd = 0; + + for (List sentence : sentences) { + sBegin = -1; + + for (String token : sentence) { + tBegin = aText.indexOf(token, tEnd); + tEnd = tBegin + token.length(); + + if (sBegin == -1) { + sBegin = tBegin; + } + + createToken(aJCas, aZoneBegin + tBegin, aZoneBegin + tEnd); + } + sEnd = tEnd; + + createSentence(aJCas, aZoneBegin + sBegin, aZoneBegin + sEnd); + } + } +} diff --git a/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpSemanticRoleLabeler.java b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpSemanticRoleLabeler.java new file mode 100644 index 0000000000..75b648343e --- /dev/null +++ b/dkpro-core-clearnlp-asl/src/main/java/org/dkpro/core/clearnlp/ClearNlpSemanticRoleLabeler.java @@ -0,0 +1,444 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.clearnlp; + +import static java.util.Arrays.asList; +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.FSCollectionFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.CasConfigurableStreamProviderBase; + +import com.clearnlp.classification.model.StringModel; +import com.clearnlp.component.AbstractComponent; +import com.clearnlp.component.AbstractStatisticalComponent; +import com.clearnlp.dependency.DEPArc; +import com.clearnlp.dependency.DEPLib; +import com.clearnlp.dependency.DEPNode; +import com.clearnlp.dependency.DEPTree; +import com.clearnlp.nlp.NLPGetter; +import com.clearnlp.nlp.NLPMode; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; +import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; +import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * ClearNLP semantic role labeller. + */ +@Component(OperationType.ANNOTATOR_OF_SEMANTIC_ROLE_LABELS) +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@ResourceMetaData(name = "ClearNLP Semantic Role Labeler") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", + "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg"} + ) +public class ClearNlpSemanticRoleLabeler + extends JCasAnnotator_ImplBase +{ + /** + * Write the tag set(s) to the log when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Variant of a model the model. Used to address a specific model if here are multiple models + * for one language. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * Location from which the predicate identifier model is read. + */ + public static final String PARAM_PRED_MODEL_LOCATION = "predModelLocation"; + @ConfigurationParameter(name = PARAM_PRED_MODEL_LOCATION, mandatory = false) + protected String predModelLocation; + + /** + * Location from which the roleset classification model is read. + */ + public static final String PARAM_ROLE_MODEL_LOCATION = "roleModelLocation"; + @ConfigurationParameter(name = PARAM_ROLE_MODEL_LOCATION, mandatory = false) + protected String roleModelLocation; + + /** + * Location from which the semantic role labeling model is read. + */ + public static final String PARAM_SRL_MODEL_LOCATION = "srlModelLocation"; + @ConfigurationParameter(name = PARAM_SRL_MODEL_LOCATION, mandatory = false) + protected String srlModelLocation; + + /** + *

Normally the arguments point only to the head words of arguments in the dependency tree. + * With this option enabled, they are expanded to the text covered by the minimal and maximal + * token offsets of all descendants (or self) of the head word.

+ * + *

Warning: this parameter should be used with caution! For one, if the descentants of a + * head word cover a non-continuous region of the text, this information is lost. The arguments + * will appear to be spanning a continuous region. For another, the arguments may overlap with + * each other. E.g. if a sentence contains a relative clause with a verb, the subject of the + * main clause may be recognized as a dependent of the verb and may cause the whole main + * clause to be recorded in the argument.

+ */ + public static final String PARAM_EXPAND_ARGUMENTS = "expandArguments"; + @ConfigurationParameter(name = PARAM_EXPAND_ARGUMENTS, mandatory = true, defaultValue = "false") + protected boolean expandArguments; + + + private CasConfigurableProviderBase predicateFinder; + + private CasConfigurableProviderBase roleSetClassifier; + + private CasConfigurableProviderBase roleLabeller; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + predicateFinder = new CasConfigurableStreamProviderBase() + { + { + setContextObject(ClearNlpSemanticRoleLabeler.this); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(ARTIFACT_ID, "${groupId}.clearnlp-model-pred-${language}-${variant}"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/pred-${language}-${variant}.properties"); + setDefault(VARIANT, "ontonotes"); + + setOverride(LOCATION, predModelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected AbstractComponent produceResource(InputStream aStream) + throws Exception + { + BufferedInputStream bis = null; + ObjectInputStream ois = null; + GZIPInputStream gis = null; + try { + gis = new GZIPInputStream(aStream); + bis = new BufferedInputStream(gis); + ois = new ObjectInputStream(bis); + AbstractComponent component = NLPGetter.getComponent(ois, + getAggregatedProperties().getProperty(LANGUAGE), NLPMode.MODE_PRED); + printTags(NLPMode.MODE_PRED, component); + return component; + } + catch (Exception e) { + throw new IOException(e); + } + finally { + closeQuietly(ois); + closeQuietly(bis); + closeQuietly(gis); + } + } + }; + + roleSetClassifier = new CasConfigurableStreamProviderBase() + { + { + setContextObject(ClearNlpSemanticRoleLabeler.this); + + setDefault(ARTIFACT_ID, "${groupId}.clearnlp-model-role-${language}-${variant}"); + setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/" + + "role-${language}-${variant}.properties"); + setDefault(VARIANT, "ontonotes"); + + setOverride(LOCATION, roleModelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected AbstractComponent produceResource(InputStream aStream) + throws Exception + { + BufferedInputStream bis = null; + ObjectInputStream ois = null; + GZIPInputStream gis = null; + try { + gis = new GZIPInputStream(aStream); + bis = new BufferedInputStream(gis); + ois = new ObjectInputStream(bis); + AbstractComponent component = NLPGetter.getComponent(ois, + getAggregatedProperties().getProperty(LANGUAGE), NLPMode.MODE_ROLE); + + printTags(NLPMode.MODE_ROLE, component); + return component; + } + catch (Exception e) { + throw new IOException(e); + } + finally { + closeQuietly(ois); + closeQuietly(bis); + closeQuietly(gis); + } + } + }; + + roleLabeller = new CasConfigurableStreamProviderBase() + { + { + setContextObject(ClearNlpSemanticRoleLabeler.this); + + setDefault(ARTIFACT_ID, "${groupId}.clearnlp-model-srl-${language}-${variant}"); + setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/" + + "srl-${language}-${variant}.properties"); + setDefault(VARIANT, "ontonotes"); + + setOverride(LOCATION, srlModelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected AbstractComponent produceResource(InputStream aStream) + throws Exception + { + BufferedInputStream bis = null; + ObjectInputStream ois = null; + GZIPInputStream gis = null; + try { + gis = new GZIPInputStream(aStream); + bis = new BufferedInputStream(gis); + ois = new ObjectInputStream(bis); + AbstractComponent component = NLPGetter.getComponent(ois, + getAggregatedProperties().getProperty(LANGUAGE), NLPMode.MODE_SRL); + printTags(NLPMode.MODE_SRL, component); + return component; + } + catch (Exception e) { + throw new IOException(e); + } + finally { + closeQuietly(ois); + closeQuietly(bis); + closeQuietly(gis); + } + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + predicateFinder.configure(aJCas.getCas()); + roleSetClassifier.configure(aJCas.getCas()); + roleLabeller.configure(aJCas.getCas()); + + // Iterate over all sentences + for (Sentence sentence : select(aJCas, Sentence.class)) { + List tokens = selectCovered(aJCas, Token.class, sentence); + DEPTree tree = new DEPTree(); + + // Generate: + // - DEPNode + // - pos tags + // - lemma + for (int i = 0; i < tokens.size(); i++) { + Token t = tokens.get(i); + DEPNode node = new DEPNode(i + 1, tokens.get(i).getText()); + node.pos = t.getPos().getPosValue(); + node.lemma = t.getLemma().getValue(); + tree.add(node); + } + + // Generate: + // Dependency relations + for (Dependency dep : selectCovered(Dependency.class, sentence)) { + if (dep instanceof ROOT) { + // #736 ClearNlpSemanticRoleLabelerTest gets caught in infinite loop + // ClearNLP parser creates roots that do not have a head. We have to replicate + // this here to avoid running into an endless loop. + continue; + } + + int headIndex = tokens.indexOf(dep.getGovernor()); + int tokenIndex = tokens.indexOf(dep.getDependent()); + + DEPNode token = tree.get(tokenIndex + 1); + DEPNode head = tree.get(headIndex + 1); + + token.setHead(head, dep.getDependencyType()); + } + + // For the root node + for (int i = 0; i < tokens.size(); i++) { + DEPNode parserNode = tree.get(i + 1); + if (parserNode.getLabel() == null) { + int headIndex = tokens.indexOf(null); + DEPNode head = tree.get(headIndex + 1); + parserNode.setHead(head, "root"); + } + } + + // Do the SRL + predicateFinder.getResource().process(tree); + roleSetClassifier.getResource().process(tree); + roleLabeller.getResource().process(tree); + + // Convert the results into UIMA annotations + Map predicates = new HashMap<>(); + Map> predArgs = new HashMap<>(); + + for (int i = 0; i < tokens.size(); i++) { + DEPNode parserNode = tree.get(i + 1); + Token argumentToken = tokens.get(i); + + for (DEPArc argPredArc : parserNode.getSHeads()) { + Token predToken = tokens.get(argPredArc.getNode().id - 1); + + // Instantiate the semantic predicate annotation if it hasn't been done yet + SemPred pred = predicates.get(predToken); + if (pred == null) { + // Create the semantic predicate annotation itself + pred = new SemPred(aJCas, predToken.getBegin(), predToken.getEnd()); + pred.setCategory(argPredArc.getNode().getFeat(DEPLib.FEAT_PB)); + pred.addToIndexes(); + predicates.put(predToken, pred); + + // Prepare a list to store its arguments + predArgs.put(pred, new ArrayList<>()); + } + + // Instantiate the semantic argument annotation + SemArg arg = new SemArg(aJCas); + + if (expandArguments) { + List descendents = parserNode.getDescendents(Integer.MAX_VALUE) + .stream() + .map(arc -> arc.getNode()) + .collect(Collectors.toList()); + descendents.add(parserNode); + List descTokens = descendents.stream() + .map(node -> tokens.get(node.id - 1)) + .collect(Collectors.toList()); + int begin = descTokens.stream().mapToInt(t -> t.getBegin()).min() + .getAsInt(); + int end = descTokens.stream().mapToInt(t -> t.getEnd()).max().getAsInt(); + arg.setBegin(begin); + arg.setEnd(end); + } + else { + arg.setBegin(argumentToken.getBegin()); + arg.setEnd(argumentToken.getEnd()); + } + + arg.addToIndexes(); + + SemArgLink link = new SemArgLink(aJCas); + link.setRole(argPredArc.getLabel()); + link.setTarget(arg); + + // Remember to which predicate this argument belongs + predArgs.get(pred).add(link); + } + } + + for (Entry> e : predArgs.entrySet()) { + e.getKey().setArguments(FSCollectionFactory.createFSArray(aJCas, e.getValue())); + } + } + } + + private void printTags(String aType, AbstractComponent aComponent) + { + if (printTagSet && (aComponent instanceof AbstractStatisticalComponent)) { + AbstractStatisticalComponent component = (AbstractStatisticalComponent) aComponent; + + Set tagSet = new HashSet(); + + for (StringModel model : component.getModels()) { + tagSet.addAll(asList(model.getLabels())); + } + + List tagList = new ArrayList(tagSet); + Collections.sort(tagList); + + StringBuilder sb = new StringBuilder(); + sb.append("Model of " + aType + " contains [").append(tagList.size()) + .append("] tags: "); + + for (String tag : tagList) { + sb.append(tag); + sb.append(" "); + } + getContext().getLogger().log(INFO, sb.toString()); + } + } +} diff --git a/dkpro-core-clearnlp-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/parser-default-variants.map b/dkpro-core-clearnlp-asl/src/main/resources/org/dkpro/core/clearnlp/lib/parser-default-variants.map similarity index 100% rename from dkpro-core-clearnlp-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/clearnlp/lib/parser-default-variants.map rename to dkpro-core-clearnlp-asl/src/main/resources/org/dkpro/core/clearnlp/lib/parser-default-variants.map diff --git a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpParserTest.java b/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpParserTest.java deleted file mode 100644 index be177d829f..0000000000 --- a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpParserTest.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; -import de.tudarmstadt.ukp.dkpro.core.testing.dumper.DependencyDumper; - -public class ClearNlpParserTest -{ - static final String documentEnglish = "We need a very complicated example sentence , which " + - "contains as many constituents and dependencies as possible ."; - - @Test - public void testEnglishDependencies() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); - - JCas jcas = runTest("en", null, documentEnglish); - - String[] dependencies = new String[] { - "[ 0, 2]Dependency(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(ROOT,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]Dependency(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]Dependency(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]Dependency(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]Dependency(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]Dependency(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 44, 45]Dependency(punct,basic) D[44,45](,) G[35,43](sentence)", - "[ 46, 51]Dependency(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]Dependency(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 61, 63]Dependency(prep,basic) D[61,63](as) G[52,60](contains)", - "[ 64, 68]Dependency(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]Dependency(pobj,basic) D[69,81](constituents) G[61,63](as)", - "[ 82, 85]Dependency(cc,basic) D[82,85](and) G[69,81](constituents)", - "[ 86, 98]Dependency(conj,basic) D[86,98](dependencies) G[69,81](constituents)", - "[ 99,101]Dependency(prep,basic) D[99,101](as) G[86,98](dependencies)", - "[102,110]Dependency(amod,basic) D[102,110](possible) G[99,101](as)", - "[111,112]Dependency(punct,basic) D[111,112](.) G[3,7](need)" }; - - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - } - - @Test - public void testEnglishMayo() - throws Exception - { -// Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1200000000l); - - JCas jcas = runTest("en", "mayo", documentEnglish); - - String[] dependencies = new String[] { - "[ 0, 2]Dependency(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(ROOT,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]Dependency(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]Dependency(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]Dependency(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]Dependency(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]Dependency(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 44, 45]Dependency(punct,basic) D[44,45](,) G[35,43](sentence)", - "[ 46, 51]Dependency(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]Dependency(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 61, 63]Dependency(prep,basic) D[61,63](as) G[52,60](contains)", - "[ 64, 68]Dependency(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]Dependency(pobj,basic) D[69,81](constituents) G[61,63](as)", - "[ 82, 85]Dependency(cc,basic) D[82,85](and) G[69,81](constituents)", - "[ 86, 98]Dependency(conj,basic) D[86,98](dependencies) G[69,81](constituents)", - "[ 99,101]Dependency(mark,basic) D[99,101](as) G[102,110](possible)", - "[102,110]Dependency(advcl,basic) D[102,110](possible) G[52,60](contains)", - "[111,112]Dependency(punct,basic) D[111,112](.) G[3,7](need)" }; - - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - } - - private JCas runTest(String aLanguage, String aVariant, String aText) - throws Exception - { - AnalysisEngineDescription engine = createEngineDescription( - createEngineDescription(OpenNlpPosTagger.class), - createEngineDescription(ClearNlpLemmatizer.class), - createEngineDescription(ClearNlpParser.class, - ClearNlpParser.PARAM_VARIANT, aVariant, - ClearNlpParser.PARAM_PRINT_TAGSET, true), - createEngineDescription(DependencyDumper.class)); - - return TestRunner.runTest(engine, aLanguage, aText); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpPosTaggerTest.java b/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpPosTaggerTest.java deleted file mode 100644 index cdf85ca7c7..0000000000 --- a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpPosTaggerTest.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class ClearNlpPosTaggerTest -{ - @Test - public void testEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1200000000l); - - runTest("en", null, "This is a test . \n", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "A neural net . \n", - new String[] { "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "John is purchasing oranges . \n", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testEnglishMayo() - throws Exception - { - runTest("en", "mayo", "This is a test . \n", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", "mayo", "A neural net . \n", - new String[] { "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", "mayo", "John is purchasing oranges . \n", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } - - private void runTest(String language, String variant, String testDocument, String[] tags, - String[] tagClasses) - throws Exception - { - AnalysisEngine engine = createEngine(ClearNlpPosTagger.class, - ClearNlpPosTagger.PARAM_VARIANT, variant, - ClearNlpPosTagger.PARAM_PRINT_TAGSET, true); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); - - @Before - public void clearMemory() - { - Runtime.getRuntime().gc(); - Runtime.getRuntime().gc(); - Runtime.getRuntime().gc(); - } -} diff --git a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSemanticRoleLabelerTest.java b/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSemanticRoleLabelerTest.java deleted file mode 100644 index 1f53bdb48b..0000000000 --- a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSemanticRoleLabelerTest.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.commons.lang3.ArrayUtils; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class ClearNlpSemanticRoleLabelerTest -{ - static final String documentEnglish = "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible ."; - - @Test - public void testEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); - - JCas jcas = runTest("en", null, documentEnglish); - - String[] predicates = { - "contains (contain.01): [(A0:sentence)(A1:as)(R-A0:which)]", - "need (need.01): [(A0:We)(A1:sentence)]" }; - - AssertAnnotations.assertSemPred(predicates, select(jcas, SemPred.class)); - } - - @Test - public void testEnglishExpand() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); - - JCas jcas = runTest("en", null, documentEnglish, - ClearNlpSemanticRoleLabeler.PARAM_EXPAND_ARGUMENTS, true); - - String[] predicates = { - "contains (contain.01): [" - + "(A0:a very complicated example sentence , which contains as many constituents and dependencies as possible)" - + "(A1:as many constituents and dependencies as possible)" - + "(R-A0:which)]", - "need (need.01): [" - + "(A0:We)" - + "(A1:a very complicated example sentence , which contains as many constituents and dependencies as possible)]" }; - - AssertAnnotations.assertSemPred(predicates, select(jcas, SemPred.class)); - } - - @Test - public void testEnglishExpand2() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); - - JCas jcas = runTest("en", null, "The man was sued by Jacqueline Kennedy Onassis .", - ClearNlpSemanticRoleLabeler.PARAM_EXPAND_ARGUMENTS, true); - - String[] predicates = { "sued (sue.01): [(A0:by Jacqueline Kennedy Onassis)(A1:The man)]" }; - - AssertAnnotations.assertSemPred(predicates, select(jcas, SemPred.class)); - } - @Test - public void testEnglishMayo() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); - - JCas jcas = runTest("en", "mayo", documentEnglish); - - String[] predicates = { - "contains (contain.01): [(A0:sentence)(A1:as)(R-A0:which)]", - "need (need.01): [(A0:We)(A1:sentence)]" }; - - AssertAnnotations.assertSemPred(predicates, select(jcas, SemPred.class)); - } - - private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) - throws Exception - { - Object[] params = new Object[] { - ClearNlpParser.PARAM_VARIANT, aVariant, - ClearNlpParser.PARAM_PRINT_TAGSET, true}; - params = ArrayUtils.addAll(params, aExtraParams); - - AnalysisEngineDescription engine = createEngineDescription( - createEngineDescription(OpenNlpPosTagger.class), - createEngineDescription(ClearNlpLemmatizer.class), - createEngineDescription(ClearNlpParser.class), - createEngineDescription(ClearNlpSemanticRoleLabeler.class, params)); - - return TestRunner.runTest(engine, aLanguage, aText); - } - - - @Rule - public DkproTestContext testContext = new DkproTestContext(); - - @Before - public void freeMemory() - { - Runtime.getRuntime().gc(); - Runtime.getRuntime().gc(); - Runtime.getRuntime().gc(); - } -} diff --git a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpLemmatizerTest.java b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpLemmatizerTest.java similarity index 91% rename from dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpLemmatizerTest.java rename to dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpLemmatizerTest.java index 61a204d037..52fa50e595 100644 --- a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpLemmatizerTest.java +++ b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpLemmatizerTest.java @@ -15,20 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; +package org.dkpro.core.clearnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class ClearNlpLemmatizerTest { diff --git a/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpParserTest.java b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpParserTest.java new file mode 100644 index 0000000000..5e142ed32a --- /dev/null +++ b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpParserTest.java @@ -0,0 +1,119 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.clearnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.dkpro.core.testing.dumper.DependencyDumper; +import org.junit.Assume; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; + +public class ClearNlpParserTest +{ + static final String documentEnglish = "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."; + + @Test + public void testEnglishDependencies() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); + + JCas jcas = runTest("en", null, documentEnglish); + + String[] dependencies = new String[] { + "[ 0, 2]Dependency(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(ROOT,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]Dependency(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]Dependency(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]Dependency(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]Dependency(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]Dependency(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 44, 45]Dependency(punct,basic) D[44,45](,) G[35,43](sentence)", + "[ 46, 51]Dependency(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]Dependency(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 61, 63]Dependency(prep,basic) D[61,63](as) G[52,60](contains)", + "[ 64, 68]Dependency(amod,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]Dependency(pobj,basic) D[69,81](constituents) G[61,63](as)", + "[ 82, 85]Dependency(cc,basic) D[82,85](and) G[69,81](constituents)", + "[ 86, 98]Dependency(conj,basic) D[86,98](dependencies) G[69,81](constituents)", + "[ 99,101]Dependency(prep,basic) D[99,101](as) G[86,98](dependencies)", + "[102,110]Dependency(amod,basic) D[102,110](possible) G[99,101](as)", + "[111,112]Dependency(punct,basic) D[111,112](.) G[3,7](need)" }; + + AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); + } + + @Test + public void testEnglishMayo() + throws Exception + { +// Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1200000000l); + + JCas jcas = runTest("en", "mayo", documentEnglish); + + String[] dependencies = new String[] { + "[ 0, 2]Dependency(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(ROOT,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]Dependency(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]Dependency(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]Dependency(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]Dependency(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]Dependency(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 44, 45]Dependency(punct,basic) D[44,45](,) G[35,43](sentence)", + "[ 46, 51]Dependency(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]Dependency(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 61, 63]Dependency(prep,basic) D[61,63](as) G[52,60](contains)", + "[ 64, 68]Dependency(amod,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]Dependency(pobj,basic) D[69,81](constituents) G[61,63](as)", + "[ 82, 85]Dependency(cc,basic) D[82,85](and) G[69,81](constituents)", + "[ 86, 98]Dependency(conj,basic) D[86,98](dependencies) G[69,81](constituents)", + "[ 99,101]Dependency(mark,basic) D[99,101](as) G[102,110](possible)", + "[102,110]Dependency(advcl,basic) D[102,110](possible) G[52,60](contains)", + "[111,112]Dependency(punct,basic) D[111,112](.) G[3,7](need)" }; + + AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); + } + + private JCas runTest(String aLanguage, String aVariant, String aText) + throws Exception + { + AnalysisEngineDescription engine = createEngineDescription( + createEngineDescription(OpenNlpPosTagger.class), + createEngineDescription(ClearNlpLemmatizer.class), + createEngineDescription(ClearNlpParser.class, + ClearNlpParser.PARAM_VARIANT, aVariant, + ClearNlpParser.PARAM_PRINT_TAGSET, true), + createEngineDescription(DependencyDumper.class)); + + return TestRunner.runTest(engine, aLanguage, aText); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpPosTaggerTest.java b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpPosTaggerTest.java new file mode 100644 index 0000000000..8168e02353 --- /dev/null +++ b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpPosTaggerTest.java @@ -0,0 +1,96 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.clearnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Assume; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; + +public class ClearNlpPosTaggerTest +{ + @Test + public void testEnglish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1200000000l); + + runTest("en", null, "This is a test . \n", + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", null, "A neural net . \n", + new String[] { "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", null, "John is purchasing oranges . \n", + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testEnglishMayo() + throws Exception + { + runTest("en", "mayo", "This is a test . \n", + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", "mayo", "A neural net . \n", + new String[] { "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", "mayo", "John is purchasing oranges . \n", + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + } + + private void runTest(String language, String variant, String testDocument, String[] tags, + String[] tagClasses) + throws Exception + { + AnalysisEngine engine = createEngine(ClearNlpPosTagger.class, + ClearNlpPosTagger.PARAM_VARIANT, variant, + ClearNlpPosTagger.PARAM_PRINT_TAGSET, true); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); + + @Before + public void clearMemory() + { + Runtime.getRuntime().gc(); + Runtime.getRuntime().gc(); + Runtime.getRuntime().gc(); + } +} diff --git a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSegmenterTest.java b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpSegmenterTest.java similarity index 82% rename from dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSegmenterTest.java rename to dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpSegmenterTest.java index 64e0014c03..a3102c4c85 100644 --- a/dkpro-core-clearnlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/clearnlp/ClearNlpSegmenterTest.java +++ b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpSegmenterTest.java @@ -15,12 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.clearnlp; +package org.dkpro.core.clearnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; import java.util.ArrayList; import java.util.List; @@ -29,27 +29,27 @@ import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; public class ClearNlpSegmenterTest { - @Test - public void run() throws Throwable - { - AnalysisEngineDescription aed = createEngineDescription(ClearNlpSegmenter.class); + @Test + public void run() throws Throwable + { + AnalysisEngineDescription aed = createEngineDescription(ClearNlpSegmenter.class); SegmenterHarness.run(aed, "de.1", "de.2", "de.3", "de.4", "en.1", "en.7", "en.9", "ar.1", "zh.1", "zh.2"); - } + } - /** - * We had a bug where the token offsets were assigned wrong when one word was a suffix of the - * previous word. - */ + /** + * We had a bug where the token offsets were assigned wrong when one word was a suffix of the + * previous word. + */ @Test public void testSuffix() throws Exception { diff --git a/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpSemanticRoleLabelerTest.java b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpSemanticRoleLabelerTest.java new file mode 100644 index 0000000000..b9dcbbec8a --- /dev/null +++ b/dkpro-core-clearnlp-asl/src/test/java/org/dkpro/core/clearnlp/ClearNlpSemanticRoleLabelerTest.java @@ -0,0 +1,135 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.clearnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.commons.lang3.ArrayUtils; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Assume; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; + +public class ClearNlpSemanticRoleLabelerTest +{ + static final String documentEnglish = "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."; + + @Test + public void testEnglish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); + + JCas jcas = runTest("en", null, documentEnglish); + + String[] predicates = { + "contains (contain.01): [(A0:sentence)(A1:as)(R-A0:which)]", + "need (need.01): [(A0:We)(A1:sentence)]" }; + + AssertAnnotations.assertSemPred(predicates, select(jcas, SemPred.class)); + } + + @Test + public void testEnglishExpand() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); + + JCas jcas = runTest("en", null, documentEnglish, + ClearNlpSemanticRoleLabeler.PARAM_EXPAND_ARGUMENTS, true); + + String[] predicates = { + "contains (contain.01): [" + + "(A0:a very complicated example sentence , which contains as many constituents and dependencies as possible)" + + "(A1:as many constituents and dependencies as possible)" + + "(R-A0:which)]", + "need (need.01): [" + + "(A0:We)" + + "(A1:a very complicated example sentence , which contains as many constituents and dependencies as possible)]" + }; + + AssertAnnotations.assertSemPred(predicates, select(jcas, SemPred.class)); + } + + @Test + public void testEnglishExpand2() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); + + JCas jcas = runTest("en", null, "The man was sued by Jacqueline Kennedy Onassis .", + ClearNlpSemanticRoleLabeler.PARAM_EXPAND_ARGUMENTS, true); + + String[] predicates = { "sued (sue.01): [(A0:by Jacqueline Kennedy Onassis)(A1:The man)]" }; + + AssertAnnotations.assertSemPred(predicates, select(jcas, SemPred.class)); + } + @Test + public void testEnglishMayo() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); + + JCas jcas = runTest("en", "mayo", documentEnglish); + + String[] predicates = { + "contains (contain.01): [(A0:sentence)(A1:as)(R-A0:which)]", + "need (need.01): [(A0:We)(A1:sentence)]" }; + + AssertAnnotations.assertSemPred(predicates, select(jcas, SemPred.class)); + } + + private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) + throws Exception + { + Object[] params = new Object[] { + ClearNlpParser.PARAM_VARIANT, aVariant, + ClearNlpParser.PARAM_PRINT_TAGSET, true}; + params = ArrayUtils.addAll(params, aExtraParams); + + AnalysisEngineDescription engine = createEngineDescription( + createEngineDescription(OpenNlpPosTagger.class), + createEngineDescription(ClearNlpLemmatizer.class), + createEngineDescription(ClearNlpParser.class), + createEngineDescription(ClearNlpSemanticRoleLabeler.class, params)); + + return TestRunner.runTest(engine, aLanguage, aText); + } + + + @Rule + public DkproTestContext testContext = new DkproTestContext(); + + @Before + public void freeMemory() + { + Runtime.getRuntime().gc(); + Runtime.getRuntime().gc(); + Runtime.getRuntime().gc(); + } +} diff --git a/dkpro-core-clearnlp-asl/src/test/resources/log4j.properties b/dkpro-core-clearnlp-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-clearnlp-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-clearnlp-asl/src/test/resources/log4j2.xml b/dkpro-core-clearnlp-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-clearnlp-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-cogroo-asl/pom.xml b/dkpro-core-cogroo-asl/pom.xml index cc5cad6dd6..ada5136916 100644 --- a/dkpro-core-cogroo-asl/pom.xml +++ b/dkpro-core-cogroo-asl/pom.xml @@ -15,18 +15,20 @@ See the License for the specific language governing permissions and limitations under the License. --> - 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.cogroo-asl + dkpro-core-cogroo-asl jar DKPro Core ASL - CoGrOO (EXPERIMENTAL) + https://dkpro.github.io/dkpro-core/ 4.0.0 @@ -60,32 +62,36 @@ ${cogroo.version}
- de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-ner-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-anomaly-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.anomaly-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + eu.openminted.share.annotations + omtd-share-annotations-api - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooChecker.java b/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooChecker.java deleted file mode 100644 index 818c77e0ee..0000000000 --- a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooChecker.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import java.io.IOException; -import java.net.URL; -import java.util.Locale; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.cogroo.analyzer.ComponentFactory; -import org.cogroo.checker.CheckDocument; -import org.cogroo.checker.GrammarChecker; -import org.cogroo.entities.Mistake; - -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; - -/** - * Detect grammatical errors in text using CoGrOO. - */ -@ResourceMetaData(name="CoGrOO Grammar Checker") -@LanguageCapability("pt") -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly" }) -public class CogrooChecker - extends JCasAnnotator_ImplBase -{ - public static enum DetailLevel { - SHORT, LONG, FULL - } - - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - private String language; - - /** - * Set detail level. - */ - public static final String PARAM_DETAIL_LEVEL = "detailLevel"; - @ConfigurationParameter(name = PARAM_DETAIL_LEVEL, mandatory = true, defaultValue="SHORT") - private DetailLevel detailLevel; - - private ModelProviderBase modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase() { - { - setContextObject(CogrooChecker.this); - setDefault(LOCATION, NOT_REQUIRED); - - setOverride(LANGUAGE, language); - } - - @Override - protected GrammarChecker produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - if (!"pt".equals(props.getProperty(LANGUAGE))) { - throw new IOException("The language code '" - + props.getProperty(LANGUAGE) + "' is not supported by LanguageTool."); - } - - ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR")); - return new GrammarChecker(factory.createPipe()); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - modelProvider.configure(aJCas.getCas()); - - // get document text - CheckDocument document = new CheckDocument(aJCas.getDocumentText()); - - modelProvider.getResource().analyze(document); - for (Mistake match : document.getMistakes()) { - // create annotation - GrammarAnomaly annotation = new GrammarAnomaly(aJCas); - annotation.setBegin(match.getStart()); - annotation.setEnd(match.getEnd()); - switch (detailLevel) { - case SHORT: - annotation.setDescription(match.getShortMessage()); - break; - case LONG: - annotation.setDescription(match.getLongMessage()); - break; - case FULL: - annotation.setDescription(match.getFullMessage()); - break; - } - annotation.addToIndexes(); - if (getLogger().isTraceEnabled()) { - getLogger().trace("Found: " + annotation); - } - } - } -} diff --git a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooFeaturizer.java b/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooFeaturizer.java deleted file mode 100644 index ceed7538a4..0000000000 --- a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooFeaturizer.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Locale; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.cogroo.analyzer.Analyzer; -import org.cogroo.analyzer.ComponentFactory; -import org.cogroo.text.Document; -import org.cogroo.text.impl.DocumentImpl; -import org.cogroo.text.impl.SentenceImpl; -import org.cogroo.text.impl.TokenImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Morphological analyzer using CoGrOO. - */ -@ResourceMetaData(name="CoGrOO Morphological Analyzer") -@LanguageCapability("pt") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures" }) - -public class CogrooFeaturizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - private CasConfigurableProviderBase modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase() { - { - setContextObject(CogrooFeaturizer.this); - - setDefault(LOCATION, NOT_REQUIRED); - setOverride(LANGUAGE, language); - } - - @Override - protected Analyzer produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - - String language = props.getProperty(LANGUAGE); - - if (!"pt".equals(language)) { - throw new IOException("The language code '" + language - + "' is not supported by LanguageTool."); - } - - ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR")); - - return factory.createFeaturizer(); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - modelProvider.configure(cas); - - // This is actually quite some overhead, because internally Cogroo is just using a - // OpenNLP classifier which simply takes a token and pos tag and returnes a list of - // features. It would be much more efficient to use the classifier directly. - - for (Sentence sentence : select(aJCas, Sentence.class)) { - // We set up one CoGrOO document for each sentence. That makes it easier to maintain - // a list of tokens of the sentence, which we later need to attached the lemmata to the - // tokens. - - // Construct the document - Document doc = new DocumentImpl(); - doc.setText(aJCas.getDocumentText()); - - // Extract the sentence and its tokens - org.cogroo.text.Sentence cSent = new SentenceImpl(sentence.getBegin(), sentence.getEnd(), doc); - List cTokens = new ArrayList(); - List dTokens = selectCovered(Token.class, sentence); - for (Token dTok : dTokens) { - TokenImpl cTok = new TokenImpl(dTok.getBegin() - sentence.getBegin(), - dTok.getEnd() - sentence.getBegin(), dTok.getText()); - cTok.setPOSTag(dTok.getPos().getPosValue()); - cTokens.add(cTok); - } - cSent.setTokens(cTokens); - doc.setSentences(asList(cSent)); - - // Process - modelProvider.getResource().analyze(doc); - - assert cSent.getTokens().size() == dTokens.size(); - - // Convert from CoGrOO to UIMA model - Iterator dTokIt = dTokens.iterator(); - for (org.cogroo.text.Token cTok : cSent.getTokens()) { - Token dTok = dTokIt.next(); - MorphologicalFeatures m = new MorphologicalFeatures(aJCas, cSent.getStart() - + cTok.getStart(), cSent.getStart() + cTok.getEnd()); - m.setValue(cTok.getFeatures()); - m.addToIndexes(); - dTok.setMorph(m); - } - } - } -} diff --git a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooLemmatizer.java b/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooLemmatizer.java deleted file mode 100644 index c0be968139..0000000000 --- a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooLemmatizer.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Locale; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.cogroo.analyzer.Analyzer; -import org.cogroo.analyzer.ComponentFactory; -import org.cogroo.text.Document; -import org.cogroo.text.impl.DocumentImpl; -import org.cogroo.text.impl.SentenceImpl; -import org.cogroo.text.impl.TokenImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Lemmatizer using CoGrOO. - */ -@ResourceMetaData(name="CoGrOO Lemmatizer") -@LanguageCapability("pt") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) - -public class CogrooLemmatizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - private CasConfigurableProviderBase modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase() { - { - setContextObject(CogrooLemmatizer.this); - - setDefault(LOCATION, NOT_REQUIRED); - setOverride(LANGUAGE, language); - } - - @Override - protected Analyzer produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - - String language = props.getProperty(LANGUAGE); - - if (!"pt".equals(language)) { - throw new IOException("The language code '" + language - + "' is not supported by LanguageTool."); - } - - ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR")); - - return factory.createLemmatizer(); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - modelProvider.configure(cas); - - // This is actually quite some overhead, because internally Cogroo is just using a - // Morphlogik dictionary which simply takes a token and pos tag and returnes a list of - // lemmata. It would be much more efficient to use the dictionary directly. - - for (Sentence sentence : select(aJCas, Sentence.class)) { - // We set up one CoGrOO document for each sentence. That makes it easier to maintain - // a list of tokens of the sentence, which we later need to attached the lemmata to the - // tokens. - - // Construct the document - Document doc = new DocumentImpl(); - doc.setText(aJCas.getDocumentText()); - - // Extract the sentence and its tokens - org.cogroo.text.Sentence cSent = new SentenceImpl(sentence.getBegin(), sentence.getEnd(), doc); - List cTokens = new ArrayList(); - List dTokens = selectCovered(Token.class, sentence); - for (Token dTok : dTokens) { - TokenImpl cTok = new TokenImpl(dTok.getBegin() - sentence.getBegin(), - dTok.getEnd() - sentence.getBegin(), dTok.getText()); - cTok.setPOSTag(dTok.getPos().getPosValue()); - cTokens.add(cTok); - } - cSent.setTokens(cTokens); - doc.setSentences(asList(cSent)); - - // Process - modelProvider.getResource().analyze(doc); - - assert cSent.getTokens().size() == dTokens.size(); - - // Convert from CoGrOO to UIMA model - Iterator dTokIt = dTokens.iterator(); - for (org.cogroo.text.Token cTok : cSent.getTokens()) { - // CoGrOO allows storing multiple lemmas per token. DKPro Core only allows one lemma - // per token. We just take the first one here. If we would run the grammar - // checking based on the DKPro Core lemmata, we might miss certain errors for this - // reason. - Token dTok = dTokIt.next(); - String[] lemmas = cTok.getLemmas(); - Lemma l = new Lemma(aJCas, cSent.getStart() + cTok.getStart(), cSent.getStart() + cTok.getEnd()); - if (lemmas != null && lemmas.length > 0) { - String lemmaString = lemmas[0]; - if (lemmaString == null) { - lemmaString = dTok.getText(); - } - l.setValue(lemmaString); - } - else { - l.setValue(cTok.getLexeme()); - } - l.addToIndexes(); - dTok.setLemma(l); - } - } - } -} diff --git a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooNamedEntityRecognizer.java b/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooNamedEntityRecognizer.java deleted file mode 100644 index 9c6abd4b25..0000000000 --- a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooNamedEntityRecognizer.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.cogroo.analyzer.Analyzer; -import org.cogroo.analyzer.ComponentFactory; -import org.cogroo.config.Analyzers; -import org.cogroo.text.Document; -import org.cogroo.text.impl.DocumentImpl; -import org.cogroo.text.impl.SentenceImpl; -import org.cogroo.text.impl.TokenImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Tokenizer and sentence splitter using CoGrOO. - */ -@ResourceMetaData(name="CoGrOO Named Entity Recognizer") -@LanguageCapability("pt") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) -public class CogrooNamedEntityRecognizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - private CasConfigurableProviderBase modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase() { - { - setContextObject(CogrooNamedEntityRecognizer.this); - - setDefault(LOCATION, NOT_REQUIRED); - setOverride(LANGUAGE, language); - } - - @Override - protected Analyzer produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - - String language = props.getProperty(LANGUAGE); - - return ComponentFactory.create(Locale.forLanguageTag(language)).createNameFinder(); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - modelProvider.configure(cas); - - // This is actually quite some overhead, because internally Cogroo is just using the - // OpenNLP namefinder which simply takes a string array and returns and arrays of spans... - // It would be much more efficient to use the model directly. - - // Convert from UIMA to Cogroo model - Document doc = new DocumentImpl(); - doc.setText(aJCas.getDocumentText()); - List sentences = new ArrayList(); - for (Sentence sentence : select(aJCas, Sentence.class)) { - org.cogroo.text.Sentence s = new SentenceImpl(sentence.getBegin(), sentence.getEnd(), doc); - List tokens = new ArrayList(); - for (Token token : selectCovered(Token.class, sentence)) { - tokens.add(new TokenImpl(token.getBegin() - sentence.getBegin(), - token.getEnd() - sentence.getBegin(), token.getCoveredText())); - } - s.setTokens(tokens); - sentences.add(s); - } - doc.setSentences(sentences); - - // Process - modelProvider.getResource().analyze(doc); - - // Convert from Cogroo to UIMA model - for (org.cogroo.text.Sentence s : doc.getSentences()) { - for (org.cogroo.text.Token t : s.getTokens()) { - if ("P".equals(t.getAdditionalContext(Analyzers.NAME_FINDER))) { - NamedEntity ne = new NamedEntity(aJCas, s.getStart() + t.getStart(), - s.getStart() + t.getEnd()); - ne.setValue("P"); - ne.addToIndexes(); - } - } - } - } -} diff --git a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooPosTagger.java b/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooPosTagger.java deleted file mode 100644 index dc46e8d73c..0000000000 --- a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooPosTagger.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Locale; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.cogroo.analyzer.Analyzer; -import org.cogroo.analyzer.ComponentFactory; -import org.cogroo.text.Document; -import org.cogroo.text.impl.DocumentImpl; -import org.cogroo.text.impl.SentenceImpl; -import org.cogroo.text.impl.TokenImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * POS-tagger using CoGrOO. - */ -@ResourceMetaData(name="CoGrOO POS-Tagger") -@LanguageCapability("pt") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class CogrooPosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - private CasConfigurableProviderBase modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase() { - { - setContextObject(CogrooPosTagger.this); - - setDefault(LOCATION, NOT_REQUIRED); - setOverride(LANGUAGE, language); - } - - @Override - protected Analyzer produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - - String language = props.getProperty(LANGUAGE); - - if (!"pt".equals(language)) { - throw new IOException("The language code '" + language - + "' is not supported by LanguageTool."); - } - - ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR")); - return factory.createPOSTagger(); - } - }; - - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - "bosque", language); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - modelProvider.configure(cas); - mappingProvider.configure(cas); - - // This is actually quite some overhead, because internally Cogroo is just using a - // Morphlogik dictionary which simply takes a token and pos tag and returnes a list of - // lemmata. It would be much more efficient to use the dictionary directly. - - for (Sentence sentence : select(aJCas, Sentence.class)) { - // We set up one CoGrOO document for each sentence. That makes it easier to maintain - // a list of tokens of the sentence, which we later need to attached the lemmata to the - // tokens. - - // Construct the document - Document doc = new DocumentImpl(); - doc.setText(aJCas.getDocumentText()); - - // Extract the sentence and its tokens - org.cogroo.text.Sentence cSent = new SentenceImpl(sentence.getBegin(), sentence.getEnd(), doc); - List cTokens = new ArrayList(); - List dTokens = selectCovered(Token.class, sentence); - for (Token dTok : dTokens) { - TokenImpl cTok = new TokenImpl(dTok.getBegin() - sentence.getBegin(), - dTok.getEnd() - sentence.getBegin(), dTok.getText()); - cTokens.add(cTok); - } - cSent.setTokens(cTokens); - doc.setSentences(asList(cSent)); - - // Process - modelProvider.getResource().analyze(doc); - - assert cSent.getTokens().size() == dTokens.size(); - - // Convert from CoGrOO to UIMA model - Iterator dTokIt = dTokens.iterator(); - for (org.cogroo.text.Token cTok : cSent.getTokens()) { - // CoGrOO allows storing multiple lemmas per token. DKPro Core only allows one lemma - // per token. We just take the first one here. If we would run the grammar - // checking based on the DKPro Core lemmata, we might miss certain errors for this - // reason. - Token dTok = dTokIt.next(); - - Type posTag = mappingProvider.getTagType(cTok.getPOSTag()); - POS posAnno = (POS) cas.createAnnotation(posTag, cSent.getStart() + cTok.getStart(), - cSent.getStart() + cTok.getEnd()); - posAnno.setPosValue(internTags ? cTok.getPOSTag().intern() : cTok.getPOSTag()); - POSUtils.assignCoarseValue(posAnno); - posAnno.addToIndexes(); - dTok.setPos(posAnno); - } - } - } -} diff --git a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooSegmenter.java b/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooSegmenter.java deleted file mode 100644 index f775406307..0000000000 --- a/dkpro-core-cogroo-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooSegmenter.java +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import java.io.IOException; -import java.net.URL; -import java.util.Locale; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.cogroo.analyzer.Analyzer; -import org.cogroo.analyzer.ComponentFactory; -import org.cogroo.text.Document; -import org.cogroo.text.Sentence; -import org.cogroo.text.Token; -import org.cogroo.text.impl.DocumentImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; - -/** - * Tokenizer and sentence splitter using CoGrOO. - */ -@ResourceMetaData(name="CoGrOO Segmenter") -@LanguageCapability("pt") -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class CogrooSegmenter - extends SegmenterBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - private CasConfigurableProviderBase sentenceModelProvider; - private CasConfigurableProviderBase tokenModelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - sentenceModelProvider = new ModelProviderBase() { - { - setContextObject(CogrooSegmenter.this); - - setDefault(LOCATION, NOT_REQUIRED); - setOverride(LANGUAGE, language); - } - - @Override - protected Analyzer produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - String language = props.getProperty(LANGUAGE); - - return ComponentFactory.create(Locale.forLanguageTag(language)) - .createSentenceDetector(); - } - }; - - tokenModelProvider = new ModelProviderBase() { - { - setContextObject(CogrooSegmenter.this); - - setDefault(LOCATION, NOT_REQUIRED); - setOverride(LANGUAGE, language); - } - - @Override - protected Analyzer produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - String language = props.getProperty(LANGUAGE); - - return ComponentFactory.create(Locale.forLanguageTag(language)) - .createTokenizer(); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - sentenceModelProvider.configure(cas); - tokenModelProvider.configure(cas); - - super.process(aJCas); - } - - @Override - protected void process(JCas aJCas, String aText, int aZoneBegin) - throws AnalysisEngineProcessException - { - Document doc = new DocumentImpl(); - doc.setText(aText); - - sentenceModelProvider.getResource().analyze(doc); - tokenModelProvider.getResource().analyze(doc); - - for (Sentence s : doc.getSentences()) { - createSentence(aJCas, s.getStart() + aZoneBegin, s.getEnd() + aZoneBegin); - for (Token t : s.getTokens()) { - createToken(aJCas, t.getStart() + s.getStart() + aZoneBegin, - t.getEnd() + s.getStart() + aZoneBegin); - } - } - } -} diff --git a/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooChecker.java b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooChecker.java new file mode 100644 index 0000000000..998dac593c --- /dev/null +++ b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooChecker.java @@ -0,0 +1,141 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import java.io.IOException; +import java.net.URL; +import java.util.Locale; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.cogroo.analyzer.ComponentFactory; +import org.cogroo.checker.CheckDocument; +import org.cogroo.checker.GrammarChecker; +import org.cogroo.entities.Mistake; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ModelProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Detect grammatical errors in text using CoGrOO. + */ +@Component(OperationType.GRAMMAR_CHECKER) +@ResourceMetaData(name = "CoGrOO Grammar Checker") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("pt") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly" }) +public class CogrooChecker + extends JCasAnnotator_ImplBase +{ + public static enum DetailLevel { + SHORT, LONG, FULL + } + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + /** + * Set detail level. + */ + public static final String PARAM_DETAIL_LEVEL = "detailLevel"; + @ConfigurationParameter(name = PARAM_DETAIL_LEVEL, mandatory = true, defaultValue = "SHORT") + private DetailLevel detailLevel; + + private ModelProviderBase modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase() { + { + setContextObject(CogrooChecker.this); + setDefault(LOCATION, NOT_REQUIRED); + + setOverride(LANGUAGE, language); + } + + @Override + protected GrammarChecker produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + if (!"pt".equals(props.getProperty(LANGUAGE))) { + throw new IOException("The language code '" + + props.getProperty(LANGUAGE) + "' is not supported by LanguageTool."); + } + + ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR")); + return new GrammarChecker(factory.createPipe()); + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + modelProvider.configure(aJCas.getCas()); + + // get document text + CheckDocument document = new CheckDocument(aJCas.getDocumentText()); + + modelProvider.getResource().analyze(document); + for (Mistake match : document.getMistakes()) { + // create annotation + GrammarAnomaly annotation = new GrammarAnomaly(aJCas); + annotation.setBegin(match.getStart()); + annotation.setEnd(match.getEnd()); + switch (detailLevel) { + case SHORT: + annotation.setDescription(match.getShortMessage()); + break; + case LONG: + annotation.setDescription(match.getLongMessage()); + break; + case FULL: + annotation.setDescription(match.getFullMessage()); + break; + } + annotation.addToIndexes(); + if (getLogger().isTraceEnabled()) { + getLogger().trace("Found: " + annotation); + } + } + } +} diff --git a/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooFeaturizer.java b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooFeaturizer.java new file mode 100644 index 0000000000..21183eb5de --- /dev/null +++ b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooFeaturizer.java @@ -0,0 +1,171 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.cogroo.analyzer.Analyzer; +import org.cogroo.analyzer.ComponentFactory; +import org.cogroo.text.Document; +import org.cogroo.text.impl.DocumentImpl; +import org.cogroo.text.impl.SentenceImpl; +import org.cogroo.text.impl.TokenImpl; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Morphological analyzer using CoGrOO. + */ +@Component(OperationType.MORPHOLOGICAL_TAGGER) +@ResourceMetaData(name = "CoGrOO Morphological Analyzer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("pt") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures" }) + +public class CogrooFeaturizer + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + private CasConfigurableProviderBase modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase() { + { + setContextObject(CogrooFeaturizer.this); + + setDefault(LOCATION, NOT_REQUIRED); + setOverride(LANGUAGE, language); + } + + @Override + protected Analyzer produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + + String language = props.getProperty(LANGUAGE); + + if (!"pt".equals(language)) { + throw new IOException("The language code '" + language + + "' is not supported by LanguageTool."); + } + + ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR")); + + return factory.createFeaturizer(); + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + modelProvider.configure(cas); + + // This is actually quite some overhead, because internally Cogroo is just using a + // OpenNLP classifier which simply takes a token and pos tag and returnes a list of + // features. It would be much more efficient to use the classifier directly. + + for (Sentence sentence : select(aJCas, Sentence.class)) { + // We set up one CoGrOO document for each sentence. That makes it easier to maintain + // a list of tokens of the sentence, which we later need to attached the lemmata to the + // tokens. + + // Construct the document + Document doc = new DocumentImpl(); + doc.setText(aJCas.getDocumentText()); + + // Extract the sentence and its tokens + org.cogroo.text.Sentence cSent = new SentenceImpl(sentence.getBegin(), + sentence.getEnd(), doc); + List cTokens = new ArrayList(); + List dTokens = selectCovered(Token.class, sentence); + for (Token dTok : dTokens) { + TokenImpl cTok = new TokenImpl(dTok.getBegin() - sentence.getBegin(), + dTok.getEnd() - sentence.getBegin(), dTok.getText()); + cTok.setPOSTag(dTok.getPos().getPosValue()); + cTokens.add(cTok); + } + cSent.setTokens(cTokens); + doc.setSentences(asList(cSent)); + + // Process + modelProvider.getResource().analyze(doc); + + assert cSent.getTokens().size() == dTokens.size(); + + // Convert from CoGrOO to UIMA model + Iterator dTokIt = dTokens.iterator(); + for (org.cogroo.text.Token cTok : cSent.getTokens()) { + Token dTok = dTokIt.next(); + MorphologicalFeatures m = new MorphologicalFeatures(aJCas, cSent.getStart() + + cTok.getStart(), cSent.getStart() + cTok.getEnd()); + m.setValue(cTok.getFeatures()); + m.addToIndexes(); + dTok.setMorph(m); + } + } + } +} diff --git a/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooLemmatizer.java b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooLemmatizer.java new file mode 100644 index 0000000000..f63e1fa81f --- /dev/null +++ b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooLemmatizer.java @@ -0,0 +1,185 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.cogroo.analyzer.Analyzer; +import org.cogroo.analyzer.ComponentFactory; +import org.cogroo.text.Document; +import org.cogroo.text.impl.DocumentImpl; +import org.cogroo.text.impl.SentenceImpl; +import org.cogroo.text.impl.TokenImpl; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Lemmatizer using CoGrOO. + */ +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "CoGrOO Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("pt") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) + +public class CogrooLemmatizer + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + private CasConfigurableProviderBase modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase() { + { + setContextObject(CogrooLemmatizer.this); + + setDefault(LOCATION, NOT_REQUIRED); + setOverride(LANGUAGE, language); + } + + @Override + protected Analyzer produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + + String language = props.getProperty(LANGUAGE); + + if (!"pt".equals(language)) { + throw new IOException("The language code '" + language + + "' is not supported by LanguageTool."); + } + + ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR")); + + return factory.createLemmatizer(); + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + modelProvider.configure(cas); + + // This is actually quite some overhead, because internally Cogroo is just using a + // Morphlogik dictionary which simply takes a token and pos tag and returnes a list of + // lemmata. It would be much more efficient to use the dictionary directly. + + for (Sentence sentence : select(aJCas, Sentence.class)) { + // We set up one CoGrOO document for each sentence. That makes it easier to maintain + // a list of tokens of the sentence, which we later need to attached the lemmata to the + // tokens. + + // Construct the document + Document doc = new DocumentImpl(); + doc.setText(aJCas.getDocumentText()); + + // Extract the sentence and its tokens + org.cogroo.text.Sentence cSent = new SentenceImpl(sentence.getBegin(), + sentence.getEnd(), doc); + List cTokens = new ArrayList(); + List dTokens = selectCovered(Token.class, sentence); + for (Token dTok : dTokens) { + TokenImpl cTok = new TokenImpl(dTok.getBegin() - sentence.getBegin(), + dTok.getEnd() - sentence.getBegin(), dTok.getText()); + cTok.setPOSTag(dTok.getPos().getPosValue()); + cTokens.add(cTok); + } + cSent.setTokens(cTokens); + doc.setSentences(asList(cSent)); + + // Process + modelProvider.getResource().analyze(doc); + + assert cSent.getTokens().size() == dTokens.size(); + + // Convert from CoGrOO to UIMA model + Iterator dTokIt = dTokens.iterator(); + for (org.cogroo.text.Token cTok : cSent.getTokens()) { + // CoGrOO allows storing multiple lemmas per token. DKPro Core only allows one lemma + // per token. We just take the first one here. If we would run the grammar + // checking based on the DKPro Core lemmata, we might miss certain errors for this + // reason. + Token dTok = dTokIt.next(); + String[] lemmas = cTok.getLemmas(); + Lemma l = new Lemma(aJCas, cSent.getStart() + cTok.getStart(), + cSent.getStart() + cTok.getEnd()); + if (lemmas != null && lemmas.length > 0) { + String lemmaString = lemmas[0]; + if (lemmaString == null) { + lemmaString = dTok.getText(); + } + l.setValue(lemmaString); + } + else { + l.setValue(cTok.getLexeme()); + } + l.addToIndexes(); + dTok.setLemma(l); + } + } + } +} diff --git a/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooNamedEntityRecognizer.java b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooNamedEntityRecognizer.java new file mode 100644 index 0000000000..989aebccce --- /dev/null +++ b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooNamedEntityRecognizer.java @@ -0,0 +1,153 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.cogroo.analyzer.Analyzer; +import org.cogroo.analyzer.ComponentFactory; +import org.cogroo.config.Analyzers; +import org.cogroo.text.Document; +import org.cogroo.text.impl.DocumentImpl; +import org.cogroo.text.impl.SentenceImpl; +import org.cogroo.text.impl.TokenImpl; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Tokenizer and sentence splitter using CoGrOO. + */ +@Component(OperationType.NAMED_ENTITITY_RECOGNIZER) +@ResourceMetaData(name = "CoGrOO Named Entity Recognizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("pt") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) +public class CogrooNamedEntityRecognizer + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + private CasConfigurableProviderBase modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase() { + { + setContextObject(CogrooNamedEntityRecognizer.this); + + setDefault(LOCATION, NOT_REQUIRED); + setOverride(LANGUAGE, language); + } + + @Override + protected Analyzer produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + + String language = props.getProperty(LANGUAGE); + + return ComponentFactory.create(Locale.forLanguageTag(language)).createNameFinder(); + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + modelProvider.configure(cas); + + // This is actually quite some overhead, because internally Cogroo is just using the + // OpenNLP namefinder which simply takes a string array and returns and arrays of spans... + // It would be much more efficient to use the model directly. + + // Convert from UIMA to Cogroo model + Document doc = new DocumentImpl(); + doc.setText(aJCas.getDocumentText()); + List sentences = new ArrayList(); + for (Sentence sentence : select(aJCas, Sentence.class)) { + org.cogroo.text.Sentence s = new SentenceImpl(sentence.getBegin(), sentence.getEnd(), + doc); + List tokens = new ArrayList(); + for (Token token : selectCovered(Token.class, sentence)) { + tokens.add(new TokenImpl(token.getBegin() - sentence.getBegin(), + token.getEnd() - sentence.getBegin(), token.getCoveredText())); + } + s.setTokens(tokens); + sentences.add(s); + } + doc.setSentences(sentences); + + // Process + modelProvider.getResource().analyze(doc); + + // Convert from Cogroo to UIMA model + for (org.cogroo.text.Sentence s : doc.getSentences()) { + for (org.cogroo.text.Token t : s.getTokens()) { + if ("P".equals(t.getAdditionalContext(Analyzers.NAME_FINDER))) { + NamedEntity ne = new NamedEntity(aJCas, s.getStart() + t.getStart(), + s.getStart() + t.getEnd()); + ne.setValue("P"); + ne.addToIndexes(); + } + } + } + } +} diff --git a/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooPosTagger.java b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooPosTagger.java new file mode 100644 index 0000000000..cd40f172fd --- /dev/null +++ b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooPosTagger.java @@ -0,0 +1,199 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.cogroo.analyzer.Analyzer; +import org.cogroo.analyzer.ComponentFactory; +import org.cogroo.text.Document; +import org.cogroo.text.impl.DocumentImpl; +import org.cogroo.text.impl.SentenceImpl; +import org.cogroo.text.impl.TokenImpl; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * POS-tagger using CoGrOO. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "CoGrOO POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("pt") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) +public class CogrooPosTagger + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + private CasConfigurableProviderBase modelProvider; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase() { + { + setContextObject(CogrooPosTagger.this); + + setDefault(LOCATION, NOT_REQUIRED); + setOverride(LANGUAGE, language); + } + + @Override + protected Analyzer produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + + String language = props.getProperty(LANGUAGE); + + if (!"pt".equals(language)) { + throw new IOException("The language code '" + language + + "' is not supported by LanguageTool."); + } + + ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR")); + return factory.createPOSTagger(); + } + }; + + mappingProvider = MappingProviderFactory.createPosMappingProvider(this, posMappingLocation, + "bosque", language); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + modelProvider.configure(cas); + mappingProvider.configure(cas); + + // This is actually quite some overhead, because internally Cogroo is just using a + // Morphlogik dictionary which simply takes a token and pos tag and returnes a list of + // lemmata. It would be much more efficient to use the dictionary directly. + + for (Sentence sentence : select(aJCas, Sentence.class)) { + // We set up one CoGrOO document for each sentence. That makes it easier to maintain + // a list of tokens of the sentence, which we later need to attached the lemmata to the + // tokens. + + // Construct the document + Document doc = new DocumentImpl(); + doc.setText(aJCas.getDocumentText()); + + // Extract the sentence and its tokens + org.cogroo.text.Sentence cSent = new SentenceImpl(sentence.getBegin(), + sentence.getEnd(), doc); + List cTokens = new ArrayList(); + List dTokens = selectCovered(Token.class, sentence); + for (Token dTok : dTokens) { + TokenImpl cTok = new TokenImpl(dTok.getBegin() - sentence.getBegin(), + dTok.getEnd() - sentence.getBegin(), dTok.getText()); + cTokens.add(cTok); + } + cSent.setTokens(cTokens); + doc.setSentences(asList(cSent)); + + // Process + modelProvider.getResource().analyze(doc); + + assert cSent.getTokens().size() == dTokens.size(); + + // Convert from CoGrOO to UIMA model + Iterator dTokIt = dTokens.iterator(); + for (org.cogroo.text.Token cTok : cSent.getTokens()) { + // CoGrOO allows storing multiple lemmas per token. DKPro Core only allows one lemma + // per token. We just take the first one here. If we would run the grammar + // checking based on the DKPro Core lemmata, we might miss certain errors for this + // reason. + Token dTok = dTokIt.next(); + + Type posTag = mappingProvider.getTagType(cTok.getPOSTag()); + POS posAnno = (POS) cas.createAnnotation(posTag, cSent.getStart() + cTok.getStart(), + cSent.getStart() + cTok.getEnd()); + String tag = cTok.getPOSTag(); + posAnno.setPosValue(tag != null ? tag.intern() : null); + POSUtils.assignCoarseValue(posAnno); + posAnno.addToIndexes(); + dTok.setPos(posAnno); + } + } + } +} diff --git a/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooSegmenter.java b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooSegmenter.java new file mode 100644 index 0000000000..8ce97e6503 --- /dev/null +++ b/dkpro-core-cogroo-asl/src/main/java/org/dkpro/core/cogroo/CogrooSegmenter.java @@ -0,0 +1,146 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import java.io.IOException; +import java.net.URL; +import java.util.Locale; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.cogroo.analyzer.Analyzer; +import org.cogroo.analyzer.ComponentFactory; +import org.cogroo.text.Document; +import org.cogroo.text.Sentence; +import org.cogroo.text.Token; +import org.cogroo.text.impl.DocumentImpl; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Tokenizer and sentence splitter using CoGrOO. + */ +@ResourceMetaData(name = "CoGrOO Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("pt") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) +public class CogrooSegmenter + extends SegmenterBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + private CasConfigurableProviderBase sentenceModelProvider; + private CasConfigurableProviderBase tokenModelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + sentenceModelProvider = new ModelProviderBase() { + { + setContextObject(CogrooSegmenter.this); + + setDefault(LOCATION, NOT_REQUIRED); + setOverride(LANGUAGE, language); + } + + @Override + protected Analyzer produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + String language = props.getProperty(LANGUAGE); + + return ComponentFactory.create(Locale.forLanguageTag(language)) + .createSentenceDetector(); + } + }; + + tokenModelProvider = new ModelProviderBase() { + { + setContextObject(CogrooSegmenter.this); + + setDefault(LOCATION, NOT_REQUIRED); + setOverride(LANGUAGE, language); + } + + @Override + protected Analyzer produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + String language = props.getProperty(LANGUAGE); + + return ComponentFactory.create(Locale.forLanguageTag(language)) + .createTokenizer(); + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + sentenceModelProvider.configure(cas); + tokenModelProvider.configure(cas); + + super.process(aJCas); + } + + @Override + protected void process(JCas aJCas, String aText, int aZoneBegin) + throws AnalysisEngineProcessException + { + Document doc = new DocumentImpl(); + doc.setText(aText); + + sentenceModelProvider.getResource().analyze(doc); + tokenModelProvider.getResource().analyze(doc); + + for (Sentence s : doc.getSentences()) { + createSentence(aJCas, s.getStart() + aZoneBegin, s.getEnd() + aZoneBegin); + for (Token t : s.getTokens()) { + createToken(aJCas, t.getStart() + s.getStart() + aZoneBegin, + t.getEnd() + s.getStart() + aZoneBegin); + } + } + } +} diff --git a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooCheckerTest.java b/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooCheckerTest.java deleted file mode 100644 index ec43eac5c3..0000000000 --- a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooCheckerTest.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class CogrooCheckerTest -{ - @Test - public void test() - throws Exception - { - JCas jcas = runTest("pt", - "Fomos levados à crer que os menino são burro de doer. As menina chegaram."); - - String[] anomalies = { - "[ 14, 15] GrammarAnomaly (Não acontece crase antes de verbo.)", - "[ 25, 34] GrammarAnomaly (Os artigos concordam com o substantivo a que se referem.)", - "[ 54, 63] GrammarAnomaly (Os artigos concordam com o substantivo a que se referem.)", - "[ 64, 72] GrammarAnomaly (Verificou-se erro de concordância entre o sujeito e o verbo.)" }; - - assertAnomaly(anomalies, select(jcas, GrammarAnomaly.class)); - } - - private JCas runTest(String aLanguage, String aText) - throws UIMAException - { - AnalysisEngineDescription checker = createEngineDescription(CogrooChecker.class); - - return TestRunner.runTest(checker, aLanguage, aText); - } -} diff --git a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooFeaturizerTest.java b/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooFeaturizerTest.java deleted file mode 100644 index 1230476947..0000000000 --- a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooFeaturizerTest.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class CogrooFeaturizerTest -{ - @Test - public void testPortuguese() - throws Exception - { - runTest("pt", "Este é um teste . ", new String[] { - "[ 0, 4] - - - - - - - - - - - - - - - - - Este (M=S)", - "[ 5, 6] - - - - - - - - - - - - - - - - - é (PR=3S=IND)", - "[ 7, 9] - - - - - - - - - - - - - - - - - um (M=S)", - "[ 10, 15] - - - - - - - - - - - - - - - - - teste (M=S)", - "[ 16, 17] - - - - - - - - - - - - - - - - - . (-)" }); - - runTest("pt", "Uma rede neural .", new String[] { - "[ 0, 3] - - - - - - - - - - - - - - - - - Uma (F=S)", - "[ 4, 8] - - - - - - - - - - - - - - - - - rede (F=S)", - "[ 9, 15] - - - - - - - - - - - - - - - - - neural (F=S)", - "[ 16, 17] - - - - - - - - - - - - - - - - - . (-)" }); - - runTest("pt", "John está comprando laranjas .", new String[] { - "[ 0, 4] - - - - - - - - - - - - - - - - - John (M=S)", - "[ 5, 9] - - - - - - - - - - - - - - - - - está (PR=3S=IND)", - "[ 10, 19] - - - - - - - - - - - - - - - - - comprando (-)", - "[ 20, 28] - - - - - - - - - - - - - - - - - laranjas (F=P)", - "[ 29, 30] - - - - - - - - - - - - - - - - - . (-)" }); - } - - private void runTest(String language, String testDocument, String[] aFeatures) - throws Exception - { - AnalysisEngineDescription engine = createEngineDescription( - createEngineDescription(CogrooPosTagger.class), - createEngineDescription(CogrooFeaturizer.class)); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertMorph(aFeatures, select(jcas, MorphologicalFeatures.class)); - } - - @Rule - public TestName name = new TestName(); - - @Before - public void printSeparator() - { - System.out.println("\n=== " + name.getMethodName() + " ====================="); - } -} diff --git a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooLemmatizerTest.java b/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooLemmatizerTest.java deleted file mode 100644 index c9fb05bc17..0000000000 --- a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooLemmatizerTest.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class CogrooLemmatizerTest -{ - @Test - public void testPortuguese() - throws Exception - { - runTest("pt", "Este é um teste . ", - new String[] { "este", "ser", "um", "teste", "." }); - - runTest("pt", "Uma rede neural .", - new String[] { "um", "rede", "neural", "." }); - - runTest("pt", "John está comprando laranjas .", - new String[] { "John", "estar", "comprar", "laranja", "." }); - } - - private void runTest(String language, String testDocument, String[] aLemma) - throws Exception - { - AnalysisEngineDescription engine = createEngineDescription( - createEngineDescription(CogrooPosTagger.class), - createEngineDescription(CogrooLemmatizer.class)); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertLemma(aLemma, select(jcas, Lemma.class)); - } - - @Rule - public TestName name = new TestName(); - - @Before - public void printSeparator() - { - System.out.println("\n=== " + name.getMethodName() + " ====================="); - } -} diff --git a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooPosTaggerTest.java b/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooPosTaggerTest.java deleted file mode 100644 index 4819357d2d..0000000000 --- a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooPosTaggerTest.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class CogrooPosTaggerTest -{ - @Test - public void testPortuguese() - throws Exception - { - JCas jcas = runTest("pt", null, "Este é um teste .", - new String[] { "pron-det", "v-fin", "art", "n", "." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS" }); - -// String[] posTags = new String[] { "?", "adj", "adv", "art", "conj-c", "conj-s", "ec", "in", -// "n", "num", "pp", "pron-det", "pron-indp", "pron-pers", "prop", "prp", "punc", -// "v-fin", "v-ger", "v-inf", "v-pcp", "vp" }; -// -// AssertAnnotations.assertTagset(POS.class, "bosque", posTags, jcas); - } - - private JCas runTest(String language, String variant, String testDocument, String[] tags, - String[] tagClasses) - throws Exception - { - AnalysisEngine engine = createEngine(CogrooPosTagger.class); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); - - return jcas; - } - - @Rule - public TestName name = new TestName(); - - @Before - public void printSeparator() - { - System.out.println("\n=== " + name.getMethodName() + " ====================="); - } -} diff --git a/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooCheckerTest.java b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooCheckerTest.java new file mode 100644 index 0000000000..f313c82143 --- /dev/null +++ b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooCheckerTest.java @@ -0,0 +1,59 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertAnomaly; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.cogroo.CogrooChecker; +import org.dkpro.core.testing.TestRunner; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly; + +public class CogrooCheckerTest +{ + @Test + public void test() + throws Exception + { + JCas jcas = runTest("pt", + "Fomos levados à crer que os menino são burro de doer. As menina chegaram."); + + String[] anomalies = { + "[ 14, 15] GrammarAnomaly (Não acontece crase antes de verbo.)", + "[ 25, 34] GrammarAnomaly (Os artigos concordam com o substantivo a que se referem.)", + "[ 54, 63] GrammarAnomaly (Os artigos concordam com o substantivo a que se referem.)", + "[ 64, 72] GrammarAnomaly (Verificou-se erro de concordância entre o sujeito e o verbo.)" + }; + + assertAnomaly(anomalies, select(jcas, GrammarAnomaly.class)); + } + + private JCas runTest(String aLanguage, String aText) + throws UIMAException + { + AnalysisEngineDescription checker = createEngineDescription(CogrooChecker.class); + + return TestRunner.runTest(checker, aLanguage, aText); + } +} diff --git a/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooFeaturizerTest.java b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooFeaturizerTest.java new file mode 100644 index 0000000000..661c6fe539 --- /dev/null +++ b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooFeaturizerTest.java @@ -0,0 +1,84 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.cogroo.CogrooFeaturizer; +import org.dkpro.core.cogroo.CogrooPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.TestRunner; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; + +public class CogrooFeaturizerTest +{ + @Test + public void testPortuguese() throws Exception + { + runTest("pt", "Este é um teste . ", new String[] { + "[ 0, 4] - - - - - - - - - - - - - - - - - Este (M=S)", + "[ 5, 6] - - - - - - - - - - - - - - - - - é (PR=3S=IND)", + "[ 7, 9] - - - - - - - - - - - - - - - - - um (M=S)", + "[ 10, 15] - - - - - - - - - - - - - - - - - teste (M=S)", + "[ 16, 17] - - - - - - - - - - - - - - - - - . (-)" + }); + + runTest("pt", "Uma rede neural .", new String[] { + "[ 0, 3] - - - - - - - - - - - - - - - - - Uma (F=S)", + "[ 4, 8] - - - - - - - - - - - - - - - - - rede (F=S)", + "[ 9, 15] - - - - - - - - - - - - - - - - - neural (F=S)", + "[ 16, 17] - - - - - - - - - - - - - - - - - . (-)" + }); + + runTest("pt", "John está comprando laranjas .", new String[] { + "[ 0, 4] - - - - - - - - - - - - - - - - - John (M=S)", + "[ 5, 9] - - - - - - - - - - - - - - - - - está (PR=3S=IND)", + "[ 10, 19] - - - - - - - - - - - - - - - - - comprando (-)", + "[ 20, 28] - - - - - - - - - - - - - - - - - laranjas (F=P)", + "[ 29, 30] - - - - - - - - - - - - - - - - - . (-)" + }); + } + + private void runTest(String language, String testDocument, String[] aFeatures) throws Exception + { + AnalysisEngineDescription engine = createEngineDescription( + createEngineDescription(CogrooPosTagger.class), + createEngineDescription(CogrooFeaturizer.class)); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertMorph(aFeatures, select(jcas, MorphologicalFeatures.class)); + } + + @Rule + public TestName name = new TestName(); + + @Before + public void printSeparator() + { + System.out.println("\n=== " + name.getMethodName() + " ====================="); + } +} diff --git a/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooLemmatizerTest.java b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooLemmatizerTest.java new file mode 100644 index 0000000000..cc29ce44b3 --- /dev/null +++ b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooLemmatizerTest.java @@ -0,0 +1,68 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.cogroo.CogrooLemmatizer; +import org.dkpro.core.cogroo.CogrooPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.TestRunner; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; + +public class CogrooLemmatizerTest +{ + @Test + public void testPortuguese() throws Exception + { + runTest("pt", "Este é um teste . ", new String[] { "este", "ser", "um", "teste", "." }); + + runTest("pt", "Uma rede neural .", new String[] { "um", "rede", "neural", "." }); + + runTest("pt", "John está comprando laranjas .", + new String[] { "John", "estar", "comprar", "laranja", "." }); + } + + private void runTest(String language, String testDocument, String[] aLemma) throws Exception + { + AnalysisEngineDescription engine = createEngineDescription( + createEngineDescription(CogrooPosTagger.class), + createEngineDescription(CogrooLemmatizer.class)); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertLemma(aLemma, select(jcas, Lemma.class)); + } + + @Rule + public TestName name = new TestName(); + + @Before + public void printSeparator() + { + System.out.println("\n=== " + name.getMethodName() + " ====================="); + } +} diff --git a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooNamedEntityRecognizerTest.java b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooNamedEntityRecognizerTest.java similarity index 90% rename from dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooNamedEntityRecognizerTest.java rename to dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooNamedEntityRecognizerTest.java index 9e119babdb..25b6bc0d9c 100644 --- a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooNamedEntityRecognizerTest.java +++ b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooNamedEntityRecognizerTest.java @@ -15,18 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; +package org.dkpro.core.cogroo; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertNamedEntity; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertNamedEntity; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.cogroo.CogrooNamedEntityRecognizer; +import org.dkpro.core.testing.TestRunner; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class CogrooNamedEntityRecognizerTest { diff --git a/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooPosTaggerTest.java b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooPosTaggerTest.java new file mode 100644 index 0000000000..2c9239bc5f --- /dev/null +++ b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooPosTaggerTest.java @@ -0,0 +1,73 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.cogroo; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.cogroo.CogrooPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.TestRunner; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; + +public class CogrooPosTaggerTest +{ + @Test + public void testPortuguese() + throws Exception + { + JCas jcas = runTest("pt", null, "Este é um teste .", + new String[] { "pron-det", "v-fin", "art", "n", "." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS" }); + +// String[] posTags = new String[] { "?", "adj", "adv", "art", "conj-c", "conj-s", "ec", "in", +// "n", "num", "pp", "pron-det", "pron-indp", "pron-pers", "prop", "prp", "punc", +// "v-fin", "v-ger", "v-inf", "v-pcp", "vp" }; +// +// AssertAnnotations.assertTagset(POS.class, "bosque", posTags, jcas); + } + + private JCas runTest(String language, String variant, String testDocument, String[] tags, + String[] tagClasses) + throws Exception + { + AnalysisEngine engine = createEngine(CogrooPosTagger.class); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); + + return jcas; + } + + @Rule + public TestName name = new TestName(); + + @Before + public void printSeparator() + { + System.out.println("\n=== " + name.getMethodName() + " ====================="); + } +} diff --git a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooSegmenterTest.java b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooSegmenterTest.java similarity index 87% rename from dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooSegmenterTest.java rename to dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooSegmenterTest.java index 3a867b98b9..7d7bf30319 100644 --- a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/CogrooSegmenterTest.java +++ b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/CogrooSegmenterTest.java @@ -15,20 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; +package org.dkpro.core.cogroo; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.cogroo.CogrooSegmenter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; public class CogrooSegmenterTest { diff --git a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/SimpleTest.java b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/SimpleTest.java similarity index 97% rename from dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/SimpleTest.java rename to dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/SimpleTest.java index aaaa5f5f6a..5e458e4b45 100644 --- a/dkpro-core-cogroo-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/cogroo/SimpleTest.java +++ b/dkpro-core-cogroo-asl/src/test/java/org/dkpro/core/cogroo/SimpleTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.cogroo; +package org.dkpro.core.cogroo; import java.util.Locale; diff --git a/dkpro-core-commonscodec-asl/pom.xml b/dkpro-core-commonscodec-asl/pom.xml index 3cc595c47b..917ff4e5c3 100644 --- a/dkpro-core-commonscodec-asl/pom.xml +++ b/dkpro-core-commonscodec-asl/pom.xml @@ -1,57 +1,61 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.commonscodec-asl - jar - DKPro Core ASL - Apache Commons Codec - - - commons-codec - commons-codec - - - org.apache.uima - uimaj-core - - - junit - junit - test - - - org.apache.uima - uimafit-core - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.phonetics-asl - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl - - - + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-commonscodec-asl + jar + DKPro Core ASL - Apache Commons Codec + https://dkpro.github.io/dkpro-core/ + + + commons-codec + commons-codec + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.dkpro.core + dkpro-core-api-phonetics-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + junit + junit + test + + \ No newline at end of file diff --git a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/ColognePhoneticTranscriptor.java b/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/ColognePhoneticTranscriptor.java deleted file mode 100644 index e75522dce6..0000000000 --- a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/ColognePhoneticTranscriptor.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import org.apache.commons.codec.language.ColognePhonetic; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; - -/** - * Cologne phonetic (Kölner Phonetik) transcription based on Apache Commons Codec. - * Works for German. - */ -@ResourceMetaData(name="Commons Codec Cologne Phonetic Transcriptor") -@LanguageCapability("de") -@TypeCapability( - inputs={"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) - -public class ColognePhoneticTranscriptor - extends PhoneticTranscriptor_ImplBase -{ - - public ColognePhoneticTranscriptor() - { - this.encoder = new ColognePhonetic(); - } -} diff --git a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptor.java b/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptor.java deleted file mode 100644 index 4c7b57dbb0..0000000000 --- a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptor.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import org.apache.commons.codec.language.DoubleMetaphone; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; - -/** - * Double-Metaphone phonetic transcription based on Apache Commons Codec. - * Works for English. - */ -@ResourceMetaData(name="Commons Codec Double-Metaphone Phonetic Transcriptor") -@TypeCapability( - inputs={"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) - -public class DoubleMetaphonePhoneticTranscriptor - extends PhoneticTranscriptor_ImplBase -{ - - public DoubleMetaphonePhoneticTranscriptor() - { - this.encoder = new DoubleMetaphone(); - } -} diff --git a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/MetaphonePhoneticTranscriptor.java b/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/MetaphonePhoneticTranscriptor.java deleted file mode 100644 index aa52a67a10..0000000000 --- a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/MetaphonePhoneticTranscriptor.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import org.apache.commons.codec.language.Metaphone; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; - -/** - * Metaphone phonetic transcription based on Apache Commons Codec. - * Works for English. - */ -@ResourceMetaData(name="Commons Codec Metaphone Phonetic Transcriptor") -@TypeCapability( - inputs={"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) - -public class MetaphonePhoneticTranscriptor - extends PhoneticTranscriptor_ImplBase -{ - - public MetaphonePhoneticTranscriptor() - { - this.encoder = new Metaphone(); - } -} diff --git a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/SoundexPhoneticTranscriptor.java b/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/SoundexPhoneticTranscriptor.java deleted file mode 100644 index a4506f1d74..0000000000 --- a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/SoundexPhoneticTranscriptor.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import org.apache.commons.codec.language.Soundex; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; - -/** - * Soundex phonetic transcription based on Apache Commons Codec. - * Works for English. - */ -@ResourceMetaData(name="Commons Codec Soundex Phonetic Transcriptor") -@LanguageCapability("en") -@TypeCapability( - inputs={"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) - -public class SoundexPhoneticTranscriptor - extends PhoneticTranscriptor_ImplBase -{ - public SoundexPhoneticTranscriptor() - { - this.encoder = new Soundex(); - } -} diff --git a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/package-info.java b/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/package-info.java deleted file mode 100644 index dca57bdb3f..0000000000 --- a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Phonetic transcription annotators that wrap the Apache Commons-Codec methods. - * - * @since 1.5.0 - */ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; \ No newline at end of file diff --git a/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/ColognePhoneticTranscriptor.java b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/ColognePhoneticTranscriptor.java new file mode 100644 index 0000000000..0b71c55a6b --- /dev/null +++ b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/ColognePhoneticTranscriptor.java @@ -0,0 +1,46 @@ +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import org.apache.commons.codec.language.ColognePhonetic; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Cologne phonetic (Kölner Phonetik) transcription based on Apache Commons Codec. Works for German. + */ +@ResourceMetaData(name = "Commons Codec Cologne Phonetic Transcriptor") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("de") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription" }) + +public class ColognePhoneticTranscriptor + extends PhoneticTranscriptor_ImplBase +{ + public ColognePhoneticTranscriptor() + { + this.encoder = new ColognePhonetic(); + } +} diff --git a/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptor.java b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptor.java new file mode 100644 index 0000000000..6ce73e2ec9 --- /dev/null +++ b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptor.java @@ -0,0 +1,44 @@ +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Double-Metaphone phonetic transcription based on Apache Commons Codec. + * Works for English. + */ +@ResourceMetaData(name = "Commons Codec Double-Metaphone Phonetic Transcriptor") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = {"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) + +public class DoubleMetaphonePhoneticTranscriptor + extends PhoneticTranscriptor_ImplBase +{ + + public DoubleMetaphonePhoneticTranscriptor() + { + this.encoder = new DoubleMetaphone(); + } +} diff --git a/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/MetaphonePhoneticTranscriptor.java b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/MetaphonePhoneticTranscriptor.java new file mode 100644 index 0000000000..9e04641e59 --- /dev/null +++ b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/MetaphonePhoneticTranscriptor.java @@ -0,0 +1,43 @@ +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import org.apache.commons.codec.language.Metaphone; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Metaphone phonetic transcription based on Apache Commons Codec. + * Works for English. + */ +@ResourceMetaData(name = "Commons Codec Metaphone Phonetic Transcriptor") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = {"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) +public class MetaphonePhoneticTranscriptor + extends PhoneticTranscriptor_ImplBase +{ + + public MetaphonePhoneticTranscriptor() + { + this.encoder = new Metaphone(); + } +} diff --git a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/PhoneticTranscriptor_ImplBase.java b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/PhoneticTranscriptor_ImplBase.java similarity index 86% rename from dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/PhoneticTranscriptor_ImplBase.java rename to dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/PhoneticTranscriptor_ImplBase.java index 8b3a25244c..f49bd9d98c 100644 --- a/dkpro-core-commonscodec-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/PhoneticTranscriptor_ImplBase.java +++ b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/PhoneticTranscriptor_ImplBase.java @@ -1,66 +1,67 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import org.apache.commons.codec.EncoderException; -import org.apache.commons.codec.StringEncoder; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Base class for all kinds of phonetic transcriptors based on Apache Commons Codec. - */ -@TypeCapability( - inputs={"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) -public abstract class PhoneticTranscriptor_ImplBase - extends JCasAnnotator_ImplBase -{ - protected StringEncoder encoder; - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - for (Token token : JCasUtil.select(jcas, Token.class)) { - PhoneticTranscription transcription = new PhoneticTranscription(jcas, token.getBegin(), token.getEnd()); - transcription.setTranscription(encode(token.getCoveredText())); - transcription.setName(encoder.getClass().getName()); - transcription.addToIndexes(); - } - } - - protected String encode(String string) - throws AnalysisEngineProcessException - { - try { - String encodedString = encoder.encode(string); - return encodedString; - - } - catch (EncoderException e) { - throw new AnalysisEngineProcessException(e); - } - } -} +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +/** + * Base class for all kinds of phonetic transcriptors based on Apache Commons Codec. + */ +@TypeCapability( + inputs = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = {"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) +public abstract class PhoneticTranscriptor_ImplBase + extends JCasAnnotator_ImplBase +{ + protected StringEncoder encoder; + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + for (Token token : JCasUtil.select(jcas, Token.class)) { + PhoneticTranscription transcription = new PhoneticTranscription(jcas, token.getBegin(), + token.getEnd()); + transcription.setTranscription(encode(token.getCoveredText())); + transcription.setName(encoder.getClass().getName()); + transcription.addToIndexes(); + } + } + + protected String encode(String string) + throws AnalysisEngineProcessException + { + try { + String encodedString = encoder.encode(string); + return encodedString; + + } + catch (EncoderException e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/SoundexPhoneticTranscriptor.java b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/SoundexPhoneticTranscriptor.java new file mode 100644 index 0000000000..717a1413f5 --- /dev/null +++ b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/SoundexPhoneticTranscriptor.java @@ -0,0 +1,44 @@ +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import org.apache.commons.codec.language.Soundex; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Soundex phonetic transcription based on Apache Commons Codec. + * Works for English. + */ +@ResourceMetaData(name = "Commons Codec Soundex Phonetic Transcriptor") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("en") +@TypeCapability( + inputs = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = {"de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription"}) +public class SoundexPhoneticTranscriptor + extends PhoneticTranscriptor_ImplBase +{ + public SoundexPhoneticTranscriptor() + { + this.encoder = new Soundex(); + } +} diff --git a/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/package-info.java b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/package-info.java new file mode 100644 index 0000000000..7e2fb56782 --- /dev/null +++ b/dkpro-core-commonscodec-asl/src/main/java/org/dkpro/core/commonscodec/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Phonetic transcription annotators that wrap the Apache Commons-Codec methods. + * + * @since 1.5.0 + */ +package org.dkpro.core.commonscodec; diff --git a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/ColognePhoneticTranscriptorTest.java b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/ColognePhoneticTranscriptorTest.java similarity index 86% rename from dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/ColognePhoneticTranscriptorTest.java rename to dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/ColognePhoneticTranscriptorTest.java index fb87448de2..e96735bddf 100644 --- a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/ColognePhoneticTranscriptorTest.java +++ b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/ColognePhoneticTranscriptorTest.java @@ -1,37 +1,38 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import static de.tudarmstadt.ukp.dkpro.core.commonscodec.PhoneticTranscriptorTestUtil.runTest; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; - -import org.junit.Test; - -public class ColognePhoneticTranscriptorTest -{ - - @Test - public void colognePhoneticTest() throws Exception { - - runTest( - createEngineDescription(ColognePhoneticTranscriptor.class), - "Man sagt die Ente ist das Ende vom Mann .", - "66", "842", "2", "062", "082", "28", "062", "36", "66", "" - ); - } -} +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.commonscodec.PhoneticTranscriptorTestUtil.runTest; + +import org.dkpro.core.commonscodec.ColognePhoneticTranscriptor; +import org.junit.Test; + +public class ColognePhoneticTranscriptorTest +{ + + @Test + public void colognePhoneticTest() throws Exception { + + runTest( + createEngineDescription(ColognePhoneticTranscriptor.class), + "Man sagt die Ente ist das Ende vom Mann .", + "66", "842", "2", "062", "082", "28", "062", "36", "66", "" + ); + } +} diff --git a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptorTest.java b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptorTest.java similarity index 85% rename from dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptorTest.java rename to dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptorTest.java index 6cd087aa28..41ebda21e1 100644 --- a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptorTest.java +++ b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/DoubleMetaphonePhoneticTranscriptorTest.java @@ -1,37 +1,38 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import static de.tudarmstadt.ukp.dkpro.core.commonscodec.PhoneticTranscriptorTestUtil.runTest; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; - -import org.junit.Test; - -public class DoubleMetaphonePhoneticTranscriptorTest -{ - - @Test - public void doubleMetaphoneTest() throws Exception { - - runTest( - createEngineDescription(DoubleMetaphonePhoneticTranscriptor.class), - "The knight entered the store in the night .", - "0", "NT", "ANTR", "0", "STR", "AN", "0", "NT", "" - ); - } -} +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.commonscodec.PhoneticTranscriptorTestUtil.runTest; + +import org.dkpro.core.commonscodec.DoubleMetaphonePhoneticTranscriptor; +import org.junit.Test; + +public class DoubleMetaphonePhoneticTranscriptorTest +{ + + @Test + public void doubleMetaphoneTest() throws Exception { + + runTest( + createEngineDescription(DoubleMetaphonePhoneticTranscriptor.class), + "The knight entered the store in the night .", + "0", "NT", "ANTR", "0", "STR", "AN", "0", "NT", "" + ); + } +} diff --git a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/MetaphonePhoneticTranscriptorTest.java b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/MetaphonePhoneticTranscriptorTest.java similarity index 86% rename from dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/MetaphonePhoneticTranscriptorTest.java rename to dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/MetaphonePhoneticTranscriptorTest.java index 239bc2f9cd..41d00077aa 100644 --- a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/MetaphonePhoneticTranscriptorTest.java +++ b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/MetaphonePhoneticTranscriptorTest.java @@ -1,37 +1,38 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import static de.tudarmstadt.ukp.dkpro.core.commonscodec.PhoneticTranscriptorTestUtil.runTest; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; - -import org.junit.Test; - -public class MetaphonePhoneticTranscriptorTest -{ - - @Test - public void metaphoneTest() throws Exception { - - runTest( - createEngineDescription(MetaphonePhoneticTranscriptor.class), - "The knight entered the store in the night .", - "0", "NT", "ENTR", "0", "STR", "IN", "0", "NT", "." - ); - } -} +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.commonscodec.PhoneticTranscriptorTestUtil.runTest; + +import org.dkpro.core.commonscodec.MetaphonePhoneticTranscriptor; +import org.junit.Test; + +public class MetaphonePhoneticTranscriptorTest +{ + + @Test + public void metaphoneTest() throws Exception { + + runTest( + createEngineDescription(MetaphonePhoneticTranscriptor.class), + "The knight entered the store in the night .", + "0", "NT", "ENTR", "0", "STR", "IN", "0", "NT", "." + ); + } +} diff --git a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/PhoneticTranscriptorTestUtil.java b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/PhoneticTranscriptorTestUtil.java similarity index 86% rename from dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/PhoneticTranscriptorTestUtil.java rename to dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/PhoneticTranscriptorTestUtil.java index 569e844fd7..c85fee8189 100644 --- a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/PhoneticTranscriptorTestUtil.java +++ b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/PhoneticTranscriptorTestUtil.java @@ -1,49 +1,53 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.junit.Assert.assertEquals; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.testing.factory.TokenBuilder; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -public class PhoneticTranscriptorTestUtil -{ - public static void runTest(AnalysisEngineDescription desc, String text, String ... goldTranscriptions) throws Exception { - - AnalysisEngine engine = createEngine(desc); - JCas jcas = engine.newJCas(); - TokenBuilder tb = new TokenBuilder(Token.class, Sentence.class); - tb.buildTokens(jcas, text); - engine.process(jcas); - - int i=0; - for (PhoneticTranscription transcription : JCasUtil.select(jcas, PhoneticTranscription.class)) { - assertEquals(goldTranscriptions[i], transcription.getTranscription()); - i++; - } - } -} +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.junit.Assert.assertEquals; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class PhoneticTranscriptorTestUtil +{ + public static void runTest(AnalysisEngineDescription desc, String text, + String... goldTranscriptions) + throws Exception + { + AnalysisEngine engine = createEngine(desc); + JCas jcas = engine.newJCas(); + TokenBuilder tb = new TokenBuilder(Token.class, + Sentence.class); + tb.buildTokens(jcas, text); + engine.process(jcas); + + int i = 0; + for (PhoneticTranscription transcription : JCasUtil.select(jcas, + PhoneticTranscription.class)) { + assertEquals(goldTranscriptions[i], transcription.getTranscription()); + i++; + } + } +} diff --git a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/SoundexPhoneticTranscriptorTest.java b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/SoundexPhoneticTranscriptorTest.java similarity index 86% rename from dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/SoundexPhoneticTranscriptorTest.java rename to dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/SoundexPhoneticTranscriptorTest.java index e0529a5c26..ddab204293 100644 --- a/dkpro-core-commonscodec-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/commonscodec/SoundexPhoneticTranscriptorTest.java +++ b/dkpro-core-commonscodec-asl/src/test/java/org/dkpro/core/commonscodec/SoundexPhoneticTranscriptorTest.java @@ -1,37 +1,38 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.commonscodec; - -import static de.tudarmstadt.ukp.dkpro.core.commonscodec.PhoneticTranscriptorTestUtil.runTest; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; - -import org.junit.Test; - -public class SoundexPhoneticTranscriptorTest -{ - - @Test - public void soundexTest() throws Exception { - - runTest( - createEngineDescription(SoundexPhoneticTranscriptor.class), - "The knight entered the store in the night .", - "T000", "K523", "E536", "T000", "S360", "I500", "T000", "N230", "" - ); - } -} +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.commonscodec; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.commonscodec.PhoneticTranscriptorTestUtil.runTest; + +import org.dkpro.core.commonscodec.SoundexPhoneticTranscriptor; +import org.junit.Test; + +public class SoundexPhoneticTranscriptorTest +{ + + @Test + public void soundexTest() throws Exception { + + runTest( + createEngineDescription(SoundexPhoneticTranscriptor.class), + "The knight entered the store in the night .", + "T000", "K523", "E536", "T000", "S360", "I500", "T000", "N230", "" + ); + } +} diff --git a/dkpro-core-corenlp-gpl/.license-header.txt b/dkpro-core-corenlp-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-corenlp-gpl/.license-header.txt +++ b/dkpro-core-corenlp-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-corenlp-gpl/LICENSE.txt b/dkpro-core-corenlp-gpl/LICENSE.txt index 6e22a15c3c..99ace43661 100644 --- a/dkpro-core-corenlp-gpl/LICENSE.txt +++ b/dkpro-core-corenlp-gpl/LICENSE.txt @@ -654,7 +654,7 @@ the "copyright" line and a pointer to where the full notice is found. GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. Also add information on how to contact you by electronic and paper mail. diff --git a/dkpro-core-corenlp-gpl/pom.xml b/dkpro-core-corenlp-gpl/pom.xml index 9c469a5e92..d1e4525933 100644 --- a/dkpro-core-corenlp-gpl/pom.xml +++ b/dkpro-core-corenlp-gpl/pom.xml @@ -1,6 +1,6 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-gpl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-gpl + 2.3.0-SNAPSHOT ../dkpro-core-gpl - de.tudarmstadt.ukp.dkpro.core.corenlp-gpl + dkpro-core-corenlp-gpl jar DKPro Core GPL - Stanford CoreNLP Suite (v ${corenlp.version}) (GPL) + https://dkpro.github.io/dkpro-core/ - 3.8.0 + 3.9.2 @@ -54,7 +55,13 @@ stanford-corenlp ${corenlp.version} - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-ner-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.coref-asl + org.dkpro.core + dkpro-core-api-coref-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + + xml-apis + xml-apis + runtime junit @@ -130,8 +151,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test @@ -165,7 +186,7 @@ de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-coref-en-default - 20161031.1 + 20181005.1 de.tudarmstadt.ukp.dkpro.core @@ -174,13 +195,8 @@ de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-dewac_175m_600.crf - 20150130.1 - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-hgc_175m_600.crf - 20161213.1 + de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-germeval2014.hgc_175m_600.crf + 20180227.1 de.tudarmstadt.ukp.dkpro.core @@ -245,7 +261,7 @@ de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-ar-sr - 20141031.1 + 20180227.1 de.tudarmstadt.ukp.dkpro.core @@ -324,19 +340,14 @@ de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-accurate - 20131112.1 + de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-default + 20180103.1 de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-ud 20161213.1 - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-dewac - 20140827.1 - de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-fast @@ -355,17 +366,17 @@ de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-bidirectional-distsim - 20140616.1 + 20181002.1 de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-left3words-distsim - 20140616.1 + 20181002.1 de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-caseless-left3words-distsim - 20140827.0 + 20181002.0 de.tudarmstadt.ukp.dkpro.core @@ -380,7 +391,7 @@ de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-wsj-0-18-left3words-distsim - 20131112.1 + 20140616.1 de.tudarmstadt.ukp.dkpro.core @@ -417,11 +428,6 @@ de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-zh-distsim 20140616.1 - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-zh-nodistsim - 20140616.1 - de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-es-ancora.distsim.s512.crf @@ -525,7 +531,7 @@ de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-en-ud - 20150418.1 + 20161213.1 de.tudarmstadt.ukp.dkpro.core @@ -540,7 +546,7 @@ de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-fr-ud - 20160114.1 + 20180227.1 de.tudarmstadt.ukp.dkpro.core @@ -557,6 +563,11 @@ de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-zh-ud 20161223.1 + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.corenlp-model-tokenizer-ar-atb-bn-arztrain + 20170614.1 + @@ -567,6 +578,7 @@ maven-dependency-plugin + xml-apis:xml-apis de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-coref-en-default de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-all.3class.distsim.crf @@ -675,12 +687,7 @@ de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-dewac_175m_600.crf - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-hgc_175m_600.crf + de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-germeval2014.hgc_175m_600.crf test @@ -703,11 +710,6 @@ de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-hgc test - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-dewac - test - de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-caseless-left3words-distsim @@ -725,7 +727,7 @@ de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-accurate + de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-default test @@ -878,6 +880,11 @@ de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-zh-ud test + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.corenlp-model-tokenizer-ar-atb-bn-arztrain + test +
@@ -888,6 +895,7 @@ + de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.corenlp-model-tokenizer-ar-atb-bn-arztrain de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-de-ud de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-en-ptb-conll de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-en-ud @@ -898,8 +906,7 @@ de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-zh-ptb-conll de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.corenlp-model-depparser-zh-ud de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-nemgp - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-dewac_175m_600.crf - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-hgc_175m_600.crf + de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-germeval2014.hgc_175m_600.crf de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-all.3class.caseless.distsim.crf de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-conll.4class.caseless.distsim.crf de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-conll.4class.distsim.crf @@ -931,9 +938,8 @@ de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-zh-factored de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-zh-sr de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-zh-xinhua-factored - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-accurate + de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-default de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-ud - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-dewac de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-fast-caseless de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-fast de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-hgc @@ -953,4 +959,5 @@ -
+ +
diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpPosTagger.java b/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpPosTagger.java deleted file mode 100644 index e04340bfe2..0000000000 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpPosTagger.java +++ /dev/null @@ -1,239 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; - -import static org.apache.uima.util.Level.INFO; - -import java.io.IOException; -import java.net.URL; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro; -import edu.stanford.nlp.parser.lexparser.Lexicon; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.POSTaggerAnnotator; -import edu.stanford.nlp.process.PTBEscapingProcessor; -import edu.stanford.nlp.tagger.maxent.MaxentTagger; -import edu.stanford.nlp.util.StringUtils; - -/** - * Part-of-speech tagger from CoreNLP. - */ -@ResourceMetaData(name="CoreNLP POS-Tagger") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) -public class CoreNlpPosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - private boolean printTagSet; - - /** - * Use this language instead of the document language to resolve the model and tag set mapping. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - private String language; - - /** - * Variant of a model the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - private String variant; - - /** - * Location from which the model is read. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - private String modelLocation; - - /** - * The character encoding used by the model. - */ - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) - private String modelEncoding; - - /** - * Location of the mapping file for part-of-speech tags to UIMA types. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - private String posMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code false} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internStrings; - - public static final String PARAM_MAX_SENTENCE_LENGTH = ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; - @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = true, defaultValue = "2147483647") - private int maxSentenceLength; - - public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; - @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS) - private int numThreads; - - /** - * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-). - * - * @see PTBEscapingProcessor - */ - public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping"; - @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true") - private boolean ptb3Escaping; - - /** - * List of extra token texts (usually single character strings) that should be treated like - * opening quotes and escaped accordingly before being sent to the parser. - */ - public static final String PARAM_QUOTE_BEGIN = "quoteBegin"; - @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false) - private List quoteBegin; - - /** - * List of extra token texts (usually single character strings) that should be treated like - * closing quotes and escaped accordingly before being sent to the parser. - */ - public static final String PARAM_QUOTE_END = "quoteEnd"; - @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false) - private List quoteEnd; - - private CasConfigurableProviderBase annotatorProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - annotatorProvider = new CoreNlpPosTaggerModelProvider(this); - - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, annotatorProvider); - - numThreads = ComponentParameters.computeNumThreads(numThreads); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - annotatorProvider.configure(cas); - mappingProvider.configure(cas); - - // Transfer from CAS to CoreNLP - DKPro2CoreNlp converter = new DKPro2CoreNlp(); - converter.setPtb3Escaping(ptb3Escaping); - converter.setQuoteBegin(quoteBegin); - converter.setQuoteEnd(quoteEnd); - converter.setEncoding(modelEncoding); - - Annotation document = new Annotation((String) null); - converter.convert(aJCas, document); - - // Actual processing - annotatorProvider.getResource().annotate(document); - - // Transfer back into the CAS - CoreNlp2DKPro.convertPOSs(aJCas, document, mappingProvider, internStrings); - } - - private class CoreNlpPosTaggerModelProvider - extends ModelProviderBase - { - public CoreNlpPosTaggerModelProvider(Object aObject) - { - super(aObject, "stanfordnlp", "tagger"); - // setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/stanfordnlp"); - setDefault(LOCATION, - "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/tagger-${language}-${variant}.properties"); - } - - @Override - protected POSTaggerAnnotator produceResource(URL aUrl) throws IOException - { - String modelFile = aUrl.toString(); - - // Loading gzipped files from URL is broken in CoreNLP - // https://github.com/stanfordnlp/CoreNLP/issues/94 - if (modelFile.startsWith("jar:") && modelFile.endsWith(".gz")) { - modelFile = org.apache.commons.lang3.StringUtils.substringAfter(modelFile, "!/"); - } - - MaxentTagger tagger = new MaxentTagger(modelFile, - StringUtils.argsToProperties("-model", modelFile), - false); - - SingletonTagset tags = new SingletonTagset(POS.class, getResourceMetaData() - .getProperty(("pos.tagset"))); - tags.addAll(tagger.tagSet()); - tags.remove(Lexicon.BOUNDARY_TAG); - addTagset(tags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - POSTaggerAnnotator annotator = new POSTaggerAnnotator(tagger, maxSentenceLength, - numThreads); - - return annotator; - } - } -} diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpSegmenter.java b/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpSegmenter.java deleted file mode 100644 index d34ad26f31..0000000000 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpSegmenter.java +++ /dev/null @@ -1,188 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; - -import java.io.IOException; -import java.net.URL; -import java.util.Properties; -import java.util.Set; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; -import edu.stanford.nlp.ling.CoreAnnotations; -import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.TokenizerAnnotator; -import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator; -import edu.stanford.nlp.process.WordToSentenceProcessor; -import edu.stanford.nlp.util.CoreMap; - -/** - * Tokenizer and sentence splitter using from CoreNLP. - */ -@ResourceMetaData(name="CoreNLP Segmenter") -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class CoreNlpSegmenter - extends SegmenterBase -{ - private boolean verbose; - - /** - * The set of boundary tokens. If null, use default. - * - * @see WordToSentenceProcessor#WordToSentenceProcessor - */ - public static final String PARAM_BOUNDARY_TOKEN_REGEX = "boundaryTokenRegex"; - @ConfigurationParameter(name = PARAM_BOUNDARY_TOKEN_REGEX, mandatory = false, defaultValue = WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX) - private String boundaryTokenRegex; - - public static final String PARAM_BOUNDARY_MULTI_TOKEN_REGEX = "boundaryMultiTokenRegex"; - @ConfigurationParameter(name = PARAM_BOUNDARY_MULTI_TOKEN_REGEX, mandatory = false) - private String boundaryMultiTokenRegex; - - /** - * These are elements like "p" or "sent", which will be wrapped into regex for approximate XML - * matching. They will be deleted in the output, and will always trigger a sentence boundary. - */ - public static final String PARAM_HTML_ELEMENTS_TO_DISCARD = "htmlElementsToDiscard"; - @ConfigurationParameter(name = PARAM_HTML_ELEMENTS_TO_DISCARD, mandatory = false) - private Set htmlElementsToDiscard; - - /** - * The set of regex for sentence boundary tokens that should be discarded. - * - * @see WordToSentenceProcessor#DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD - */ - public static final String PARAM_BOUNDARIES_TO_DISCARD = "boundaryToDiscard"; - @ConfigurationParameter(name = PARAM_BOUNDARIES_TO_DISCARD, mandatory = false, defaultValue = { - "\n", "*NL*" }) - private Set boundaryToDiscard; - - /** - * Strategy for treating newlines as sentence breaks. - */ - public static final String PARAM_NEWLINE_IS_SENTENCE_BREAK = "newlineIsSentenceBreak"; - @ConfigurationParameter(name = PARAM_NEWLINE_IS_SENTENCE_BREAK, mandatory = false, defaultValue = "two") - private String newlineIsSentenceBreak; - - /** - * The set of regex for sentence boundary tokens that should be discarded. - */ - public static final String PARAM_TOKEN_REGEXES_TO_DISCARD = "tokenRegexesToDiscard"; - @ConfigurationParameter(name = PARAM_TOKEN_REGEXES_TO_DISCARD, mandatory = false, defaultValue = {}) - private Set tokenRegexesToDiscard; - - private ModelProviderBase sentenceAnnotator; - private ModelProviderBase tokenizerAnnotator; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - tokenizerAnnotator = new ModelProviderBase(this, "corenlp", "tokenizer") - { - { - setDefault(LOCATION, NOT_REQUIRED); - } - - @Override - protected TokenizerAnnotator produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - - Properties coreNlpProps = new Properties(); - coreNlpProps.setProperty("tokenize.language", props.getProperty(LANGUAGE)); - //coreNlpProps.setProperty("tokenize.class", null); - //coreNlpProps.setProperty("tokenize.whitespace", "false"); - //coreNlpProps.setProperty("tokenize.options", null); - //coreNlpProps.setProperty("tokenize.keepeol", "false"); - - String extraOptions = null; - - TokenizerAnnotator annotator = new TokenizerAnnotator(verbose, coreNlpProps, - extraOptions); - - return annotator; - } - }; - - sentenceAnnotator = new ModelProviderBase(this, "corenlp", "sentence") - { - { - setDefault(LOCATION, NOT_REQUIRED); - } - - @Override - protected WordsToSentencesAnnotator produceResource(URL aUrl) - throws IOException - { - WordsToSentencesAnnotator annotator = new WordsToSentencesAnnotator(verbose, - boundaryTokenRegex, boundaryToDiscard, htmlElementsToDiscard, - newlineIsSentenceBreak, boundaryMultiTokenRegex, tokenRegexesToDiscard); - - return annotator; - } - }; - } - - @Override - protected void process(JCas aJCas, String aText, int aZoneBegin) - throws AnalysisEngineProcessException - { - Annotation document = new Annotation(aText); - - if (isWriteToken()) { - tokenizerAnnotator.configure(aJCas.getCas()); - tokenizerAnnotator.getResource().annotate(document); - - for (CoreLabel token : document.get(CoreAnnotations.TokensAnnotation.class)) { - createToken(aJCas, - token.get(CharacterOffsetBeginAnnotation.class) + aZoneBegin, - token.get(CharacterOffsetEndAnnotation.class) + aZoneBegin); - } - } - - if (isWriteSentence()) { - sentenceAnnotator.configure(aJCas.getCas()); - sentenceAnnotator.getResource().annotate(document); - - for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) { - createSentence(aJCas, - sentence.get(CharacterOffsetBeginAnnotation.class) + aZoneBegin, - sentence.get(CharacterOffsetEndAnnotation.class) + aZoneBegin); - } - } - } -} diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/TokenKey.java b/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/TokenKey.java deleted file mode 100644 index f3c57df10a..0000000000 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/TokenKey.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.corenlp.internal; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.stanford.nlp.util.TypesafeMap.Key; - -public class TokenKey - implements Key -{ -} \ No newline at end of file diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpCoreferenceResolver.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpCoreferenceResolver.java similarity index 79% rename from dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpCoreferenceResolver.java rename to dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpCoreferenceResolver.java index bfc337a753..0e02946ce7 100644 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpCoreferenceResolver.java +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpCoreferenceResolver.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; +package org.dkpro.core.corenlp; import java.io.IOException; import java.net.URL; @@ -33,20 +33,26 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.corenlp.internal.CoreNlp2DKPro; +import org.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro; +import edu.stanford.nlp.coref.hybrid.HybridCorefProperties; import edu.stanford.nlp.dcoref.Constants; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.DeterministicCorefAnnotator; import edu.stanford.nlp.process.PTBEscapingProcessor; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Deterministic coreference annotator from CoreNLP. */ -@ResourceMetaData(name="CoreNLP Coreference Resolver") +@Component(OperationType.CO_REFERENCE_ANNOTATOR) +@ResourceMetaData(name = "CoreNLP Coreference Resolver") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", @@ -65,7 +71,8 @@ public class CoreNlpCoreferenceResolver * DCoRef parameter: Sieve passes - each class is defined in dcoref/sievepasses/. */ public static final String PARAM_SIEVES = "sieves"; - @ConfigurationParameter(name = PARAM_SIEVES, defaultValue = Constants.SIEVEPASSES, mandatory = true) + @ConfigurationParameter(name = PARAM_SIEVES, defaultValue = Constants.SIEVEPASSES, + mandatory = true) private String sieves; /** @@ -163,21 +170,22 @@ private class CoreNlpPosTaggerModelProvider public CoreNlpPosTaggerModelProvider(Object aObject) { super(aObject, "stanfordnlp", "coref"); - setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/stanfordnlp"); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "${groupId}.stanfordnlp-model-coref-${language}-${variant}"); - setDefault(LOCATION, "classpath:/${package}/lib/coref/${language}/${variant}/countries"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/coref/${language}/${variant}/countries"); setDefault(VARIANT, "default"); } @Override protected DeterministicCorefAnnotator produceResource(URL aUrl) throws IOException { - String base = FilenameUtils.getFullPathNoEndSeparator(aUrl.toString())+"/"; + String base = FilenameUtils.getFullPathNoEndSeparator(aUrl.toString()) + "/"; // Loading gzipped files from URL is broken in CoreNLP // https://github.com/stanfordnlp/CoreNLP/issues/94 String logicalBase = getModelLocation(getAggregatedProperties()); - logicalBase = FilenameUtils.getFullPathNoEndSeparator(logicalBase)+"/"; + logicalBase = FilenameUtils.getFullPathNoEndSeparator(logicalBase) + "/"; logicalBase = logicalBase.substring("classpath:/".length()); Properties props = new Properties(); @@ -196,28 +204,42 @@ protected DeterministicCorefAnnotator produceResource(URL aUrl) throws IOExcepti // Cf. edu.stanford.nlp.dcoref.Dictionaries.Dictionaries(Properties) // props.getProperty(Constants.DEMONYM_PROP, DefaultPaths.DEFAULT_DCOREF_DEMONYM), props.setProperty(Constants.DEMONYM_PROP, base + "demonyms.txt"); + props.setProperty(HybridCorefProperties.DEMONYM_PROP, base + "demonyms.txt"); // props.getProperty(Constants.ANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_ANIMATE), props.setProperty(Constants.ANIMATE_PROP, base + "animate.unigrams.txt"); + props.setProperty(HybridCorefProperties.ANIMATE_PROP, base + "animate.unigrams.txt"); // props.getProperty(Constants.INANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_INANIMATE), props.setProperty(Constants.INANIMATE_PROP, base + "inanimate.unigrams.txt"); + props.setProperty(HybridCorefProperties.INANIMATE_PROP, base + "inanimate.unigrams.txt"); // props.getProperty(Constants.MALE_PROP), props.setProperty(Constants.MALE_PROP, base + "male.unigrams.txt"); + props.setProperty(HybridCorefProperties.MALE_PROP, base + "male.unigrams.txt"); // props.getProperty(Constants.NEUTRAL_PROP), props.setProperty(Constants.NEUTRAL_PROP, base + "neutral.unigrams.txt"); + props.setProperty(HybridCorefProperties.NEUTRAL_PROP, base + "neutral.unigrams.txt"); // props.getProperty(Constants.FEMALE_PROP), props.setProperty(Constants.FEMALE_PROP, base + "female.unigrams.txt"); + props.setProperty(HybridCorefProperties.FEMALE_PROP, base + "female.unigrams.txt"); // props.getProperty(Constants.PLURAL_PROP), props.setProperty(Constants.PLURAL_PROP, base + "plural.unigrams.txt"); + props.setProperty(HybridCorefProperties.PLURAL_PROP, base + "plural.unigrams.txt"); // props.getProperty(Constants.SINGULAR_PROP), props.setProperty(Constants.SINGULAR_PROP, base + "singular.unigrams.txt"); + props.setProperty(HybridCorefProperties.SINGULAR_PROP, base + "singular.unigrams.txt"); // props.getProperty(Constants.STATES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES), props.setProperty(Constants.STATES_PROP, base + "state-abbreviations.txt"); - //props.getProperty(Constants.GENDER_NUMBER_PROP, DefaultPaths.DEFAULT_DCOREF_GENDER_NUMBER); - props.setProperty(Constants.GENDER_NUMBER_PROP, logicalBase + "gender.map.ser.gz"); + props.setProperty(HybridCorefProperties.STATES_PROP, base + "state-abbreviations.txt"); + // props.getProperty(Constants.GENDER_NUMBER_PROP, + // DefaultPaths.DEFAULT_DCOREF_GENDER_NUMBER); + props.setProperty(Constants.GENDER_NUMBER_PROP, base + "gender.map.ser.gz"); + props.setProperty(HybridCorefProperties.GENDER_NUMBER_PROP, base + "gender.data.gz"); // props.getProperty(Constants.COUNTRIES_PROP, DefaultPaths.DEFAULT_DCOREF_COUNTRIES), props.setProperty(Constants.COUNTRIES_PROP, base + "countries"); - // props.getProperty(Constants.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES), + props.setProperty(HybridCorefProperties.COUNTRIES_PROP, base + "countries"); + // props.getProperty(Constants.STATES_PROVINCES_PROP, + // DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES), props.setProperty(Constants.STATES_PROVINCES_PROP, base + "statesandprovinces"); + props.setProperty(HybridCorefProperties.STATES_PROVINCES_PROP, base + "statesandprovinces"); // The following properties are only relevant if the "CorefDictionaryMatch" sieve // is enabled. @@ -229,9 +251,12 @@ protected DeterministicCorefAnnotator produceResource(URL aUrl) throws IOExcepti + "coref.dict4.tsv" + ']'); // props.getProperty(Constants.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1), props.put(Constants.DICT_PMI_PROP, base + "coref.dict1.tsv"); - // props.getProperty(Constants.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES)); + // props.getProperty(Constants.SIGNATURES_PROP, + // DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES)); props.put(Constants.SIGNATURES_PROP, base + "ne.signatures.txt"); + props.put("coref.md.model", base + "md-model-dep.ser.gz"); + DeterministicCorefAnnotator annotator = new DeterministicCorefAnnotator(props); return annotator; diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpDependencyParser.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpDependencyParser.java similarity index 77% rename from dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpDependencyParser.java rename to dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpDependencyParser.java index c58fc37449..1986697f61 100644 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpDependencyParser.java +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpDependencyParser.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; +package org.dkpro.core.corenlp; import static org.apache.uima.util.Level.INFO; @@ -35,27 +35,32 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.corenlp.internal.CoreNlp2DKPro; +import org.dkpro.core.corenlp.internal.DKPro2CoreNlp; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro; import edu.stanford.nlp.parser.nndep.DependencyParser; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.DependencyParseAnnotator; import edu.stanford.nlp.process.PTBEscapingProcessor; import edu.stanford.nlp.trees.GrammaticalStructure; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Dependency parser from CoreNLP. */ -@ResourceMetaData(name="CoreNLP Dependency Parser") +@Component(OperationType.DEPENDENCY_PARSER) +@ResourceMetaData(name = "CoreNLP Dependency Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -68,11 +73,9 @@ public class CoreNlpDependencyParser { /** * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") private boolean printTagSet; /** @@ -90,6 +93,20 @@ public class CoreNlpDependencyParser @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) private String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -104,31 +121,41 @@ public class CoreNlpDependencyParser @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) private String modelEncoding; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; + public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = + ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) private String dependencyMappingLocation; /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code false} + * Maximum sentence length. Longer sentences are skipped. */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internStrings; - - public static final String PARAM_MAX_SENTENCE_LENGTH = ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; + public static final String PARAM_MAX_SENTENCE_LENGTH = + ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = true, defaultValue = "2147483647") private int maxSentenceLength; + /** + * Number of parallel threads to use. + */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; - @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS) + @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, + defaultValue = ComponentParameters.AUTO_NUM_THREADS) private int numThreads; + /** + * Maximum time to spend on a single sentence. + */ public static final String PARAM_MAX_TIME = "maxTime"; @ConfigurationParameter(name = PARAM_MAX_TIME, mandatory = true, defaultValue = "-1") private int maxTime; @@ -158,8 +185,12 @@ public class CoreNlpDependencyParser @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false) private List quoteEnd; + /** + * Types of extra edges to add to the dependency tree. + */ public static final String PARAM_EXTRA_DEPENDENCIES = "extraDependencies"; - @ConfigurationParameter(name = PARAM_EXTRA_DEPENDENCIES, mandatory = true, defaultValue="NONE") + @ConfigurationParameter(name = PARAM_EXTRA_DEPENDENCIES, mandatory = true, + defaultValue = "NONE") GrammaticalStructure.Extras extraDependencies; private CasConfigurableProviderBase annotatorProvider; @@ -173,7 +204,7 @@ public void initialize(UimaContext aContext) annotatorProvider = new CoreNlpDependencyParserModelProvider(this); - mappingProvider = MappingProviderFactory.createDependencyMappingProvider( + mappingProvider = MappingProviderFactory.createDependencyMappingProvider(this, dependencyMappingLocation, language, annotatorProvider); numThreads = ComponentParameters.computeNumThreads(numThreads); @@ -202,7 +233,7 @@ public void process(JCas aJCas) annotatorProvider.getResource().annotate(document); // Transfer back into the CAS - CoreNlp2DKPro.convertDependencies(aJCas, document, mappingProvider, internStrings); + CoreNlp2DKPro.convertDependencies(aJCas, document, mappingProvider); } private class CoreNlpDependencyParserModelProvider @@ -211,6 +242,10 @@ private class CoreNlpDependencyParserModelProvider public CoreNlpDependencyParserModelProvider(Object aObject) { super(aObject, "corenlp", "depparser"); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/depparser-${language}-${variant}.properties"); } @Override diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpLemmatizer.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpLemmatizer.java similarity index 83% rename from dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpLemmatizer.java rename to dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpLemmatizer.java index f1d7a7d00f..7b9f748b31 100644 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpLemmatizer.java +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpLemmatizer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; +package org.dkpro.core.corenlp; import java.io.IOException; import java.net.URL; @@ -31,20 +31,25 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.Messages; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.corenlp.internal.CoreNlp2DKPro; +import org.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.Messages; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.MorphaAnnotator; import edu.stanford.nlp.process.PTBEscapingProcessor; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Lemmatizer from CoreNLP. */ -@ResourceMetaData(name="CoreNLP Lemmatizer") +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "CoreNLP Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -110,7 +115,8 @@ public void process(JCas aJCas) { if (!"en".equals(aJCas.getDocumentLanguage())) { throw new AnalysisEngineProcessException(Messages.BUNDLE, - Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { aJCas.getDocumentLanguage() }); + Messages.ERR_UNSUPPORTED_LANGUAGE, + new String[] { aJCas.getDocumentLanguage() }); } CAS cas = aJCas.getCas(); diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpNamedEntityRecognizer.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpNamedEntityRecognizer.java similarity index 76% rename from dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpNamedEntityRecognizer.java rename to dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpNamedEntityRecognizer.java index 61adcbb0b7..0fa5e11cd1 100644 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpNamedEntityRecognizer.java +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpNamedEntityRecognizer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,11 +14,12 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; +package org.dkpro.core.corenlp; import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createNerMappingProvider; import java.io.IOException; import java.io.InputStream; @@ -37,13 +38,12 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.corenlp.internal.CoreNlp2DKPro; +import org.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro; import edu.stanford.nlp.ie.AbstractSequenceClassifier; import edu.stanford.nlp.ie.NERClassifierCombiner; import edu.stanford.nlp.ie.crf.CRFClassifier; @@ -54,11 +54,16 @@ import edu.stanford.nlp.process.PTBEscapingProcessor; import edu.stanford.nlp.util.ErasureUtils; import edu.stanford.nlp.util.StringUtils; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Named entity recognizer from CoreNLP. */ -@ResourceMetaData(name="CoreNLP Named Entity Recognizer") +@Component(OperationType.NAMED_ENTITITY_RECOGNIZER) +@ResourceMetaData(name = "CoreNLP Named Entity Recognizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -70,11 +75,9 @@ public class CoreNlpNamedEntityRecognizer { /** * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected boolean printTagSet; /** @@ -92,6 +95,20 @@ public class CoreNlpNamedEntityRecognizer @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) private String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -109,30 +126,32 @@ public class CoreNlpNamedEntityRecognizer /** * Location of the mapping file for named entity tags to UIMA types. */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = + ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) private String mappingLocation; /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code false} + * Maximum sentence length. Longer sentences are skipped. */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internStrings; - - public static final String PARAM_MAX_SENTENCE_LENGTH = ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; + public static final String PARAM_MAX_SENTENCE_LENGTH = + ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = true, defaultValue = "2147483647") private int maxSentenceLength; + /** + * Maximum time to spend on a single sentence. + */ public static final String PARAM_MAX_TIME = "maxTime"; @ConfigurationParameter(name = PARAM_MAX_TIME, mandatory = true, defaultValue = "-1") private int maxTime; + /** + * Number of parallel threads to use. + */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; - @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS) + @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, + defaultValue = ComponentParameters.AUTO_NUM_THREADS) private int numThreads; /** @@ -164,20 +183,19 @@ public class CoreNlpNamedEntityRecognizer * @see NERClassifierCombiner#APPLY_NUMERIC_CLASSIFIERS_DEFAULT */ public static final String PARAM_APPLY_NUMERIC_CLASSIFIERS = "applyNumericClassifiers"; - @ConfigurationParameter(name = PARAM_APPLY_NUMERIC_CLASSIFIERS, mandatory = true, defaultValue="true") - boolean applyNumericClassifiers; + @ConfigurationParameter(name = PARAM_APPLY_NUMERIC_CLASSIFIERS, mandatory = true, defaultValue = "true") + private boolean applyNumericClassifiers; +// /** +// * Use SUTime if it is available on the classpath. SUTime only works for English. +// */ +// public static final String PARAM_USE_SUTIME = "useSUTime"; +// @ConfigurationParameter(name = PARAM_USE_SUTIME, mandatory = true, defaultValue = "false") // FIXME Using USE_SUTIME_DEFAULT autodetects presence of SUTime. Need three values here: // on, off, auto - public static final String PARAM_USE_SUTIME = "useSUTime"; - @ConfigurationParameter(name = PARAM_USE_SUTIME, mandatory = true, defaultValue="false") - boolean useSUTime; // = NumberSequenceClassifier.USE_SUTIME_DEFAULT; - - public static final String PARAM_AUGMENT_REGEX_NER = "augmentRegexNER"; - @ConfigurationParameter(name = PARAM_AUGMENT_REGEX_NER, mandatory = true, defaultValue="false") - boolean augmentRegexNER; // = NERClassifierCombiner.APPLY_GAZETTE_PROPERTY; + private boolean useSUTime = false; // = NumberSequenceClassifier.USE_SUTIME_DEFAULT; - boolean verbose = false; + private boolean verbose = false; private ModelProviderBase annotatorProvider; private MappingProvider mappingProvider; @@ -190,15 +208,8 @@ public void initialize(UimaContext aContext) annotatorProvider = new CoreNlpNamedEntityRecognizerModelProvider(this); - mappingProvider = new MappingProvider(); - mappingProvider - .setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-default-variants.map"); - mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/corenlp/lib/ner-${language}-${variant}.map"); - mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName()); - mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation); - mappingProvider.setOverride(MappingProvider.LANGUAGE, language); - mappingProvider.setOverride(MappingProvider.VARIANT, variant); + mappingProvider = createNerMappingProvider(this, mappingLocation, language, variant, + annotatorProvider); numThreads = ComponentParameters.computeNumThreads(numThreads); } @@ -226,16 +237,16 @@ public void process(JCas aJCas) annotatorProvider.getResource().annotate(document); // Transfer back into the CAS - CoreNlp2DKPro.convertNamedEntities(aJCas, document, mappingProvider, internStrings); + CoreNlp2DKPro.convertNamedEntities(aJCas, document, mappingProvider); } private class CoreNlpNamedEntityRecognizerModelProvider - extends ModelProviderBase + extends ModelProviderBase { public CoreNlpNamedEntityRecognizerModelProvider(Object aObject) { super(aObject, "stanfordnlp", "ner"); - // setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/stanfordnlp"); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-${language}-${variant}.properties"); } @@ -296,10 +307,10 @@ protected NERCombinerAnnotator produceResource(URL aUrl) throws IOException } NERClassifierCombiner combiner = new NERClassifierCombiner(applyNumericClassifiers, - useSUTime, augmentRegexNER, classifier); + useSUTime, classifier); NERCombinerAnnotator annotator = new NERCombinerAnnotator(combiner, verbose, - numThreads, maxTime, maxSentenceLength); + numThreads, maxTime, maxSentenceLength, false, false); return annotator; } } diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpParser.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpParser.java similarity index 83% rename from dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpParser.java rename to dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpParser.java index ee246d7556..7a7b60b972 100644 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpParser.java +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpParser.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,12 +14,15 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; +package org.dkpro.core.corenlp; import static org.apache.uima.util.Level.INFO; import static org.apache.uima.util.Level.WARNING; +import static org.dkpro.core.api.resources.MappingProviderFactory.createConstituentMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createDependencyMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.IOException; import java.net.URL; @@ -36,18 +39,17 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.corenlp.internal.CoreNlp2DKPro; +import org.dkpro.core.corenlp.internal.DKPro2CoreNlp; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro; import edu.stanford.nlp.parser.common.ParserGrammar; import edu.stanford.nlp.parser.lexparser.LexicalizedParser; import edu.stanford.nlp.parser.lexparser.Lexicon; @@ -66,11 +68,16 @@ import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations; import edu.stanford.nlp.trees.UniversalEnglishGrammaticalStructureFactory; import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalRelations; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Parser from CoreNLP. */ -@ResourceMetaData(name="CoreNLP Parser") +@Component(OperationType.CONSTITUENCY_PARSER) +@ResourceMetaData(name = "CoreNLP Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -84,11 +91,9 @@ public class CoreNlpParser { /** * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") private boolean printTagSet; /** @@ -106,6 +111,20 @@ public class CoreNlpParser @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) private String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -120,45 +139,58 @@ public class CoreNlpParser @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) private String modelEncoding; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for dependency tags to UIMA types. */ - public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; + public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = + ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) private String dependencyMappingLocation; /** * Location of the mapping file for dependency tags to UIMA types. */ - public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; + public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = + ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) private String constituentMappingLocation; /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) private String posMappingLocation; /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code false} + * Maximum sentence length. Longer sentences are skipped. */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internStrings; - - public static final String PARAM_MAX_SENTENCE_LENGTH = ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; + public static final String PARAM_MAX_SENTENCE_LENGTH = + ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = true, defaultValue = "2147483647") private int maxSentenceLength; - public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; - @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS) + /** + * Number of parallel threads to use. + */ + public static final String PARAM_NUM_THREADS = + ComponentParameters.PARAM_NUM_THREADS; + @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, + defaultValue = ComponentParameters.AUTO_NUM_THREADS) private int numThreads; + /** + * Maximum time to spend on a single sentence. + */ public static final String PARAM_MAX_TIME = "maxTime"; @ConfigurationParameter(name = PARAM_MAX_TIME, mandatory = true, defaultValue = "-1") private int maxTime; @@ -188,25 +220,25 @@ public class CoreNlpParser @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false) private List quoteEnd; + /** + * Types of extra edges to add to the dependency tree. + */ public static final String PARAM_EXTRA_DEPENDENCIES = "extraDependencies"; - @ConfigurationParameter(name = PARAM_EXTRA_DEPENDENCIES, mandatory = true, defaultValue="NONE") + @ConfigurationParameter(name = PARAM_EXTRA_DEPENDENCIES, mandatory = true, defaultValue = "NONE") GrammaticalStructure.Extras extraDependencies; /** * Sets whether to create or not to create constituent tags. This is required for POS-tagging * and lemmatization. - *

- * Default: {@code true} */ - public static final String PARAM_WRITE_CONSTITUENT = ComponentParameters.PARAM_WRITE_CONSTITUENT; + public static final String PARAM_WRITE_CONSTITUENT = + ComponentParameters.PARAM_WRITE_CONSTITUENT; @ConfigurationParameter(name = PARAM_WRITE_CONSTITUENT, mandatory = true, defaultValue = "true") private boolean writeConstituent; /** * If this parameter is set to true, each sentence is annotated with a PennTree-Annotation, * containing the whole parse tree in Penn Treebank style format. - *

- * Default: {@code false} */ public static final String PARAM_WRITE_PENN_TREE = ComponentParameters.PARAM_WRITE_PENN_TREE; @ConfigurationParameter(name = PARAM_WRITE_PENN_TREE, mandatory = true, defaultValue = "false") @@ -214,8 +246,6 @@ public class CoreNlpParser /** * Sets whether to use or not to use existing POS tags. - *

- * Default: {@code true} */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") @@ -224,8 +254,6 @@ public class CoreNlpParser /** * Sets whether to create or not to create POS tags. The creation of constituent tags must be * turned on for this to work. - *

- * Default: {@code false} */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "false") @@ -233,18 +261,22 @@ public class CoreNlpParser /** * Sets whether to create or not to create dependency annotations. - * - *

Default: {@code true} */ public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") private boolean writeDependency; + /** + * Generate original Stanford Dependencies grammatical relations instead of Universal + * Dependencies. + */ public static final String PARAM_ORIGINAL_DEPENDENCIES = "originalDependencies"; @ConfigurationParameter(name = PARAM_ORIGINAL_DEPENDENCIES, mandatory = true, defaultValue = "true") private boolean originalDependencies; - // CoreNlpParser PARAM_KEEP_PUNCTUATION has no effect #965 + /** + * Whether to keep punctuation dependencies in the dependency parse output of the parser. + */ public static final String PARAM_KEEP_PUNCTUATION = "keepPunctuation"; @ConfigurationParameter(name = PARAM_KEEP_PUNCTUATION, mandatory = true, defaultValue = "false") private boolean keepPunctuation; @@ -262,14 +294,14 @@ public void initialize(UimaContext aContext) annotatorProvider = new CoreNlpParserModelProvider(this); - constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider( + constituentMappingProvider = createConstituentMappingProvider(this, constituentMappingLocation, language, annotatorProvider); - - dependencyMappingProvider = MappingProviderFactory.createDependencyMappingProvider( - dependencyMappingLocation, language, annotatorProvider); - - posMappingProvider = MappingProviderFactory.createPosMappingProvider( - posMappingLocation, language, annotatorProvider); + + dependencyMappingProvider = createDependencyMappingProvider(this, dependencyMappingLocation, + language, annotatorProvider); + + posMappingProvider = createPosMappingProvider(this, posMappingLocation, language, + annotatorProvider); numThreads = ComponentParameters.computeNumThreads(numThreads); } @@ -309,14 +341,13 @@ public void process(JCas aJCas) // Transfer back into the CAS if (writePos) { - posMappingProvider.configure(cas); - CoreNlp2DKPro.convertPOSs(aJCas, document, posMappingProvider, internStrings); + posMappingProvider.configure(cas); + CoreNlp2DKPro.convertPOSs(aJCas, document, posMappingProvider); } if (writeConstituent) { constituentMappingProvider.configure(cas); - CoreNlp2DKPro.convertConstituents(aJCas, document, constituentMappingProvider, - internStrings, tlp); + CoreNlp2DKPro.convertConstituents(aJCas, document, constituentMappingProvider, tlp); } if (writePennTree) { @@ -325,8 +356,7 @@ public void process(JCas aJCas) if (writeDependency) { dependencyMappingProvider.configure(cas); - CoreNlp2DKPro.convertDependencies(aJCas, document, dependencyMappingProvider, - internStrings); + CoreNlp2DKPro.convertDependencies(aJCas, document, dependencyMappingProvider); } } @@ -337,6 +367,7 @@ public CoreNlpParserModelProvider(Object aObject) { super(aObject, "stanfordnlp", "parser"); // setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/stanfordnlp"); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/parser-${language}-${variant}.properties"); } @@ -366,7 +397,8 @@ protected ParserAnnotator produceResource(URL aUrl) throws IOException // coreNlpProps.setProperty("parse.treemap", ...); coreNlpProps.setProperty("parse.maxtime", Integer.toString(maxTime)); coreNlpProps.setProperty("parse.buildgraphs", Boolean.toString(writeDependency)); - coreNlpProps.setProperty("parse.originalDependencies", Boolean.toString(originalDependencies)); + coreNlpProps.setProperty("parse.originalDependencies", + Boolean.toString(originalDependencies)); coreNlpProps.setProperty("parse.nthreads", Integer.toString(numThreads)); // coreNlpProps.setProperty("parse.binaryTrees", ...); // coreNlpProps.setProperty("parse.nosquash", ...); @@ -496,7 +528,8 @@ else if (parser instanceof ShiftReduceParser) { addTagset(depTags, writeDependency); } } - else if (gsf != null && UniversalEnglishGrammaticalStructureFactory.class.equals(gsf.getClass())) { + else if (gsf != null + && UniversalEnglishGrammaticalStructureFactory.class.equals(gsf.getClass())) { SingletonTagset depTags = new SingletonTagset(Dependency.class, "universal"); for (GrammaticalRelation r : UniversalEnglishGrammaticalRelations.values()) { depTags.add(r.getShortName()); diff --git a/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpPosTagger.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpPosTagger.java new file mode 100644 index 0000000000..91fe9178c6 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpPosTagger.java @@ -0,0 +1,259 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.corenlp; + +import static org.apache.uima.util.Level.INFO; + +import java.io.IOException; +import java.net.URL; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.corenlp.internal.CoreNlp2DKPro; +import org.dkpro.core.corenlp.internal.DKPro2CoreNlp; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import edu.stanford.nlp.parser.lexparser.Lexicon; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.POSTaggerAnnotator; +import edu.stanford.nlp.process.PTBEscapingProcessor; +import edu.stanford.nlp.tagger.maxent.MaxentTagger; +import edu.stanford.nlp.util.StringUtils; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Part-of-speech tagger from CoreNLP. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "CoreNLP POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) +public class CoreNlpPosTagger + extends JCasAnnotator_ImplBase +{ + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + private boolean printTagSet; + + /** + * Use this language instead of the document language to resolve the model and tag set mapping. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + /** + * Variant of a model the model. Used to address a specific model if here are multiple models + * for one language. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + private String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving mechanism + * and directly address a particular model. + * + *

+ * The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set the + * variant parameter to match the artifact. If the artifact contains the model in a non-default + * location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}. + *

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Location from which the model is read. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + private String modelLocation; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) + private String modelEncoding; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + private String posMappingLocation; + + /** + * Maximum sentence length. Longer sentences are skipped. + */ + public static final String PARAM_MAX_SENTENCE_LENGTH = + ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; + @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = true, defaultValue = "2147483647") + private int maxSentenceLength; + + /** + * Number of parallel threads to use. + */ + public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; + @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, + defaultValue = ComponentParameters.AUTO_NUM_THREADS) + private int numThreads; + + /** + * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-). + * + * @see PTBEscapingProcessor + */ + public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping"; + @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true") + private boolean ptb3Escaping; + + /** + * List of extra token texts (usually single character strings) that should be treated like + * opening quotes and escaped accordingly before being sent to the parser. + */ + public static final String PARAM_QUOTE_BEGIN = "quoteBegin"; + @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false) + private List quoteBegin; + + /** + * List of extra token texts (usually single character strings) that should be treated like + * closing quotes and escaped accordingly before being sent to the parser. + */ + public static final String PARAM_QUOTE_END = "quoteEnd"; + @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false) + private List quoteEnd; + + private CasConfigurableProviderBase annotatorProvider; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + annotatorProvider = new CoreNlpPosTaggerModelProvider(this); + + mappingProvider = MappingProviderFactory.createPosMappingProvider(this, posMappingLocation, + language, annotatorProvider); + + numThreads = ComponentParameters.computeNumThreads(numThreads); + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + annotatorProvider.configure(cas); + mappingProvider.configure(cas); + + // Transfer from CAS to CoreNLP + DKPro2CoreNlp converter = new DKPro2CoreNlp(); + converter.setPtb3Escaping(ptb3Escaping); + converter.setQuoteBegin(quoteBegin); + converter.setQuoteEnd(quoteEnd); + converter.setEncoding(modelEncoding); + + Annotation document = new Annotation((String) null); + converter.convert(aJCas, document); + + // Actual processing + annotatorProvider.getResource().annotate(document); + + // Transfer back into the CAS + CoreNlp2DKPro.convertPOSs(aJCas, document, mappingProvider); + } + + private class CoreNlpPosTaggerModelProvider + extends ModelProviderBase + { + public CoreNlpPosTaggerModelProvider(Object aObject) + { + super(aObject, "stanfordnlp", "tagger"); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/tagger-${language}-${variant}.properties"); + } + + @Override + protected POSTaggerAnnotator produceResource(URL aUrl) throws IOException + { + String modelFile = aUrl.toString(); + + // Loading gzipped files from URL is broken in CoreNLP + // https://github.com/stanfordnlp/CoreNLP/issues/94 + if (modelFile.startsWith("jar:") && modelFile.endsWith(".gz")) { + modelFile = org.apache.commons.lang3.StringUtils.substringAfter(modelFile, "!/"); + } + + MaxentTagger tagger = new MaxentTagger(modelFile, + StringUtils.argsToProperties("-model", modelFile), false); + + SingletonTagset tags = new SingletonTagset(POS.class, + getResourceMetaData().getProperty(("pos.tagset"))); + tags.addAll(tagger.tagSet()); + tags.remove(Lexicon.BOUNDARY_TAG); + addTagset(tags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + POSTaggerAnnotator annotator = new POSTaggerAnnotator(tagger, maxSentenceLength, + numThreads); + + return annotator; + } + } +} diff --git a/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpSegmenter.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpSegmenter.java new file mode 100644 index 0000000000..2def8be69f --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/CoreNlpSegmenter.java @@ -0,0 +1,258 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.corenlp; + +import java.io.IOException; +import java.net.URL; +import java.util.Properties; +import java.util.Set; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.TokenizerAnnotator; +import edu.stanford.nlp.pipeline.TokenizerAnnotator.TokenizerType; +import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator; +import edu.stanford.nlp.process.WordToSentenceProcessor; +import edu.stanford.nlp.process.WordToSentenceProcessor.NewlineIsSentenceBreak; +import edu.stanford.nlp.util.CoreMap; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Tokenizer and sentence splitter using from Stanford CoreNLP. + */ +@ResourceMetaData(name = "CoreNLP Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) +public class CoreNlpSegmenter + extends SegmenterBase +{ + private boolean verbose; + + /** + * The set of boundary tokens. + * + * @see WordToSentenceProcessor#WordToSentenceProcessor + */ + public static final String PARAM_BOUNDARY_TOKEN_REGEX = "boundaryTokenRegex"; + @ConfigurationParameter(name = PARAM_BOUNDARY_TOKEN_REGEX, mandatory = false, + defaultValue = WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX) + private String boundaryTokenRegex; + + /** + * A TokensRegex multi-token pattern for finding boundaries. + */ + public static final String PARAM_BOUNDARY_MULTI_TOKEN_REGEX = "boundaryMultiTokenRegex"; + @ConfigurationParameter(name = PARAM_BOUNDARY_MULTI_TOKEN_REGEX, mandatory = false) + private String boundaryMultiTokenRegex; + + /** + * These are elements like "p" or "sent", which will be wrapped into regular expressions for + * approximate XML matching. They will be deleted in the output, and will always trigger a + * sentence boundary. + */ + public static final String PARAM_HTML_ELEMENTS_TO_DISCARD = "htmlElementsToDiscard"; + @ConfigurationParameter(name = PARAM_HTML_ELEMENTS_TO_DISCARD, mandatory = false) + private Set htmlElementsToDiscard; + + /** + * The set of regular expressions for sentence boundary tokens that should be discarded. + * + * @see WordToSentenceProcessor#DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD + */ + public static final String PARAM_BOUNDARIES_TO_DISCARD = "boundaryToDiscard"; + @ConfigurationParameter(name = PARAM_BOUNDARIES_TO_DISCARD, mandatory = false, defaultValue = { + "\n", "*NL*" }) + private Set boundaryToDiscard; + + /** + * Strategy for treating newlines as sentence breaks. + */ + public static final String PARAM_NEWLINE_IS_SENTENCE_BREAK = "newlineIsSentenceBreak"; + @ConfigurationParameter(name = PARAM_NEWLINE_IS_SENTENCE_BREAK, mandatory = false, defaultValue = "two") + private String newlineIsSentenceBreak; + + /** + * The set of regular expressions for sentence boundary tokens that should be discarded. + */ + public static final String PARAM_TOKEN_REGEXES_TO_DISCARD = "tokenRegexesToDiscard"; + @ConfigurationParameter(name = PARAM_TOKEN_REGEXES_TO_DISCARD, mandatory = false, + defaultValue = {}) + private Set tokenRegexesToDiscard; + + /** + * Variant of a model the model. Used to address a specific model if here are multiple models + * for one language. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + private String variant; + + /** + * Location from which the model is read. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + private String modelLocation; + + /** + * Additional options that should be passed to the tokenizers. + */ + public static final String PARAM_TOKENIZATION_OPTIONS = "tokenizationOption"; + @ConfigurationParameter(name = PARAM_TOKENIZATION_OPTIONS, mandatory = false) + private String options; + + private ModelProviderBase sentenceAnnotator; + private ModelProviderBase tokenizerAnnotator; + private boolean useCoreLabelWord = false; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + tokenizerAnnotator = new ModelProviderBase(this, "corenlp", "tokenizer") + { + { + setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-${language}-${variant}.properties"); + } + + @Override + protected TokenizerAnnotator produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + + Properties coreNlpProps = new Properties(); + coreNlpProps.setProperty("tokenize.language", props.getProperty(LANGUAGE)); + //coreNlpProps.setProperty("tokenize.class", null); + //coreNlpProps.setProperty("tokenize.whitespace", "false"); + //coreNlpProps.setProperty("tokenize.keepeol", "false"); + + if (options == null) { + options = TokenizerType.getTokenizerType(coreNlpProps).getDefaultOptions(); + } + + if (options.contains("splitAll=true")) { + useCoreLabelWord = true; + } + + NewlineIsSentenceBreak breakNL = + WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak); + if (NewlineIsSentenceBreak.ALWAYS == breakNL || + NewlineIsSentenceBreak.TWO_CONSECUTIVE == breakNL) { + options = "tokenizeNLs=true," + options; + } + + coreNlpProps.setProperty("tokenize.options", options); + + if (aUrl != null) { + String modelFile = aUrl.toString(); + + // Loading gzipped files from URL is broken in CoreNLP + // https://github.com/stanfordnlp/CoreNLP/issues/94 + if (modelFile.startsWith("jar:") && modelFile.endsWith(".gz")) { + modelFile = org.apache.commons.lang3.StringUtils.substringAfter(modelFile, "!/"); + } + + coreNlpProps.setProperty("segment.model", modelFile); + } + + String extraOptions = null; + + TokenizerAnnotator annotator = new TokenizerAnnotator(verbose, coreNlpProps, + extraOptions); + + return annotator; + } + }; + + sentenceAnnotator = new ModelProviderBase(this, "corenlp", "sentence") + { + { + setDefault(LOCATION, NOT_REQUIRED); + } + + @Override + protected WordsToSentencesAnnotator produceResource(URL aUrl) + throws IOException + { + WordsToSentencesAnnotator annotator = new WordsToSentencesAnnotator(verbose, + boundaryTokenRegex, boundaryToDiscard, htmlElementsToDiscard, + newlineIsSentenceBreak, boundaryMultiTokenRegex, tokenRegexesToDiscard); + + return annotator; + } + }; + } + + @Override + protected void process(JCas aJCas, String aText, int aZoneBegin) + throws AnalysisEngineProcessException + { + Annotation document = new Annotation(aText); + + if (isWriteToken()) { + tokenizerAnnotator.configure(aJCas.getCas()); + tokenizerAnnotator.getResource().annotate(document); + + for (CoreLabel token : document.get(CoreAnnotations.TokensAnnotation.class)) { + //useCoreLabelWord to be set to true when allowing clitics in the language + if (useCoreLabelWord) { + createToken(aJCas, + token.word(), + token.get(CharacterOffsetBeginAnnotation.class) + aZoneBegin, + token.get(CharacterOffsetEndAnnotation.class) + aZoneBegin); + } else { + createToken(aJCas, + token.get(CharacterOffsetBeginAnnotation.class) + aZoneBegin, + token.get(CharacterOffsetEndAnnotation.class) + aZoneBegin); + } + } + } + + if (isWriteSentence()) { + sentenceAnnotator.configure(aJCas.getCas()); + sentenceAnnotator.getResource().annotate(document); + + for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) { + createSentence(aJCas, + sentence.get(CharacterOffsetBeginAnnotation.class) + aZoneBegin, + sentence.get(CharacterOffsetEndAnnotation.class) + aZoneBegin); + } + } + } +} diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/CoreNlp2DKPro.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/CoreNlp2DKPro.java similarity index 91% rename from dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/CoreNlp2DKPro.java rename to dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/CoreNlp2DKPro.java index d551e0105d..d20e07f783 100644 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/CoreNlp2DKPro.java +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/CoreNlp2DKPro.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp.internal; +package org.dkpro.core.corenlp.internal; import java.util.ArrayList; import java.util.List; @@ -26,13 +26,13 @@ import org.apache.uima.cas.Type; import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; @@ -67,7 +67,7 @@ public class CoreNlp2DKPro { public static void convertPOSs(JCas aJCas, Annotation document, - MappingProvider mappingProvider, boolean internStrings) + MappingProvider mappingProvider) { for (CoreMap s : document.get(SentencesAnnotation.class)) { for (CoreLabel t : s.get(TokensAnnotation.class)) { @@ -76,7 +76,7 @@ public static void convertPOSs(JCas aJCas, Annotation document, Type tagType = mappingProvider.getTagType(tag); POS anno = (POS) aJCas.getCas().createAnnotation(tagType, token.getBegin(), token.getEnd()); - anno.setPosValue(internStrings ? tag.intern() : tag); + anno.setPosValue(tag != null ? tag.intern() : null); POSUtils.assignCoarseValue(anno); anno.addToIndexes(); token.setPos(anno); @@ -85,7 +85,7 @@ public static void convertPOSs(JCas aJCas, Annotation document, } public static void convertNamedEntities(JCas aJCas, Annotation document, - MappingProvider mappingProvider, boolean internStrings) + MappingProvider mappingProvider) { for (CoreMap s : document.get(SentencesAnnotation.class)) { for (CoreLabel t : s.get(TokensAnnotation.class)) { @@ -100,7 +100,7 @@ public static void convertNamedEntities(JCas aJCas, Annotation document, Type tagType = mappingProvider.getTagType(tag); NamedEntity anno = (NamedEntity) aJCas.getCas().createAnnotation(tagType, token.getBegin(), token.getEnd()); - anno.setValue(internStrings ? tag.intern() : tag); + anno.setValue(tag != null ? tag.intern() : null); anno.addToIndexes(); } } @@ -121,7 +121,7 @@ public static void convertLemmas(JCas aJCas, Annotation document) } public static void convertDependencies(JCas aJCas, Annotation document, - MappingProvider mappingProvider, boolean internStrings) + MappingProvider mappingProvider) { for (CoreMap s : document.get(SentencesAnnotation.class)) { SemanticGraph graph = s.get(CollapsedDependenciesAnnotation.class); @@ -163,7 +163,7 @@ public static void convertDependencies(JCas aJCas, Annotation document, Type depRel = mappingProvider.getTagType(labelUsedForMapping); Dependency dep = (Dependency) aJCas.getCas().createFS(depRel); - dep.setDependencyType(internStrings ? actualLabel.intern() : actualLabel); + dep.setDependencyType(actualLabel != null ? actualLabel.intern() : null); dep.setDependent(dependent); dep.setGovernor(governor); dep.setBegin(dep.getDependent().getBegin()); @@ -175,22 +175,21 @@ public static void convertDependencies(JCas aJCas, Annotation document, } public static void convertConstituents(JCas aJCas, Annotation aDocument, - MappingProvider aMappingProvider, boolean aInternStrings, - TreebankLanguagePack aTreebankLanguagePack) + MappingProvider aMappingProvider, TreebankLanguagePack aTreebankLanguagePack) { for (CoreMap s : aDocument.get(SentencesAnnotation.class)) { Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); tree.setSpans(); List tokens = s.get(TokensAnnotation.class); - convertConstituentTreeNode(aJCas, aTreebankLanguagePack, tree, null, aInternStrings, - aMappingProvider, tokens); + convertConstituentTreeNode(aJCas, aTreebankLanguagePack, tree, null, aMappingProvider, + tokens); } } private static org.apache.uima.jcas.tcas.Annotation convertConstituentTreeNode(JCas aJCas, TreebankLanguagePack aTreebankLanguagePack, Tree aNode, - org.apache.uima.jcas.tcas.Annotation aParentFS, boolean internStrings, + org.apache.uima.jcas.tcas.Annotation aParentFS, MappingProvider constituentMappingProvider, List tokens) { // Get node label @@ -215,17 +214,16 @@ private static org.apache.uima.jcas.tcas.Annotation convertConstituentTreeNode(J Constituent constituent = (Constituent) aJCas.getCas().createAnnotation(constType, begin, end); - constituent.setConstituentType(internStrings ? nodeLabelValue.intern() : - nodeLabelValue); - constituent.setSyntacticFunction(internStrings && syntacticFunction != null ? - syntacticFunction.intern() : syntacticFunction); + constituent.setConstituentType(nodeLabelValue != null ? nodeLabelValue.intern() : null); + constituent.setSyntacticFunction( + syntacticFunction != null ? syntacticFunction.intern() : null); constituent.setParent(aParentFS); // Do we have any children? List childAnnotations = new ArrayList<>(); for (Tree child : aNode.getChildrenAsList()) { org.apache.uima.jcas.tcas.Annotation childAnnotation = convertConstituentTreeNode( - aJCas, aTreebankLanguagePack, child, constituent, internStrings, + aJCas, aTreebankLanguagePack, child, constituent, constituentMappingProvider, tokens); if (childAnnotation != null) { childAnnotations.add(childAnnotation); diff --git a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/DKPro2CoreNlp.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/DKPro2CoreNlp.java similarity index 89% rename from dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/DKPro2CoreNlp.java rename to dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/DKPro2CoreNlp.java index 45b83c52c3..7c5e7fab79 100644 --- a/dkpro-core-corenlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/corenlp/internal/DKPro2CoreNlp.java +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/DKPro2CoreNlp.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,14 +14,14 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp.internal; +package org.dkpro.core.corenlp.internal; -import static org.apache.uima.fit.util.JCasUtil.selectPreceding; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.selectFollowing; +import static org.apache.uima.fit.util.JCasUtil.selectPreceding; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; @@ -48,6 +48,8 @@ import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.CoarseNamedEntityTagAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.FineGrainedNamedEntityTagAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.IndexAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation; @@ -64,6 +66,7 @@ import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.process.PTBEscapingProcessor; import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation; import edu.stanford.nlp.trees.GrammaticalRelation; import edu.stanford.nlp.trees.LabeledScoredTreeFactory; @@ -135,6 +138,8 @@ public void setQuoteEnd(List aQuoteEnd) public Annotation convert(JCas aSource, Annotation aTarget) { + List allTokens = new ArrayList<>(); + // Document annotation aTarget.set(CoreAnnotations.TextAnnotation.class, aSource.getDocumentText()); @@ -161,7 +166,7 @@ public Annotation convert(JCas aSource, Annotation aTarget) for (Token t : selectCovered(Token.class, s)) { String tokenText = t.getText(); if (encoding != null && !"UTF-8".equals(encoding.name())) { - tokenText = new String(tokenText.getBytes(StandardCharsets.UTF_8), encoding); + tokenText = new String(tokenText.getBytes(StandardCharsets.UTF_8), encoding); } CoreLabel token = tokenFactory.makeToken(tokenText, t.getBegin(), @@ -196,9 +201,13 @@ public Annotation convert(JCas aSource, Annotation aTarget) List nes = selectCovered(NamedEntity.class, t); if (nes.size() > 0) { token.set(NamedEntityTagAnnotation.class, nes.get(0).getValue()); + token.set(FineGrainedNamedEntityTagAnnotation.class, nes.get(0).getValue()); + token.set(CoarseNamedEntityTagAnnotation.class, nes.get(0).getValue()); } else { token.set(NamedEntityTagAnnotation.class, "O"); + token.set(FineGrainedNamedEntityTagAnnotation.class, "O"); + token.set(CoarseNamedEntityTagAnnotation.class, "O"); } } @@ -210,17 +219,25 @@ public Annotation convert(JCas aSource, Annotation aTarget) } // Dependencies - List dependencies = new ArrayList<>(); + List basicDependencies = new ArrayList<>(); + List enhancedDependencies = new ArrayList<>(); for (Dependency d : selectCovered(Dependency.class, s)) { TypedDependency dep = new TypedDependency( GrammaticalRelation.valueOf(d.getDependencyType()), idxTokens.get(d.getGovernor()), idxTokens.get(d.getDependent())); - if (DependencyFlavor.ENHANCED.equals(d.getFlavor())) { + + if (d.getFlavor() == null || DependencyFlavor.BASIC.equals(d.getFlavor())) { + basicDependencies.add(dep); + } + else if (DependencyFlavor.ENHANCED.equals(d.getFlavor())) { dep.setExtra(); + basicDependencies.add(dep); + enhancedDependencies.add(dep); } - dependencies.add(dep); } - sentence.set(EnhancedDependenciesAnnotation.class, new SemanticGraph(dependencies)); + sentence.set(BasicDependenciesAnnotation.class, new SemanticGraph(basicDependencies)); + sentence.set(EnhancedDependenciesAnnotation.class, + new SemanticGraph(enhancedDependencies)); if (ptb3Escaping) { tokens = applyPtbEscaping(tokens, quoteBegin, quoteEnd); @@ -228,8 +245,12 @@ public Annotation convert(JCas aSource, Annotation aTarget) sentence.set(TokensAnnotation.class, tokens); sentences.add(sentence); + + allTokens.addAll(tokens); } + aTarget.set(SentencesAnnotation.class, sentences); + aTarget.set(TokensAnnotation.class, allTokens); return aTarget; } diff --git a/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/TokenKey.java b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/TokenKey.java new file mode 100644 index 0000000000..9357ce28b5 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/java/org/dkpro/core/corenlp/internal/TokenKey.java @@ -0,0 +1,27 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.corenlp.internal; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.stanford.nlp.util.TypesafeMap.Key; + +public class TokenKey + implements Key +{ +} diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-default-variants.map b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-default-variants.map deleted file mode 100644 index 22fa925b02..0000000000 --- a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-default-variants.map +++ /dev/null @@ -1,3 +0,0 @@ -de=dewac_175m_600.crf -en=all.3class.distsim.crf -es=ancora.distsim.s512.crf diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tagger-default-variants.map b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tagger-default-variants.map deleted file mode 100644 index 3942bc99a7..0000000000 --- a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tagger-default-variants.map +++ /dev/null @@ -1,6 +0,0 @@ -ar=accurate -de=fast -fr=default -en=bidirectional-distsim -es=distsim -zh=distsim diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-de-none.properties b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-de-none.properties new file mode 100644 index 0000000000..e80254a372 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-de-none.properties @@ -0,0 +1 @@ +location=-=* NOT REQUIRED *=- diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-en-none.properties b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-en-none.properties new file mode 100644 index 0000000000..e80254a372 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-en-none.properties @@ -0,0 +1 @@ +location=-=* NOT REQUIRED *=- diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-es-none.properties b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-es-none.properties new file mode 100644 index 0000000000..e80254a372 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-es-none.properties @@ -0,0 +1 @@ +location=-=* NOT REQUIRED *=- diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-fr-none.properties b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-fr-none.properties new file mode 100644 index 0000000000..e80254a372 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/tokenizer-fr-none.properties @@ -0,0 +1 @@ +location=-=* NOT REQUIRED *=- diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/depparser-default-variants.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/depparser-default-variants.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/depparser-default-variants.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/depparser-default-variants.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-de-dewac_175m_600.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-de-dewac_175m_600.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-de-dewac_175m_600.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-de-dewac_175m_600.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-de-germeval2014.hgc_175m_600.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-de-germeval2014.hgc_175m_600.crf.map new file mode 100644 index 0000000000..e7c53d13de --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-de-germeval2014.hgc_175m_600.crf.map @@ -0,0 +1,5 @@ +PERSON=de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person +LOCATION=de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location +ORGANIZATION=de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization +#MISC +*=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-de-hgc_175m_600.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-de-hgc_175m_600.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-de-hgc_175m_600.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-de-hgc_175m_600.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-default-variants.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-default-variants.map new file mode 100644 index 0000000000..1372b1e156 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-default-variants.map @@ -0,0 +1,3 @@ +de=germeval2014.hgc_175m_600.crf +en=all.3class.distsim.crf +es=ancora.distsim.s512.crf diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-all.3class.caseless.distsim.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-all.3class.caseless.distsim.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-all.3class.caseless.distsim.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-all.3class.caseless.distsim.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-all.3class.distsim.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-all.3class.distsim.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-all.3class.distsim.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-all.3class.distsim.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-conll.4class.caseless.distsim.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-conll.4class.caseless.distsim.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-conll.4class.caseless.distsim.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-conll.4class.caseless.distsim.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-conll.4class.distsim.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-conll.4class.distsim.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-conll.4class.distsim.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-conll.4class.distsim.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-muc.7class.caseless.distsim.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-muc.7class.caseless.distsim.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-muc.7class.caseless.distsim.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-muc.7class.caseless.distsim.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-muc.7class.distsim.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-muc.7class.distsim.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-muc.7class.distsim.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-muc.7class.distsim.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-nowiki.3class.caseless.distsim.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-nowiki.3class.caseless.distsim.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-en-nowiki.3class.caseless.distsim.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-en-nowiki.3class.caseless.distsim.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-es-ancora.distsim.s512.crf.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-es-ancora.distsim.s512.crf.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-es-ancora.distsim.s512.crf.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/ner-es-ancora.distsim.s512.crf.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/parser-default-variants.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/parser-default-variants.map similarity index 100% rename from dkpro-core-corenlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/corenlp/lib/parser-default-variants.map rename to dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/parser-default-variants.map diff --git a/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/tagger-default-variants.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/tagger-default-variants.map new file mode 100644 index 0000000000..aa4c942157 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/tagger-default-variants.map @@ -0,0 +1,6 @@ +ar=default +de=fast +fr=default +en=bidirectional-distsim +es=distsim +zh=distsim diff --git a/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/tokenizer-default-variants.map b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/tokenizer-default-variants.map new file mode 100644 index 0000000000..71bed9af20 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/main/resources/org/dkpro/core/corenlp/lib/tokenizer-default-variants.map @@ -0,0 +1,5 @@ +ar=atb-bn-arztrain +de=none +en=none +es=none +fr=none diff --git a/dkpro-core-corenlp-gpl/src/scripts/build.xml b/dkpro-core-corenlp-gpl/src/scripts/build.xml index 87a4a46862..fa239479a0 100644 --- a/dkpro-core-corenlp-gpl/src/scripts/build.xml +++ b/dkpro-core-corenlp-gpl/src/scripts/build.xml @@ -1,6 +1,6 @@ - + - - - - - - - - - - + + + + + + + + + + - - + + - - - - + + + + - - - - + + + + - - - + + + - - - - - + + + + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - + + + + + + + + + + + + + - - - - - - - - - - - - - + + + + + + + + + + + + + - - - - - - - - - - - - + + + + + + + + + + + + - - - - - - - - - - - - - + + + + + + + + + + + + + - - - - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + - - - - - - - - - + + + - - - - - + + + + + + + + + + + - - - + + + + - - - - - + + + + + - - - + + + - - - - - - - - - + + + + + - - - - - - - - - + + + - - - - - - - - ================================ - == IMPORTANT LICENSE REMINDER == - ================================ - - YOU MAY NOT BE ALLOWED TO REDISTRIBUTE THESE JARS WITHOUT EXPLICIT PERMISSION - - from their respective original creators. Please make sure to check the licenses. - - For Maven users: - - Use the build target "local-maven" (ant local-maven) to automatically install - the jars into your local Maven repository at ~/.m2/repository. - - If you choose to upload these JARs to a PRIVATE repository or install them in - your local repository manually, use the following groupId and artifactId scheme: - - groupId: de.tudarmstadt.ukp.dkpro.core - artifactId: de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-<tool>-<lang>-<variant> - version: <use the timestamp encoded in the JAR file name> - - Example: - - groupId: de.tudarmstadt.ukp.dkpro.core - artifactId: de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-pcfg - version: 1.0.2 - - If you have done that, you may enable the Maven profile "use-proprietary-resources" - to enable additional test cases when building the DKPro Stanford NLP Integration. - - YOU MAY NOT BE ALLOWED TO REDISTRIBUTE THESE JARS WITHOUT EXPLICIT PERMISSION - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ================================ + == IMPORTANT LICENSE REMINDER == + ================================ + + YOU MAY NOT BE ALLOWED TO REDISTRIBUTE THESE JARS WITHOUT EXPLICIT PERMISSION + + from their respective original creators. Please make sure to check the licenses. + + For Maven users: + + Use the build target "local-maven" (ant local-maven) to automatically install + the jars into your local Maven repository at ~/.m2/repository. + + If you choose to upload these JARs to a PRIVATE repository or install them in + your local repository manually, use the following groupId and artifactId scheme: + + groupId: de.tudarmstadt.ukp.dkpro.core + artifactId: de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-<tool>-<lang>-<variant> + version: <use the timestamp encoded in the JAR file name> + + Example: + + groupId: de.tudarmstadt.ukp.dkpro.core + artifactId: de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-pcfg + version: 1.0.2 + + If you have done that, you may enable the Maven profile "use-proprietary-resources" + to enable additional test cases when building the DKPro Stanford NLP Integration. + + YOU MAY NOT BE ALLOWED TO REDISTRIBUTE THESE JARS WITHOUT EXPLICIT PERMISSION + \ No newline at end of file diff --git a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpCoreferenceResolverTest.java b/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpCoreferenceResolverTest.java deleted file mode 100644 index c8faad6ecf..0000000000 --- a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpCoreferenceResolverTest.java +++ /dev/null @@ -1,175 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import edu.stanford.nlp.dcoref.Constants; - -public class CoreNlpCoreferenceResolverTest -{ - @Test - public void test() - throws Exception - { - JCas jcas = runTest("en", "John bought a car. He is very happy with it."); - - String[][] ref = { - { "John", "He" }, - { "a car", "it" } }; - - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - } - - @Test - public void testDictionarySieve() - throws Exception - { - JCas jcas = runTest("en", "John joined Google in 2012. He is doing research for the company.", - Constants.SIEVEPASSES + ",CorefDictionaryMatch"); - - String[][] ref = new String[][] { - { "John", "He" }, - { "Google", "the company" }, - { "2012" } }; - - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - } - - @Test - public void testTriggerReparse() - throws Exception - { - JCas jcas = runTest("en", "'Let's go! I want to see the Don', he said."); - - String[][] ref = { - { "'Let's go" }, - { "'Let's" }, - { "I" }, - { "the Don'", "he" } }; - - String[] pennTree = { - "(ROOT (FRAG (NP (NP ('' ') (NNP Let) (POS 's)) (NN go)) (. !)))", - "(ROOT (S (S (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB see) (NP (DT the) " - + "(NX (NNP Don) (POS ')))))))) (, ,) (NP (PRP he)) (VP (VBD said)) (. .)))" - }; - - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - AssertAnnotations.assertPennTree(pennTree, select(jcas, PennTree.class)); - } - - @Test - @Ignore("Disabled due to side effects on parser unit tests. See issue 175") - public void testTriggerReparse1() - throws Exception - { - JCas jcas = runTest("en", - "Other major domestic initiatives in his presidency include the Patient Protection and " + - "Affordable Care Act, often referred to as \"Obamacare\"; the Dodd–Frank Wall Street Reform and " + - "Consumer Protection Act; the Don't Ask, Don't Tell Repeal Act of 2010; the Budget Control " + - "Act of 2011; and the American Taxpayer Relief Act of 2012."); - - String[][] ref = { - { "Other major domestic initiatives in his presidency" }, - { "his presidency" }, - { "his" }, - { "the Patient Protection and Affordable Care Act, often referred to as \"Obamacare\"; the Dodd–Frank Wall Street Reform and Consumer Protection Act; the Don't Ask" }, - { "the Patient Protection and Affordable Care Act" }, - { "the Patient Protection" }, - { "Affordable Care Act" }, - { "\"Obamacare\"; the Dodd–Frank Wall Street Reform and Consumer Protection Act;" }, - { "the Dodd" }, - { "Frank Wall Street Reform and Consumer Protection Act" }, - { "Frank Wall Street Reform" }, - { "Consumer Protection Act" }, - { "Repeal Act of 2010; the Budget Control Act of 2011; and the American Taxpayer Relief Act of 2012" }, - { "2010" }, - { "the Budget Control Act of 2011" }, - { "the American Taxpayer Relief Act of 2012" }, - { "2011" }, - { "2012" } }; - - String[] pennTree = { - "(ROOT (S (NP (NP (JJ Other) (JJ major) (JJ domestic) (NNS initiatives)) (PP (IN in) " - + "(NP (PRP$ his) (NN presidency)))) (VP (VBP include) (SBAR (S (NP (NP (DT the) " - + "(NNP Patient) (NNP Protection) (CC and) (NNP Affordable) (NNP Care) (NNP Act)) " - + "(, ,) (VP (ADVP (RB often)) (VBN referred) (PP (TO to) (SBAR (IN as) (S (NP " - + "(`` \") (NP (NNP Obamacare)) ('' \") (PRN (: ;) (S (NP (DT the) (NNP Dodd)) (VP " - + "(VBP –) (NP (NP (NNP Frank) (NNP Wall) (NNP Street) (NNP Reform)) (CC and) (NP " - + "(NNP Consumer) (NNP Protection) (NNP Act))))) (: ;))) (DT the) (VP (VBP Do) " - + "(RB n't) (VP (VB Ask))))))) (, ,)) (VP (VBP Do) (RB n't) (VP (VB Tell) (NP (NP " - + "(NP (NN Repeal) (NNP Act)) (PP (IN of) (NP (CD 2010)))) (: ;) (NP (NP (DT the) " - + "(NNP Budget) (NNP Control) (NNP Act)) (PP (IN of) (NP (CD 2011)))) (: ;) " - + "(CC and) (NP (NP (DT the) (NNP American) (NNP Taxpayer) (NNP Relief) (NNP Act)) " - + "(PP (IN of) (NP (CD 2012)))))))))) (. .)))" - }; - - AssertAnnotations.assertPennTree(pennTree, select(jcas, PennTree.class)); - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - } - - private JCas runTest(String aLanguage, String aText) - throws Exception - { - return runTest(aLanguage, aText, Constants.SIEVEPASSES); - } - - - private JCas runTest(String aLanguage, String aText, String aSieves) - throws Exception - { - AssumeResource.assumeResource(CoreNlpCoreferenceResolver.class, - "de/tudarmstadt/ukp/dkpro/core/stanfordnlp", "coref", aLanguage, "default"); - - // Coreference resolution requires the parser and the NER to run before - AnalysisEngine engine = createEngine(createEngineDescription( - createEngineDescription(CoreNlpSegmenter.class), - createEngineDescription(CoreNlpPosTagger.class), - createEngineDescription(CoreNlpLemmatizer.class), - createEngineDescription(CoreNlpParser.class, - CoreNlpParser.PARAM_WRITE_CONSTITUENT, true, - CoreNlpParser.PARAM_WRITE_DEPENDENCY, true, - CoreNlpParser.PARAM_WRITE_PENN_TREE, true), - createEngineDescription(CoreNlpNamedEntityRecognizer.class), - createEngineDescription(CoreNlpCoreferenceResolver.class, - CoreNlpCoreferenceResolver.PARAM_SIEVES, aSieves))); - - // Set up a simple example - JCas jcas = engine.newJCas(); - jcas.setDocumentLanguage(aLanguage); - jcas.setDocumentText(aText); - engine.process(jcas); - - return jcas; - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpLemmatizerTest.java b/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpLemmatizerTest.java deleted file mode 100644 index e8984d684c..0000000000 --- a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpLemmatizerTest.java +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class CoreNlpLemmatizerTest -{ - @Test - public void testUnderscore() throws Exception - { - runTest("en", "foo _ bar", - new String[] { "foo", "_", "bar" }); - } - - @Test - public void testEnglish() throws Exception - { - runTest("en", "This is a test .", - new String[] { "this", "be", "a", "test", "." }); - - runTest("en", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible .", - new String[] { "we", "need", "a", "very", "complicated", "example", - "sentence", ",", "which", "contain", "as", "many", "constituent", "and", - "dependency", "as", "possible", "." }); - } - - @Test(expected = AnalysisEngineProcessException.class) - public void testNotEnglish() - throws Exception - { - runTest("de", "Das ist ein test .", new String[] {} ); - } - - @Test - public void testUrl() throws Exception - { - runTest("en", "Details hinzu findet man unter http://www.armytimes.com/news/2009/11/army_M4_112109w/ .", - new String[] { "detail", "hinzu", "findet", "man", "unter", "http://www.armytimes.com/news/2009/11/army_m4_112109w/", "." }); - - } - - private void runTest(String aLanguage, String testDocument, String[] lemmas) - throws Exception - { - AnalysisEngineDescription posTagger = createEngineDescription(CoreNlpPosTagger.class); - AnalysisEngineDescription lemmatizer = createEngineDescription(CoreNlpLemmatizer.class); - - JCas aJCas = TestRunner.runTest(createEngineDescription(posTagger, lemmatizer), - aLanguage, testDocument); - - AssertAnnotations.assertLemma(lemmas, select(aJCas, Lemma.class)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpNamedEntityRecognizerTest.java b/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpNamedEntityRecognizerTest.java deleted file mode 100644 index 8675f6d489..0000000000 --- a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpNamedEntityRecognizerTest.java +++ /dev/null @@ -1,224 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -/** - */ -public class CoreNlpNamedEntityRecognizerTest -{ - @Test - public void testEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", null, "IBM where John works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 14]Person(PERSON) (John)", - "[ 27, 34]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void test3classCaselessEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "all.3class.caseless.distsim.crf", "ibm where john works is in germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (ibm)", - "[ 10, 14]Person(PERSON) (john)", - "[ 27, 34]Location(LOCATION) (germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testNoWiki3classCaselessEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "nowiki.3class.caseless.distsim.crf", "ibm where john works is in germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (ibm)", - "[ 10, 14]Person(PERSON) (john)", - "[ 27, 34]Location(LOCATION) (germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - - @Test - public void test4classEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "conll.4class.distsim.crf", "IBM where John works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 14]Person(PERSON) (John)", - "[ 27, 34]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - - @Test - public void test4classCaselessEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "conll.4class.caseless.distsim.crf", "ibm where john works is in germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (ibm)", - "[ 10, 14]Person(PERSON) (john)", - "[ 27, 34]Location(LOCATION) (germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void test4classCaselessMixedEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "conll.4class.caseless.distsim.crf", "IBM where john works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 14]Person(PERSON) (john)", - "[ 27, 34]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void test7classEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "muc.7class.distsim.crf", "IBM where John works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 14]Person(PERSON) (John)", - "[ 27, 34]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testGerman() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("de", null, "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); - - String[] ne = { - "[ 0, 6]Person(I-PER) (Markus)", - "[ 35, 38]Organization(I-ORG) (SAP)", - "[ 42, 53]Location(I-LOC) (Deutschland)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testHgcGerman() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("de", "hgc_175m_600.crf", "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); - - String[] ne = { - "[ 0, 6]Person(I-PER) (Markus)", - "[ 35, 38]Organization(I-ORG) (SAP)", - "[ 42, 53]Location(I-LOC) (Deutschland)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testSpanish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("es", null, "Hace 10 años Markus trabaja en SAP en Alemania ."); - - String[] ne = { - "[ 13, 19]Person(PERS) (Markus)", - "[ 31, 34]Organization(ORG) (SAP)", - "[ 38, 46]Location(LUG) (Alemania)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test(expected = AnalysisEngineProcessException.class) - public void testMissingModel() throws Exception - { - runTest("xx", null, "Xec xena Xeo ."); - } - - private JCas runTest(String language, String variant, String testDocument) - throws Exception - { - AssumeResource.assumeResource(CoreNlpNamedEntityRecognizer.class, - "de/tudarmstadt/ukp/dkpro/core/stanfordnlp", "ner", language, variant); - - AnalysisEngine engine = createEngine(CoreNlpNamedEntityRecognizer.class, - CoreNlpNamedEntityRecognizer.PARAM_VARIANT, variant, - CoreNlpNamedEntityRecognizer.PARAM_PRINT_TAGSET, true); - - return TestRunner.runTest(engine, language, testDocument); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpSegmenterTest.java b/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpSegmenterTest.java deleted file mode 100644 index 3351b49fef..0000000000 --- a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpSegmenterTest.java +++ /dev/null @@ -1,87 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; - -public class CoreNlpSegmenterTest -{ - @Test - public void run() throws Throwable - { - AnalysisEngineDescription aed = createEngineDescription(CoreNlpSegmenter.class); - - SegmenterHarness.run(aed, "de.4", "en.9", "ar.1", "zh.1", "zh.2"); - } - - @Test - public void testEnglishSpeech() throws Exception - { - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage("en"); - jcas.setDocumentText("'Let's go! I want to see the Don', he said."); - - AnalysisEngine aed = createEngine(CoreNlpSegmenter.class); - aed.process(jcas); - - String[] tokens = { "'", "Let", "'s", "go", "!", "I", "want", "to", "see", "the", "Don", - "'", ",", "he", "said", "." }; - - AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); - } - - @Test - public void testSpanish() throws Exception - { - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage("es"); - jcas.setDocumentText("Tim dijo a Jamie para la 100ª vez que abandone la sala."); - - AnalysisEngine aed = createEngine(CoreNlpSegmenter.class); - aed.process(jcas); - - String[] tokens = { "Tim", "dijo", "a", "Jamie", "para", "la", "100ª", "vez", "que", - "abandone", "la", "sala", "." }; - - AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); - } - - @Test - public void testZoning() throws Exception - { - SegmenterHarness.testZoning(CoreNlpSegmenter.class); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpCoreferenceResolverTest.java b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpCoreferenceResolverTest.java new file mode 100644 index 0000000000..d80c5db3ce --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpCoreferenceResolverTest.java @@ -0,0 +1,188 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.corenlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.corenlp.CoreNlpCoreferenceResolver; +import org.dkpro.core.corenlp.CoreNlpLemmatizer; +import org.dkpro.core.corenlp.CoreNlpNamedEntityRecognizer; +import org.dkpro.core.corenlp.CoreNlpParser; +import org.dkpro.core.corenlp.CoreNlpPosTagger; +import org.dkpro.core.corenlp.CoreNlpSegmenter; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; +import edu.stanford.nlp.dcoref.Constants; + +public class CoreNlpCoreferenceResolverTest +{ + @Test + public void test() + throws Exception + { + JCas jcas = runTest("en", "John bought a car. He is very happy with it."); + + String[][] ref = { + { "John", "He" }, + { "a car", "it" } }; + + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + } + + @Test + public void testDictionarySieve() + throws Exception + { + JCas jcas = runTest("en", "John joined Google in 2012. He is doing research for the company.", + Constants.SIEVEPASSES + ",CorefDictionaryMatch"); + + String[][] ref = new String[][] { + { "John", "He" }, + { "Google", "the company" }, + { "2012" } }; + + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + } + + @Test + public void testTriggerReparse() + throws Exception + { + JCas jcas = runTest("en", "'Let's go! I want to see the Don', he said."); + + String[][] ref = { + { "'" }, + { "Let's go" }, + { "Let's" }, + { "I" }, + { "the Don'", "he" } }; + + String[] pennTree = { + "(ROOT (S (S (NP (POS ')) (NP (NP (NNP Let) (POS 's)) (NN go))) (. !)))", + "(ROOT (S (S (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB see) (NP (DT the) " + + "(NX (NNP Don) (POS ')))))))) (, ,) (NP (PRP he)) (VP (VBD said)) (. .)))" + }; + + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + AssertAnnotations.assertPennTree(pennTree, select(jcas, PennTree.class)); + } + + @Test + @Ignore("Disabled due to side effects on parser unit tests. See issue 175") + public void testTriggerReparse1() + throws Exception + { + JCas jcas = runTest("en", + "Other major domestic initiatives in his presidency include the Patient " + + "Protection and Affordable Care Act, often referred to as \"Obamacare\"; the " + + "Dodd–Frank Wall Street Reform and Consumer Protection Act; the Don't Ask, " + + "Don't Tell Repeal Act of 2010; the Budget Control Act of 2011; and the " + + "American Taxpayer Relief Act of 2012."); + + String[][] ref = { + { "Other major domestic initiatives in his presidency" }, + { "his presidency" }, + { "his" }, + { "the Patient Protection and Affordable Care Act, often referred to as " + + "\"Obamacare\"; the Dodd–Frank Wall Street Reform and Consumer " + + "Protection Act; the Don't Ask" }, + { "the Patient Protection and Affordable Care Act" }, + { "the Patient Protection" }, + { "Affordable Care Act" }, + { "\"Obamacare\"; the Dodd–Frank Wall Street Reform and Consumer Protection Act;" }, + { "the Dodd" }, + { "Frank Wall Street Reform and Consumer Protection Act" }, + { "Frank Wall Street Reform" }, + { "Consumer Protection Act" }, + { "Repeal Act of 2010; the Budget Control Act of 2011; and the American " + + "Taxpayer Relief Act of 2012" }, + { "2010" }, + { "the Budget Control Act of 2011" }, + { "the American Taxpayer Relief Act of 2012" }, + { "2011" }, + { "2012" } }; + + String[] pennTree = { + "(ROOT (S (NP (NP (JJ Other) (JJ major) (JJ domestic) (NNS initiatives)) (PP (IN in) " + + "(NP (PRP$ his) (NN presidency)))) (VP (VBP include) (SBAR (S (NP (NP (DT the) " + + "(NNP Patient) (NNP Protection) (CC and) (NNP Affordable) (NNP Care) (NNP Act)) " + + "(, ,) (VP (ADVP (RB often)) (VBN referred) (PP (TO to) (SBAR (IN as) (S (NP " + + "(`` \") (NP (NNP Obamacare)) ('' \") (PRN (: ;) (S (NP (DT the) (NNP Dodd)) (VP " + + "(VBP –) (NP (NP (NNP Frank) (NNP Wall) (NNP Street) (NNP Reform)) (CC and) (NP " + + "(NNP Consumer) (NNP Protection) (NNP Act))))) (: ;))) (DT the) (VP (VBP Do) " + + "(RB n't) (VP (VB Ask))))))) (, ,)) (VP (VBP Do) (RB n't) (VP (VB Tell) (NP (NP " + + "(NP (NN Repeal) (NNP Act)) (PP (IN of) (NP (CD 2010)))) (: ;) (NP (NP (DT the) " + + "(NNP Budget) (NNP Control) (NNP Act)) (PP (IN of) (NP (CD 2011)))) (: ;) " + + "(CC and) (NP (NP (DT the) (NNP American) (NNP Taxpayer) (NNP Relief) (NNP Act)) " + + "(PP (IN of) (NP (CD 2012)))))))))) (. .)))" + }; + + AssertAnnotations.assertPennTree(pennTree, select(jcas, PennTree.class)); + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + } + + private JCas runTest(String aLanguage, String aText) + throws Exception + { + return runTest(aLanguage, aText, Constants.SIEVEPASSES); + } + + + private JCas runTest(String aLanguage, String aText, String aSieves) + throws Exception + { + AssumeResource.assumeResource(CoreNlpCoreferenceResolver.class, + "de/tudarmstadt/ukp/dkpro/core/stanfordnlp", "coref", aLanguage, "default"); + + // Coreference resolution requires the parser and the NER to run before + AnalysisEngine engine = createEngine(createEngineDescription( + createEngineDescription(CoreNlpSegmenter.class), + createEngineDescription(CoreNlpPosTagger.class), + createEngineDescription(CoreNlpLemmatizer.class), + createEngineDescription(CoreNlpParser.class, + CoreNlpParser.PARAM_WRITE_CONSTITUENT, true, + CoreNlpParser.PARAM_WRITE_DEPENDENCY, true, + CoreNlpParser.PARAM_WRITE_PENN_TREE, true), + createEngineDescription(CoreNlpNamedEntityRecognizer.class), + createEngineDescription(CoreNlpCoreferenceResolver.class, + CoreNlpCoreferenceResolver.PARAM_SIEVES, aSieves))); + + // Set up a simple example + JCas jcas = engine.newJCas(); + jcas.setDocumentLanguage(aLanguage); + jcas.setDocumentText(aText); + engine.process(jcas); + + return jcas; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpDependencyParserTest.java b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpDependencyParserTest.java similarity index 77% rename from dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpDependencyParserTest.java rename to dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpDependencyParserTest.java index a83a9fc706..9f292b91a9 100644 --- a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpDependencyParserTest.java +++ b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpDependencyParserTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,25 +14,27 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; +package org.dkpro.core.corenlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertDependencies; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; import org.apache.commons.lang3.ArrayUtils; import org.apache.uima.fit.factory.AggregateBuilder; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class CoreNlpDependencyParserTest { @@ -95,13 +97,12 @@ public void testEnglishStanfordDependencies() String[] unmappedDep = {}; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, - jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, - "stanford341", STANFORD_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "stanford341", + STANFORD_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } @Test @@ -122,9 +123,9 @@ public void testEnglishUniversalDependencies() "[ 44, 45]PUNCT(punct,basic) D[44,45](,) G[35,43](sentence)", "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", "[ 52, 60]Dependency(acl:relcl,basic) D[52,60](contains) G[35,43](sentence)", - "[ 61, 63]PREP(case,basic) D[61,63](as) G[69,81](constituents)", + "[ 61, 63]DOBJ(dobj,basic) D[61,63](as) G[52,60](contains)", "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]Dependency(nmod:as,basic) D[69,81](constituents) G[52,60](contains)", + "[ 69, 81]DEP(dep,basic) D[69,81](constituents) G[61,63](as)", "[ 82, 85]CC(cc,basic) D[82,85](and) G[69,81](constituents)", "[ 86, 98]CONJ(conj:and,basic) D[86,98](dependencies) G[69,81](constituents)", "[ 99,101]PREP(case,basic) D[99,101](as) G[102,110](possible)", @@ -134,13 +135,12 @@ public void testEnglishUniversalDependencies() String[] unmappedDep = { "acl:relcl", "cc:preconj", "compound:prt", "det:predet", "nmod:npmod", "nmod:poss", "nmod:tmod" }; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, - jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", UNIVERSAL_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); + assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); } @Test @@ -172,13 +172,12 @@ public void testEnglishWsjSd() String[] unmappedDep = {}; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, - jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, - "stanford341", STANFORD_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "stanford341", + STANFORD_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } @@ -219,13 +218,11 @@ public void testEnglishWsjUd() String[] unmappedDep = { "acl:relcl", "cc:preconj", "compound:prt", "det:predet", "nmod:npmod", "nmod:poss", "nmod:tmod" }; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, - jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", - depTags, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", PTB_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", depTags, jcas); + assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); } @Test @@ -238,44 +235,43 @@ public void testFrenchUniversalDependencies() String[] dependencies = { "[ 0, 4]ROOT(root,basic) D[0,4](Nous) G[0,4](Nous)", - "[ 5, 10]NN(compound,basic) D[5,10](avons) G[0,4](Nous)", - "[ 11, 17]DEP(dep,basic) D[11,17](besoin) G[5,10](avons)", - "[ 18, 20]MWE(mwe,basic) D[18,20](d') G[11,17](besoin)", + "[ 5, 10]Dependency(nmod,basic) D[5,10](avons) G[0,4](Nous)", + "[ 11, 17]DOBJ(dobj,basic) D[11,17](besoin) G[5,10](avons)", + "[ 18, 20]PREP(case,basic) D[18,20](d') G[25,31](phrase)", "[ 21, 24]DET(det,basic) D[21,24](une) G[25,31](phrase)", - "[ 25, 31]PREP(case,basic) D[25,31](phrase) G[18,20](d')", - "[ 32, 35]PREP(case,basic) D[32,35](par) G[25,31](phrase)", - "[ 36, 43]PREP(case,basic) D[36,43](exemple) G[32,35](par)", + "[ 25, 31]Dependency(nmod:d',basic) D[25,31](phrase) G[11,17](besoin)", + "[ 32, 35]ADVMOD(advmod,basic) D[32,35](par) G[25,31](phrase)", + "[ 36, 43]MWE(mwe,basic) D[36,43](exemple) G[32,35](par)", "[ 44, 48]ADVMOD(advmod,basic) D[44,48](très) G[49,58](compliqué)", "[ 49, 58]AMOD(amod,basic) D[49,58](compliqué) G[36,43](exemple)", - "[ 59, 60]PUNCT(punct,basic) D[59,60](,) G[49,58](compliqué)", - "[ 61, 64]APPOS(appos,basic) D[61,64](qui) G[59,60](,)", - "[ 65, 73]APPOS(appos,basic) D[65,73](contient) G[61,64](qui)", + "[ 59, 60]DEP(dep,basic) D[59,60](,) G[49,58](compliqué)", + "[ 61, 64]NSUBJ(nsubj,basic) D[61,64](qui) G[65,73](contient)", + "[ 65, 73]Dependency(acl:relcl,basic) D[65,73](contient) G[59,60](,)", "[ 74, 77]DET(det,basic) D[74,77](des) G[78,90](constituants)", - "[ 78, 90]Dependency(nmod,basic) D[78,90](constituants) G[65,73](contient)", - "[ 91, 94]DEP(dep,basic) D[91,94](que) G[78,90](constituants)", + "[ 78, 90]DOBJ(dobj,basic) D[78,90](constituants) G[65,73](contient)", + "[ 91, 94]PREP(case,basic) D[91,94](que) G[109,120](dépendances)", "[ 95, 97]DET(det,basic) D[95,97](de) G[109,120](dépendances)", "[ 98,108]AMOD(amod,basic) D[98,108](nombreuses) G[109,120](dépendances)", - "[109,120]Dependency(nmod,basic) D[109,120](dépendances) G[91,94](que)", + "[109,120]DEP(dep,basic) D[109,120](dépendances) G[78,90](constituants)", "[121,123]CC(cc,basic) D[121,123](et) G[109,120](dépendances)", - "[124,127]MWE(mwe,basic) D[124,127](que) G[121,123](et)", - "[128,136]AMOD(amod,basic) D[128,136](possible) G[124,127](que)", - "[137,138]CONJ(conj:et,basic) D[137,138](.) G[109,120](dépendances)" }; + "[124,127]COP(cop,basic) D[124,127](que) G[128,136](possible)", + "[128,136]CCOMP(ccomp,basic) D[128,136](possible) G[121,123](et)", + "[137,138]PUNCT(punct,basic) D[137,138](.) G[128,136](possible)" }; String[] depTags = { "acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "cop", "csubj", "dep", "det", - "discourse", "dobj", "expl", "iobj", "mark", "mwe", "name", "neg", "nmod", - "nmod:poss", "nsubj", "nsubjpass", "nummod", "parataxis", "punct", "root", - "xcomp" }; + "discourse", "dislocated", "dobj", "expl", "foreign", "goeswith", "iobj", "mark", + "mwe", "name", "neg", "nmod", "nmod:poss", "nsubj", "nsubjpass", "nummod", + "parataxis", "punct", "remnant", "reparandum", "root", "vocative", "xcomp" }; String[] unmappedDep = { "acl:relcl", "nmod:poss" }; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "corenlp34", CORENLP34_POS_TAGS, jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "universal", UNIVERSAL_POS_TAGS, + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "corenlp34", CORENLP34_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, POS.class, "universal", UNIVERSAL_POS_TAGS, jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", - depTags, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", depTags, jcas); + assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); } @Test @@ -316,13 +312,11 @@ public void testGermanUniversalDependencies() "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "stts", GERMAN_POS_TAGS, jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "stts", depParserPosTags, - jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", - depTags, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "stts", GERMAN_POS_TAGS, jcas); + assertTagset(CoreNlpDependencyParser.class, POS.class, "stts", depParserPosTags, jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", depTags, jcas); + assertTagsetMapping(Dependency.class, "universal", unmappedDep, jcas); } @Test @@ -360,17 +354,17 @@ public void testChineseCtbConllDependencies() String[] unmappedDep = {}; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); // There are some minor differences between the tags produced by the POS tagger and the - // tags expected by the parser model. We need a better test here that makes these differences + // tags expected by the parser model. We need a better test here that makes these + // differences // more visible and at the same time doesn't fail. - //AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, - // jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "conll", - depTags, jcas); - AssertAnnotations.assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, - "conll", unmappedDep, jcas); + // AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, + // jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "conll", depTags, jcas); + assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, "conll", unmappedDep, + jcas); } @Test @@ -411,17 +405,17 @@ public void testChinesePtbConllDependencies() String[] unmappedDep = {}; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); // There are some minor differences between the tags produced by the POS tagger and the - // tags expected by the parser model. We need a better test here that makes these differences + // tags expected by the parser model. We need a better test here that makes these + // differences // more visible and at the same time doesn't fail. - //AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, - // jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "conll2008", - depTags, jcas); - AssertAnnotations.assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, - "conll2008", unmappedDep, jcas); + // AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, + // jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "conll2008", depTags, jcas); + assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, "conll2008", + unmappedDep, jcas); } @Test @@ -467,17 +461,17 @@ public void testChineseUniversalDependencies() "compound:vc", "erased", "etc", "mark:clf", "nmod:assmod", "nmod:poss", "nmod:prep", "nmod:range", "nmod:tmod", "nmod:topic", "nsubj:xsubj", "parataxis:prnmod" }; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ctb", posTags, jcas); // There are some minor differences between the tags produced by the POS tagger and the - // tags expected by the parser model. We need a better test here that makes these differences + // tags expected by the parser model. We need a better test here that makes these + // differences // more visible and at the same time doesn't fail. - //AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, - // jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", - depTags, jcas); - AssertAnnotations.assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, - "universal", unmappedDep, jcas); + // AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ctb", posTags, + // jcas); + assertTagset(CoreNlpDependencyParser.class, Dependency.class, "universal", depTags, jcas); + assertTagsetMapping(CoreNlpDependencyParser.class, Dependency.class, "universal", + unmappedDep, jcas); } @Test @@ -518,15 +512,16 @@ public void testEnglishPtbConllDependencies() "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); // There are some minor differences between the tags produced by the POS tagger and the - // tags expected by the parser model. We need a better test here that makes these differences + // tags expected by the parser model. We need a better test here that makes these + // differences // more visible and at the same time doesn't fail. - //AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, jcas); - AssertAnnotations.assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", posTags, - jcas); - AssertAnnotations.assertTagset(Dependency.class, "conll", depTags, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "conll", unmappedDep, jcas); + // AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", PTB_POS_TAGS, + // jcas); + assertTagset(CoreNlpDependencyParser.class, POS.class, "ptb", posTags, jcas); + assertTagset(Dependency.class, "conll", depTags, jcas); + assertTagsetMapping(Dependency.class, "conll", unmappedDep, jcas); } private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) diff --git a/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpLemmatizerTest.java b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpLemmatizerTest.java new file mode 100644 index 0000000000..800b047831 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpLemmatizerTest.java @@ -0,0 +1,88 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.corenlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.corenlp.CoreNlpLemmatizer; +import org.dkpro.core.corenlp.CoreNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; + +public class CoreNlpLemmatizerTest +{ + @Test + public void testUnderscore() throws Exception + { + runTest("en", "foo _ bar", + new String[] { "foo", "_", "bar" }); + } + + @Test + public void testEnglish() throws Exception + { + runTest("en", "This is a test .", + new String[] { "this", "be", "a", "test", "." }); + + runTest("en", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible .", + new String[] { "we", "need", "a", "very", "complicated", "example", + "sentence", ",", "which", "contain", "as", "many", "constituent", "and", + "dependency", "as", "possible", "." }); + } + + @Test(expected = AnalysisEngineProcessException.class) + public void testNotEnglish() + throws Exception + { + runTest("de", "Das ist ein test .", new String[] {} ); + } + + @Test + public void testUrl() throws Exception + { + runTest("en", "Details hinzu findet man unter http://www.armytimes.com/news/2009/11/army_M4_112109w/ .", + new String[] { "detail", "hinzu", "findet", "man", "unter", "http://www.armytimes.com/news/2009/11/army_m4_112109w/", "." }); + + } + + private void runTest(String aLanguage, String testDocument, String[] lemmas) + throws Exception + { + AnalysisEngineDescription posTagger = createEngineDescription(CoreNlpPosTagger.class); + AnalysisEngineDescription lemmatizer = createEngineDescription(CoreNlpLemmatizer.class); + + JCas aJCas = TestRunner.runTest(createEngineDescription(posTagger, lemmatizer), + aLanguage, testDocument); + + AssertAnnotations.assertLemma(lemmas, select(aJCas, Lemma.class)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpNamedEntityRecognizerTest.java b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpNamedEntityRecognizerTest.java new file mode 100644 index 0000000000..54c13b0289 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpNamedEntityRecognizerTest.java @@ -0,0 +1,222 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.corenlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.corenlp.CoreNlpNamedEntityRecognizer; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Assume; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; + +/** + */ +public class CoreNlpNamedEntityRecognizerTest +{ + @Test + public void testEnglish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", null, "IBM where John works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 14]Person(PERSON) (John)", + "[ 27, 34]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void test3classCaselessEnglish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "all.3class.caseless.distsim.crf", "ibm where john works is in germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (ibm)", + "[ 10, 14]Person(PERSON) (john)", + "[ 27, 34]Location(LOCATION) (germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testNoWiki3classCaselessEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "nowiki.3class.caseless.distsim.crf", + "ibm where john works is in germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (ibm)", + "[ 10, 14]Person(PERSON) (john)", + "[ 27, 34]Location(LOCATION) (germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void test4classEnglish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "conll.4class.distsim.crf", "IBM where John works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 14]Person(PERSON) (John)", + "[ 27, 34]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + + @Test + public void test4classCaselessEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "conll.4class.caseless.distsim.crf", + "ibm where john works is in germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (ibm)", + "[ 10, 14]Person(PERSON) (john)", + "[ 27, 34]Location(LOCATION) (germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void test4classCaselessMixedEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "conll.4class.caseless.distsim.crf", + "IBM where john works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 14]Person(PERSON) (john)", + "[ 27, 34]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void test7classEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "muc.7class.distsim.crf", "IBM where John works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 14]Person(PERSON) (John)", + "[ 27, 34]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testGerman() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("de", null, "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); + + String[] ne = { + "[ 0, 6]Person(PERSON) (Markus)", + "[ 35, 38]Organization(ORGANIZATION) (SAP)", + "[ 42, 53]Location(LOCATION) (Deutschland)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testHgcGerman() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("de", "hgc_175m_600.crf", "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); + + String[] ne = { + "[ 0, 6]Person(I-PER) (Markus)", + "[ 35, 38]Organization(I-ORG) (SAP)", + "[ 42, 53]Location(I-LOC) (Deutschland)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testSpanish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("es", null, "Hace 10 años Markus trabaja en SAP en Alemania ."); + + String[] ne = { + "[ 13, 19]Person(PERS) (Markus)", + "[ 31, 34]Organization(ORG) (SAP)", + "[ 38, 46]Location(LUG) (Alemania)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test(expected = AnalysisEngineProcessException.class) + public void testMissingModel() throws Exception + { + runTest("xx", null, "Xec xena Xeo ."); + } + + private JCas runTest(String language, String variant, String testDocument) + throws Exception + { + AssumeResource.assumeResource(CoreNlpNamedEntityRecognizer.class, + "de/tudarmstadt/ukp/dkpro/core/stanfordnlp", "ner", language, variant); + + AnalysisEngine engine = createEngine(CoreNlpNamedEntityRecognizer.class, + CoreNlpNamedEntityRecognizer.PARAM_VARIANT, variant, + CoreNlpNamedEntityRecognizer.PARAM_PRINT_TAGSET, true); + + return TestRunner.runTest(engine, language, testDocument); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpParserTest.java b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpParserTest.java similarity index 78% rename from dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpParserTest.java rename to dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpParserTest.java index 7b0c17c34f..55e4383c43 100644 --- a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpParserTest.java +++ b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpParserTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,39 +14,48 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; +package org.dkpro.core.corenlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import static org.dkpro.core.testing.AssertAnnotations.assertConstituents; +import static org.dkpro.core.testing.AssertAnnotations.assertDependencies; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertPennTree; +import static org.dkpro.core.testing.AssertAnnotations.assertSyntacticFunction; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; +import static org.junit.Assert.assertTrue; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; -import edu.stanford.nlp.ling.StringLabel; -import edu.stanford.nlp.trees.Tree; import org.apache.commons.lang3.ArrayUtils; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.AggregateBuilder; import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.jcas.JCas; +import org.dkpro.core.corenlp.CoreNlpParser; +import org.dkpro.core.corenlp.CoreNlpPosTagger; +import org.dkpro.core.corenlp.internal.DKPro2CoreNlp; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectSingle; -import static org.junit.Assert.assertTrue; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import edu.stanford.nlp.ling.StringLabel; +import edu.stanford.nlp.trees.Tree; public class CoreNlpParserTest { @@ -135,16 +144,16 @@ public void testGermanPcfg() String[] unmappedConst = { "NUR" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertSyntacticFunction(synFunc, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "stts", GERMAN_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "negra", GERMAN_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas); + assertSyntacticFunction(synFunc, select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "stts", GERMAN_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); + assertTagset(Constituent.class, "negra", GERMAN_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas); } @Test @@ -178,15 +187,15 @@ public void testGermanFactored() String[] unmappedConst = { "NUR" }; - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "stts", GERMAN_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "negra", GERMAN_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas, true); + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "stts", GERMAN_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); + assertTagset(Constituent.class, "negra", GERMAN_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas, true); } @Test @@ -219,8 +228,9 @@ public void testEnglishPcfg() "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; @@ -232,17 +242,17 @@ public void testEnglishPcfg() String[] unmappedDep = { "gov" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } @Test @@ -276,8 +286,9 @@ public void testEnglishFactored() "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; @@ -289,17 +300,17 @@ public void testEnglishFactored() String[] unmappedDep = { "gov" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } // CoreNlpParser PARAM_KEEP_PUNCTUATION has no effect #965 @@ -317,7 +328,7 @@ public void testEnglishFactored() // "[ 8, 9]DET(det) D[8,9](a) G[10,14](test)", // "[ 10, 14]ROOT(root) D[10,14](test) G[10,14](test)" }; // -// AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); +// assertDependencies(dependencies, select(jcas, Dependency.class)); // } @Test @@ -351,8 +362,9 @@ public void testEnglishRnn() "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; @@ -364,17 +376,17 @@ public void testEnglishRnn() String[] unmappedDep = { "gov" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } @Test @@ -407,8 +419,9 @@ public void testEnglishShiftReduce() "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", "[102,110]PREP(prep_as,basic) D[102,110](possible) G[69,81](constituents)" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; @@ -420,22 +433,22 @@ public void testEnglishShiftReduce() String[] unmappedDep = { "gov" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", ENGLISH_POS_TAGS, + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(CoreNlpPosTagger.class, POS.class, "ptb", + assertTagsetMapping(CoreNlpPosTagger.class, POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(CoreNlpParser.class, Constituent.class, "ptb", + assertTagset(CoreNlpParser.class, Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(CoreNlpParser.class, Constituent.class, "ptb", + assertTagsetMapping(CoreNlpParser.class, Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(CoreNlpParser.class, Dependency.class, "stanford341", + assertTagset(CoreNlpParser.class, Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(CoreNlpParser.class, Dependency.class, "stanford341", + assertTagsetMapping(CoreNlpParser.class, Dependency.class, "stanford341", unmappedDep, jcas); } @@ -469,8 +482,9 @@ public void testEnglishShiftReduceBeam() "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", "[102,110]PREP(prep_as,basic) D[102,110](possible) G[69,81](constituents)" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; @@ -482,17 +496,17 @@ public void testEnglishShiftReduceBeam() String[] unmappedDep = { "gov" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } @Test @@ -526,8 +540,9 @@ public void testEnglishWsjRnn() "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; @@ -539,17 +554,17 @@ public void testEnglishWsjRnn() String[] unmappedDep = { "gov" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } /** @@ -573,8 +588,8 @@ public void testEnglishFactoredDirectSpeech() + ",) (`` \")) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (VP (VBG starting) (PP " + "(TO to) (NP (NN rain)))))) (. .) ('' \")))"; - AssertAnnotations.assertPOS(null, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertPOS(null, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); } /** @@ -602,8 +617,8 @@ public void testEnglishFactoredDirectSpeech2() + "(, ,) (`` ‘)) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (VP (VBG starting) (PP " + "(TO to) (NP (NN rain)))))) (. .) ('' ’)))"; - AssertAnnotations.assertPOS(null, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertPOS(null, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); } @Test @@ -661,20 +676,20 @@ public void testSpanishShiftReduceBeam() String[] unmappedConst = { "f" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(CoreNlpPosTagger.class, POS.class, "ancora", posTags, jcas); - AssertAnnotations.assertTagsetMapping(CoreNlpPosTagger.class, POS.class, "ancora", + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(CoreNlpPosTagger.class, POS.class, "ancora", posTags, jcas); + assertTagsetMapping(CoreNlpPosTagger.class, POS.class, "ancora", unmappedPos, jcas); - AssertAnnotations.assertTagset(CoreNlpParser.class, Constituent.class, "ancora", + assertTagset(CoreNlpParser.class, Constituent.class, "ancora", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(CoreNlpParser.class, Constituent.class, "ancora", + assertTagsetMapping(CoreNlpParser.class, Constituent.class, "ancora", unmappedConst, jcas); - // AssertAnnotations.assertTagset(Dependency.class, "stanford341", depTags, jcas); -// AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + // assertTagset(Dependency.class, "stanford341", depTags, jcas); +// assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); } /** @@ -701,16 +716,16 @@ public void testExistingPos() String pennTree = "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test))) (. .)))"; String pennTreeVariant = "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP-TMP (DT a) (NN test))) (. .)))"; - AssertAnnotations.assertPOS(null, posOriginal, select(jcas, POS.class)); + assertPOS(null, posOriginal, select(jcas, POS.class)); /* Due to https://github.com/dkpro/dkpro-core/issues/852, the results are instable; * if the test fails for the expected output, try the 2nd variant. * FIXME: once https://github.com/dkpro/dkpro-core/issues/852 is resolved, the try/catch clause should be removed.*/ try { - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); } catch (Throwable e) { - AssertAnnotations.assertPennTree(pennTreeVariant, selectSingle(jcas, PennTree.class)); + assertPennTree(pennTreeVariant, selectSingle(jcas, PennTree.class)); } } @@ -766,18 +781,18 @@ public void testFrenchFactored() // NO DEP TAGS String[] unmappedDep = {}; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - - AssertAnnotations.assertTagset(POS.class, "corenlp34", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "corenlp34", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ftb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ftb", unmappedConst, jcas); - // NO DEP TAGS AssertAnnotations.assertTagset(Dependency.class, null, depTags, jcas); - // NO DEP TAGS AssertAnnotations.assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + + assertTagset(POS.class, "corenlp34", posTags, jcas); + assertTagsetMapping(POS.class, "corenlp34", unmappedPos, jcas); + assertTagset(Constituent.class, "ftb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ftb", unmappedConst, jcas); + // NO DEP TAGS assertTagset(Dependency.class, null, depTags, jcas); + // NO DEP TAGS assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); } @Test @@ -804,9 +819,9 @@ public void testFrench2() + "(NC texte) (PP (P du) (AP (ADJ français))))) (PP (P vers) (NP (DET l') " + "(NC anglais)))) (PUNC .)))"; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); } @@ -845,8 +860,9 @@ public void testChineseFactored() "[ 41, 42]Dependency(dvpm,basic) D[41,42](的) G[37,40](尽可能)", "[ 43, 45]CONJ(conj,basic) D[43,45](依赖) G[26,28](包含)" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NUM", "POS_ADJ", "POS_ADJ", "POS_PART", "POS_NOUN", "POS_ADJ", "POS_NOUN", - "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", "POS_ADJ", "POS_PART", "POS_VERB", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NUM", "POS_ADJ", "POS_ADJ", "POS_PART", + "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", + "POS_ADJ", "POS_PART", "POS_VERB", "POS_PUNCT" }; String[] posOriginal = { "PN", "VV", "CD", "AD", "JJ", "DEG", "NN", "AD", "NN", "VV", "CD", "NN", "CC", "AD", "DEV", "VV", "PU" }; @@ -872,17 +888,17 @@ public void testChineseFactored() // NO DEP TAGS String[] unmappedDep = new String[] {}; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ctb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ctb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ctb", unmappedConst, jcas); - // NO DEP TAGS AssertAnnotations.assertTagset(Dependency.class, null, depTags, jcas); - // NO DEP TAGS AssertAnnotations.assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ctb", posTags, jcas); + assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); + assertTagset(Constituent.class, "ctb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ctb", unmappedConst, jcas); + // NO DEP TAGS assertTagset(Dependency.class, null, depTags, jcas); + // NO DEP TAGS assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); } @Test @@ -895,7 +911,8 @@ public void testChineseXinhuaFactored() String[] constituentMapped = { "ADVP 20,22", "ADVP 37,40", "ADVP 9,11", "NP 0,2", "NP 17,19", "NP 23,25", "NP 29,34", "NP 32,34", "NP 43,45", "NP 6,45", "NP 9,19", "QP 29,31", "QP 6,8", "ROOT 0,47", "VP 12,14", "VP 26,34", "VP 26,40", "VP 3,45", - "VP 37,40", "VP 9,14", "X 0,47", "X 20,40", "X 9,14", "X 9,16", "X 9,40", "X 9,42" }; + "VP 37,40", "VP 9,14", "X 0,47", "X 20,40", "X 9,14", "X 9,16", "X 9,40", + "X 9,42" }; String[] constituentOriginal = { "ADVP 20,22", "ADVP 37,40", "ADVP 9,11", "CP 9,16", "CP 9,42", "IP 0,47", "IP 20,40", "IP 9,14", "IP 9,40", "NP 0,2", "NP 17,19", @@ -921,8 +938,9 @@ public void testChineseXinhuaFactored() "[ 41, 42]Dependency(cpm,basic) D[41,42](的) G[26,28](包含)", "[ 43, 45]DOBJ(dobj,basic) D[43,45](依赖) G[3,5](需要)" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NUM", "POS_ADJ", "POS_VERB", "POS_PART", "POS_NOUN", "POS_ADJ", "POS_NOUN", - "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", "POS_ADJ", "POS_PART", "POS_NOUN", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NUM", "POS_ADJ", "POS_VERB", "POS_PART", + "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", + "POS_ADJ", "POS_PART", "POS_NOUN", "POS_PUNCT" }; String[] posOriginal = { "PN", "VV", "CD", "AD", "VA", "DEC", "NN", "AD", "NN", "VV", "CD", "NN", "CC", "AD", "DEC", "NN", "PU" }; @@ -949,17 +967,17 @@ public void testChineseXinhuaFactored() // NO DEP TAGS String[] unmappedDep = new String[] {}; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ctb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ctb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ctb", unmappedConst, jcas); - // NO DEP TAGS AssertAnnotations.assertTagset(Dependency.class, null, depTags, jcas); - // NO DEP TAGS AssertAnnotations.assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ctb", posTags, jcas); + assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); + assertTagset(Constituent.class, "ctb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ctb", unmappedConst, jcas); + // NO DEP TAGS assertTagset(Dependency.class, null, depTags, jcas); + // NO DEP TAGS assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); } @Test @@ -1002,15 +1020,15 @@ public void testArabicFactored() String[] unmappedConst = { "LST" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "atb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "atb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "atb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "atb", unmappedConst, jcas); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "atb", posTags, jcas); + assertTagsetMapping(POS.class, "atb", unmappedPos, jcas); + assertTagset(Constituent.class, "atb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "atb", unmappedConst, jcas); } /** diff --git a/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpPosTaggerAndNamedEntityRecognizerTest.java b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpPosTaggerAndNamedEntityRecognizerTest.java new file mode 100644 index 0000000000..c921fb15ea --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpPosTaggerAndNamedEntityRecognizerTest.java @@ -0,0 +1,106 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.corenlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; + +public class CoreNlpPosTaggerAndNamedEntityRecognizerTest +{ + @Test + public void thatDurationIsRecognized() throws Exception + { + JCas jcas = runTest("en", "John lives for 200 years ."); + + String[] ne = { + "[ 0, 4]Person(PERSON) (John)", + "[ 15, 18]NamedEntity(DURATION) (200)", + "[ 19, 24]NamedEntity(DURATION) (years)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void thatMoneyIsRecognized() throws Exception + { + JCas jcas = runTest("en", "John buys a laptop for 200 dollars ."); + + String[] ne = { + "[ 0, 4]Person(PERSON) (John)", + "[ 23, 26]NamedEntity(MONEY) (200)", + "[ 27, 34]NamedEntity(MONEY) (dollars)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void thatOrdinalNumbersAreRecognized() throws Exception + { + JCas jcas = runTest("en", "John made the second place in the run ."); + + String[] ne = { + "[ 0, 4]Person(PERSON) (John)", + "[ 14, 20]NamedEntity(ORDINAL) (second)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void thatCardinalNumbersAreRecognized() throws Exception + { + JCas jcas = runTest("en", "John bought one hundred laptops ."); + + String[] ne = { + "[ 0, 4]Person(PERSON) (John)", + "[ 12, 15]NamedEntity(NUMBER) (one)", + "[ 16, 23]NamedEntity(NUMBER) (hundred)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + private JCas runTest(String language, String testDocument) + throws Exception + { + AssumeResource.assumeResource(CoreNlpNamedEntityRecognizer.class, + "de/tudarmstadt/ukp/dkpro/core/stanfordnlp", "ner", language, null); + + AssumeResource.assumeResource(CoreNlpPosTagger.class, + "de/tudarmstadt/ukp/dkpro/core/stanfordnlp", "tagger", language, null); + + AnalysisEngineDescription engine = createEngineDescription( + createEngineDescription(CoreNlpPosTagger.class), + createEngineDescription(CoreNlpNamedEntityRecognizer.class)); + + return TestRunner.runTest(engine, language, testDocument); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpPosTaggerTest.java b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpPosTaggerTest.java similarity index 75% rename from dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpPosTaggerTest.java rename to dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpPosTaggerTest.java index 1e8395b719..8070d41ccf 100644 --- a/dkpro-core-corenlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/corenlp/CoreNlpPosTaggerTest.java +++ b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpPosTaggerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,43 +14,44 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.corenlp; +package org.dkpro.core.corenlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.corenlp.CoreNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class CoreNlpPosTaggerTest { - @Test - public void testEnglish() - throws Exception - { + @Test + public void testEnglish() + throws Exception + { runTest("en", "This is a test . \n", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("en", "A neural net . \n", - new String[] { "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + new String[] { "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); runTest("en", "John is purchasing oranges . \n", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + } @Test public void testEnglishExtra() @@ -78,25 +79,25 @@ public void testEnglishExtra() } - @Test - public void testGerman() - throws Exception + @Test + public void testGerman() + throws Exception { runTest("de", "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("de", "ud", "Das ist ein Test .", new String[] { "PRON", "VERB", "DET", "NOUN", "PUNCT" }, new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("de", "hgc", "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("de", "dewac", "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("de", "fast-caseless", "das ist ein test .", new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, @@ -148,8 +149,8 @@ public void testFrench2() String[] posOriginal = { "DET", "NC", "P", "DET", "NC", "P", "NC", "P", "DET", "NC", "PUNC" }; - String[] posMapped = { "POS_DET", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADP", "POS_DET", - "POS_NOUN", "POS_PUNCT" }; + String[] posMapped = { "POS_DET", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", + "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_PUNCT" }; AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); } @@ -158,33 +159,35 @@ public void testFrench2() public void testChinese() throws Exception { - // The rudder often in the wake of the wind round the back of the area. + // The rudder often in the wake of the wind round the back of the area. runTest("zh", "尾 舵 常 处于 风轮 后面 的 尾流 区里 。", - new String[] { "NN", "NN", "AD", "VV", "NN", "NN", "DEG", "NN", "NN", "PU" }, + new String[] { "NN", "NN", "AD", "VV", "NN", "NN", "DEG", "NN", "NN", "PU" }, new String[] { "POS_NOUN", "POS_NOUN", "POS_ADJ", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PART", "POS_NOUN", "POS_NOUN", "POS_PUNCT" }); // The service sector has become an important engine of Guangdong's economic transformation // and upgrading. runTest("zh", "服务业 成为 广东 经济 转型 升级 的 重要 引擎 。", - new String[] { "NN", "VV", "NR", "NN", "VV", "VV", "DEC", "JJ", "NN", "PU" }, + new String[] { "NN", "VV", "NR", "NN", "VV", "VV", "DEC", "JJ", "NN", "PU" }, new String[] { "POS_NOUN", "POS_VERB", "POS_PROPN", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_PART", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); // How far is China from the world brand? runTest("zh", "中国 离 世界 技术 品牌 有 多远 ?", - new String[] { "NR", "P", "NN", "NN", "NN", "VE", "VV", "PU" } , - new String[] { "POS_PROPN", "POS_ADP", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_PUNCT" } ); + new String[] { "NR", "P", "NN", "NN", "NN", "VE", "VV", "PU" } , + new String[] { "POS_PROPN", "POS_ADP", "POS_NOUN", "POS_NOUN", "POS_NOUN", + "POS_VERB", "POS_VERB", "POS_PUNCT" }); } @Test public void testArabic() throws Exception { - // Covering the following sub-Saharan countries with vast areas very + // Covering the following sub-Saharan countries with vast areas very runTest("ar", "تغطي الصحراء الكبرى الدول التالية بمساحات شاسعة جدا", - new String[] { "VBP", "DTNN", "DTJJR", "DTNN", "DTJJ", "NNS", "JJ", "NN" }, - new String[] { "POS_VERB", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADJ", "POS_NOUN" }); + new String[] { "VBP", "DTNN", "DTJJR", "DTNN", "DTJJ", "NNS", "JJ", "NN" }, + new String[] { "POS_VERB", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADJ", "POS_NOUN", + "POS_ADJ", "POS_NOUN" }); } @Test @@ -201,14 +204,15 @@ public void testEscaping() throws Exception { runTest("en", "This is a ( small ) test . \n", new String[] { "DT", "VBZ", "DT", "-LRB-", "JJ", "-RRB-", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_PUNCT", "POS_ADJ", "POS_PUNCT", "POS_NOUN", "POS_PUNCT" }); + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_PUNCT", "POS_ADJ", + "POS_PUNCT", "POS_NOUN", "POS_PUNCT" }); } - private void runTest(String language, String testDocument, String[] tags, String[] tagClasses) - throws Exception - { - runTest(language, null, testDocument, tags, tagClasses); - } + private void runTest(String language, String testDocument, String[] tags, String[] tagClasses) + throws Exception + { + runTest(language, null, testDocument, tags, tagClasses); + } private void runTest(String language, String variant, String testDocument, String[] tags, String[] tagClasses) diff --git a/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpSegmenterTest.java b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpSegmenterTest.java new file mode 100644 index 0000000000..5dfd4a535e --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/test/java/org/dkpro/core/corenlp/CoreNlpSegmenterTest.java @@ -0,0 +1,153 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.corenlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.corenlp.CoreNlpParser; +import org.dkpro.core.corenlp.CoreNlpSegmenter; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class CoreNlpSegmenterTest +{ + @Test + public void run() throws Throwable + { + AnalysisEngineDescription aed = createEngineDescription(CoreNlpSegmenter.class); + + SegmenterHarness.run(aed, "de.4", "en.9", "ar.1", "zh.1", "zh.2"); + } + + @Test + public void testEnglishSpeech() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText("'Let's go! I want to see the Don', he said."); + + AnalysisEngine aed = createEngine(CoreNlpSegmenter.class); + aed.process(jcas); + + String[] tokens = { "'", "Let", "'s", "go", "!", "I", "want", "to", "see", "the", "Don", + "'", ",", "he", "said", "." }; + + AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); + } + + @Test + public void testSpanish() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("es"); + jcas.setDocumentText("Tim dijo a Jamie para la 100ª vez que abandone la sala."); + + AnalysisEngine aed = createEngine(CoreNlpSegmenter.class); + aed.process(jcas); + + String[] tokens = { "Tim", "dijo", "a", "Jamie", "para", "la", "100ª", "vez", "que", + "abandone", "la", "sala", "." }; + + AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); + } + + @Test + public void testSpanishClitics() throws Exception + { + //Important: Verb clitics will not be segmented unless Spanish models are included + //e. g.: entregarles, inmutarse + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("es"); + jcas.setDocumentText("Al entregarles los libros del maestro los abrieron sin inmutarse\n" + + "Estaban contentos."); + + AnalysisEngine aed = createEngine(CoreNlpSegmenter.class, + CoreNlpSegmenter.PARAM_NEWLINE_IS_SENTENCE_BREAK, "always", + CoreNlpSegmenter.PARAM_TOKENIZATION_OPTIONS, "splitAll=true,ptb3Escaping=false"); + aed.process(jcas); + + String[] sentences = {"Al entregarles los libros del maestro los abrieron sin inmutarse", + "Estaban contentos."}; + + String[] expectedTokens = { "A", "el", "entregarles", "los", "libros", "de", "el", + "maestro", "los", "abrieron", "sin", "inmutarse", "Estaban", "contentos", "."}; + + AssertAnnotations.assertSentence(sentences, select(jcas, Sentence.class)); + List tokens = new ArrayList(); + for (Token token : select(jcas, Token.class)) { + tokens.add(token.getText()); + } + System.out.printf("%-20s - Expected: %s%n", "Tokens", Arrays.asList(expectedTokens)); + System.out.printf("%-20s - Actual : %s%n", "Tokens", tokens); + Assert.assertEquals(Arrays.asList(expectedTokens), tokens); + } + + @Test + public void testArabic() throws Exception + { + AssumeResource.assumeResource(CoreNlpParser.class, + "de/tudarmstadt/ukp/dkpro/core/corenlp", "tokenizer", "ar", "atb-bn-arztrain"); + + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("ar"); + jcas.setDocumentText("هل من المهم مراقبة وزن الرضيع خلال السنة الاولى من عمره؟\n" + + " هل يجب وزن و قياس الطفل خلال السنة الاولى من عمره ؟\n"); + + AnalysisEngine aed = createEngine(CoreNlpSegmenter.class); + aed.process(jcas); + + String[] sentences = { "هل من المهم مراقبة وزن الرضيع خلال السنة الاولى من عمره؟", + "هل يجب وزن و قياس الطفل خلال السنة الاولى من عمره ؟"}; + + String[] tokens = { "هل", "من", "المهم", "مراقبة", "وزن", "الرضيع", "خلال", "السنة", + "الاولى", "من", "عمر", "ه", "؟", "هل", "يجب", "وزن", "و", "قياس", "الطفل", + "خلال", "السنة", "الاولى", "من", "عمر", "ه", "؟"}; + + AssertAnnotations.assertSentence(sentences, select(jcas, Sentence.class)); + AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); + } + + @Test + public void testZoning() throws Exception + { + SegmenterHarness.testZoning(CoreNlpSegmenter.class); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-corenlp-gpl/src/test/resources/log4j.properties b/dkpro-core-corenlp-gpl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-corenlp-gpl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-corenlp-gpl/src/test/resources/log4j2.xml b/dkpro-core-corenlp-gpl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-corenlp-gpl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-decompounding-asl/pom.xml b/dkpro-core-decompounding-asl/pom.xml index 17a4c6b540..5615a6f8eb 100644 --- a/dkpro-core-decompounding-asl/pom.xml +++ b/dkpro-core-decompounding-asl/pom.xml @@ -18,13 +18,14 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl DKPro Core ASL - Decompounding - de.tudarmstadt.ukp.dkpro.core.decompounding-asl + https://dkpro.github.io/dkpro-core/ + dkpro-core-decompounding-asl jar DKPro core module for decompounding. @@ -86,16 +87,20 @@ commons-collections4 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -103,13 +108,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - org.hamcrest - hamcrest-core + org.assertj + assertj-core test @@ -197,5 +202,19 @@ - - \ No newline at end of file + + + eu.openminted.share.annotations + omtd-share-annotations-maven-plugin + + + + **/*.xml + + + + + \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/German98Dictionary.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/German98Dictionary.java deleted file mode 100644 index fb73f6d4b5..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/German98Dictionary.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.igerman98.Affix; - -/** - * The igerman98 dictionary from www.j3e.de/ispell/igerman98 - * - * A current version of the german dictionary de_DE can be found in - * /src/main/resources/de_DE.dic - * - * This class can also be used to read other ispell/hunspell dictionaries. - * - */ -public class German98Dictionary - extends SimpleDictionary -{ - private static final String PREFIX_KEY = "PFX"; - private static final String SUFFIX_KEY = "SFX"; - - private Map> affixes = new HashMap>(); - - public German98Dictionary(File aDict, File aAffix, String aEncoding) - throws IOException - { - try ( - BufferedReader dis = new BufferedReader( - new InputStreamReader(new FileInputStream(aDict), aEncoding)); - BufferedReader ais = new BufferedReader( - new InputStreamReader(new FileInputStream(aAffix), aEncoding)); - ) { - readAffixFile(ais); - setWords(readFileToSet(dis)); - } - } - - public German98Dictionary(InputStream aDictStream, InputStream aAffixStream, String aEncoding) - throws IOException - { - try ( - BufferedReader dis = new BufferedReader( - new InputStreamReader(aDictStream, aEncoding)); - BufferedReader ais = new BufferedReader( - new InputStreamReader(aAffixStream, aEncoding)); - ) { - readAffixFile(ais); - setWords(readFileToSet(dis)); - } - } - - protected Set readFileToSet(BufferedReader aReader) - throws IOException - { - Set words = new HashSet(); - - // First line contains number of entries -> skip - String line = aReader.readLine(); - while ((line = aReader.readLine()) != null) { - if (line.equals("") || line.substring(0, 1).equals("#") - || line.substring(0, 1).equals("\t")) { - // Ignore lines starting with hash of tab (comments) - continue; - } - String[] split = line.split("/"); - String word = split[0].toLowerCase(); - char[] flags = {}; - - if (split.length > 1) { - flags = split[1].toCharArray(); - } - - if (word.length() > 2) { - words.add(word); - } - - if (flags.length > 0) { - words.addAll(buildWords(word, flags)); - } - } - - return words; - } - - /** - * Reads the affix file and processes the data - * - * @param aReader - * a reader. - */ - protected void readAffixFile(BufferedReader aReader) - { - try { - String line; - while ((line = aReader.readLine()) != null) { - if (line.startsWith(PREFIX_KEY) || line.startsWith(SUFFIX_KEY)) { - parseAffix(line, aReader); - } - } - } - catch (FileNotFoundException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - - /** - * Parse a affix in the affix file - * - * @param aHeader - * The header of the affix - * @param aReader - * The file reader to read the rest of the affix - * @throws IOException if an I/O error occurs. - */ - private void parseAffix(String aHeader, BufferedReader aReader) - throws IOException - { - String args[] = aHeader.split("\\s+"); - - boolean crossProduct = args[2].equals("Y"); - int numLines = Integer.parseInt(args[3]); - - for (int i = 0; i < numLines; i++) { - String line = aReader.readLine(); - if (line == null) { - throw new IOException("Unexpected end of file after reading [" + i + - "] lines. Expected were [" + numLines + "] lines."); - } - String ruleArgs[] = line.split("\\s+"); - Character flag = ruleArgs[1].toCharArray()[0]; - - Affix a = new Affix(args[0]); - a.setCrossProduct(crossProduct); - a.setFlag(flag); - a.setStripping(ruleArgs[2]); - a.setAffix(ruleArgs[3]); - a.setCondition(ruleArgs[4]); - - List list = affixes.get(flag); - if (list == null) { - list = new ArrayList(); - affixes.put(flag, list); - } - list.add(a); - } - } - - /** - * Uses affixes to build new words - * - * @param aWord - * a word. - * @param aFlags - * flags. - * @return inflected word forms. - */ - protected List buildWords(String aWord, char[] aFlags) - { - List words = new ArrayList(); - for (char c : aFlags) { - List aff = affixes.get(c); - if (aff == null) { - continue; - } - for (Affix affix : aff) { - String w = affix.handleWord(aWord); - if (w != null && w.length() > 2) { - words.add(w); - } - } - } - - return words; - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/JWordSplitterDictionary.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/JWordSplitterDictionary.java deleted file mode 100644 index 5c1b48cadb..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/JWordSplitterDictionary.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; - -import java.io.BufferedReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import de.abelssoft.tools.persistence.FastObjectSaver; - -/** - * The simple dictionary reads a file in which each line is a new word. - * - * This can be used to create your own dictionary from a corpus - * - */ -public class JWordSplitterDictionary - implements Dictionary -{ - private static final String SERIALIZED_DICT = "/wordsGerman.ser"; // dict inside the JAR - - private Set words; - - /** - * Constructor for a simple dictionary - */ - public JWordSplitterDictionary() - { - try { - words = (HashSet) FastObjectSaver.load(SERIALIZED_DICT); - } - catch (IOException e) { - e.printStackTrace(); - } - } - - @Override - public boolean contains(String aWord) - { - return words.contains(aWord); - } - - /** - * Reads the dictionary to set - * - * @param aReader - * a reader. - * @return A set of words - * @throws IOException - * if an I/O problem occurs. - */ - protected Set readFileToSet(BufferedReader aReader) - throws IOException - { - Set words = new HashSet(); - String line; - while ((line = aReader.readLine()) != null) { - words.add(line.toLowerCase()); - } - - return words; - } - - @Override - public List getAll() - { - return new ArrayList(words); - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/Affix.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/Affix.java deleted file mode 100644 index 66855fddda..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/Affix.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.igerman98; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * Affix data model - * - */ -public class Affix -{ - - /** - * Key for prefixed in affix files - */ - private static final String PREFIX_KEY = "PFX"; - - /** - * Key for suffixes in the affix files - */ - private static final String SUFFIX_KEY = "SFX"; - - private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; - private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - - private AffixType type; - private char flag; - private String stripping; - private String affix; - private String condition; - private Pattern conditionPattern; - private boolean crossProduct; - - public Affix(AffixType aType) - { - type = aType; - } - - public Affix(String aKey) - { - if (aKey.equals(PREFIX_KEY)) { - type = AffixType.PREFIX; - } - else if (aKey.equals(SUFFIX_KEY)) { - type = AffixType.SUFFIX; - } - else { - throw new RuntimeException(aKey + " do not exist"); - } - } - - public boolean isCrossProduct() - { - return crossProduct; - } - - public void setCrossProduct(boolean aCrossProduct) - { - crossProduct = aCrossProduct; - } - - public AffixType getType() - { - return type; - } - - public void setType(AffixType aType) - { - type = aType; - } - - public char getFlag() - { - return flag; - } - - public void setFlag(char aFlag) - { - flag = aFlag; - } - - public String getStripping() - { - return stripping; - } - - public void setStripping(String aStripping) - { - stripping = aStripping; - } - - public String getAffix() - { - return affix; - } - - public void setAffix(String aAffix) - { - affix = aAffix; - } - - public String getCondition() - { - return condition; - } - - public void setCondition(String aCondition) - { - condition = aCondition; - - String regExp; - - switch (type) { - case PREFIX: - regExp = String.format(PREFIX_CONDITION_REGEX_PATTERN, aCondition); - break; - case SUFFIX: - regExp = String.format(SUFFIX_CONDITION_REGEX_PATTERN, aCondition); - break; - default: - throw new RuntimeException(type.toString() - + " is not supported"); - } - - conditionPattern = Pattern.compile(regExp); - } - - /** - * Adopt this affix on a given word - * - * @param aWord - * a word. - * @return The word with a change prefix or affix - */ - public String handleWord(String aWord) - { - Matcher m = conditionPattern.matcher(aWord); - - if (m != null && m.matches()) { - if (type.equals(AffixType.PREFIX)) { - return handlePrefix(aWord); - } - else if (type.equals(AffixType.SUFFIX)) { - return handleSuffix(aWord); - } - } - - return null; - } - - private String handlePrefix(String aWord) - { - if (stripping.equals("0") || aWord.startsWith(stripping)) { - int start = 0; - if (!stripping.equals("0") && aWord.startsWith(stripping)) { - start = aWord.length() - stripping.length(); - } - - return affix + aWord.substring(start); - } - - return null; - } - - private String handleSuffix(String aWord) - { - if (stripping.equals("0") || aWord.endsWith(stripping)) { - int end = aWord.length(); - if (!stripping.equals("0") && aWord.endsWith(stripping)) { - end = aWord.length() - stripping.length(); - } - - return aWord.substring(0, end) + affix; - } - - return null; - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/package-info.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/package-info.java deleted file mode 100644 index 5f6cfa12c5..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -/** - * This package contains the Affix models used by the dictionary classes. - */ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.igerman98; \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/package-info.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/package-info.java deleted file mode 100644 index b0d426e7f0..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/package-info.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -/** - * This package contains dictionary classes. Currently you have to options. You can work with - * your own dictionary or with popular IGerman98 Dictionary which is part of nearly all spell - * checkers. - * - * If you want to use your own dictionary you have to create a file that contains your words. Each - * word in one line. Then you can use the - * {@link de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.SimpleDictionary} class. - * - * If you want to use the IGerman98 dictionary you can use - * the {@link de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.German98Dictionary}. - * - * Additional this package contains the {@link de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes} class. - * This is a simple dictionary and hold all possible morphemes. - * - * If you want to code you own dictionary use the {@link de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary} - * interface. - */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/AbstractRanker.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/AbstractRanker.java deleted file mode 100644 index a47f02db22..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/AbstractRanker.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; - -import java.math.BigInteger; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import org.apache.ivy.util.cli.CommandLine; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.NGramModel; - -/** - * Contains base method for the ranking algorithms - * - */ -public abstract class AbstractRanker implements Ranker -{ - - private Finder finder; - - /** - * Empty constructor - * - * Use setFinder before using this class - */ - public AbstractRanker() { - - } - - public AbstractRanker(Finder aFinder) - { - finder = aFinder; - } - - public Finder getFinder() - { - return finder; - } - - /** - * Gets the frequency of a Split Element - * - * @param aWord - * a fragment. - * @return the frequency. - */ - protected BigInteger freq(Fragment aWord) - { - return finder.freq(aWord.getWord()); - } - - /** - * Returns the frequency of n-grams that contain both split elements - * - * @param aWord1 - * a fragment. - * @param aWord2 - * another fragment. - * @return the n-gram frequency. - */ - protected BigInteger freq(Fragment aWord1, Fragment aWord2) - { - return freq(new String[] { aWord1.getWord(), aWord2.getWord() }); - } - - /** - * Returns the frequency for a array of words - * - * @param aWords - * the words. - * @return the frequency. - */ - protected BigInteger freq(String[] aWords) - { - BigInteger total = BigInteger.valueOf(0l); - - for (NGramModel gram : finder.find(aWords)) { - total = total.add(BigInteger.valueOf(gram.getFreq())); - } - - return total; - } - - public final static String INDEX_OPTION = "luceneIndex"; - public final static String LIMIT_OPTION = "limit"; - - public static int getLimitOption(CommandLine aCmd) - { - int i = Integer.MAX_VALUE; - if (aCmd.hasOption(LIMIT_OPTION)) { - i = Integer.valueOf(aCmd.getOptionValue(LIMIT_OPTION)); - } - - return i; - } - - public static String getIndexPathOption(CommandLine aCmd) - { - return aCmd.getOptionValue(INDEX_OPTION); - } - - @Override - public void setFinder(Finder aFinder) { - finder = aFinder; - } - - /** - * Expects that the splits list contains at least one element and that this is the unsplit word. - * - * @param aSplits - * the splits. - * @return the filtered splits. - */ - public static List filterAndSort(List aSplits) { - List filtered = new ArrayList(); - for (DecompoundedWord s : aSplits) { - if (!Double.isInfinite(s.getWeight()) && !Double.isInfinite(s.getWeight()) - && (s.getWeight() > 0.0)) { - filtered.add(s); - } - } - Collections.sort(filtered); - - if (filtered.isEmpty()) { - filtered.add(aSplits.get(0)); - } - - return filtered; - } - - @Override - public DecompoundedWord highestRank(DecompoundingTree aTree){ - return highestRank(aTree.getRoot(), null); - } - - public abstract DecompoundedWord highestRank(ValueNode aParent, - List aPath); - -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/CompoundProbabilityRanker.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/CompoundProbabilityRanker.java deleted file mode 100644 index 80f9be19c3..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/CompoundProbabilityRanker.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; - -import java.util.Collections; -import java.util.List; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; - -/** - * Probability based ranking method - * - */ -public class CompoundProbabilityRanker - extends AbstractRanker - implements RankerList -{ - /** - * Empty constructor - * - * Use {@link #setFinder(Finder)} before using this class - */ - public CompoundProbabilityRanker() { - } - - /** - * Constructor - * - * @param aFinder - * a finder. - */ - public CompoundProbabilityRanker(Finder aFinder) - { - super(aFinder); - } - - @Override - public DecompoundedWord highestRank(List aSplits) - { - return rank(aSplits).get(0); - } - - @Override - public List rank(List aSplits) - { - for (DecompoundedWord split : aSplits) { - split.setWeight(calcRank(split)); - } - - List result = filterAndSort(aSplits); - Collections.sort(result, Collections.reverseOrder()); - - return result; - } - - /** - * Calculates the weight for a split - */ - private float calcRank(DecompoundedWord aSplit) - { - float result = 0; - - for (Fragment elem : aSplit.getSplits()) { - result += -1 * Math.log(freq(elem).doubleValue() / getFinder().getUnigramCount().doubleValue()); - } - - return result; - } - - /** - * Searches a a path throw the tree - */ - @Override - public DecompoundedWord highestRank(ValueNode aParent, - List aPath) - { - if (aPath != null) { - aPath.add(aParent.getValue()); - } - - List children = aParent.getChildrenValues(); - if (children.size() == 0) { - return aParent.getValue(); - } - - children.add(aParent.getValue()); - List result = rank(children); - DecompoundedWord best = result.get(0); - - if (best.equals(aParent.getValue())) { - // None of the children get a better score than the parent - return aParent.getValue(); - } - else { - // Find the child node that ranked best and recurse - for (ValueNode split : aParent.getChildren()) { - if (best.equals(split.getValue())) { - return highestRank(split, aPath); - } - } - } - - return null; - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/FrequencyGeometricMeanRanker.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/FrequencyGeometricMeanRanker.java deleted file mode 100644 index ff320867a9..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/FrequencyGeometricMeanRanker.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; - -import java.util.List; - -import org.apache.commons.math3.stat.descriptive.SummaryStatistics; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; - -/** - * Frequency based ranking algorithm. See doc folder for more informations. - * - */ -public class FrequencyGeometricMeanRanker - extends AbstractRanker - implements RankerList -{ - /** - * Empty constructor - * - * Use {@link #setFinder(Finder)} before using this class - */ - public FrequencyGeometricMeanRanker() { - - } - - public FrequencyGeometricMeanRanker(Finder aFinder) - { - super(aFinder); - } - - @Override - public DecompoundedWord highestRank(List aSplits) - { - return rank(aSplits).get(0); - } - - @Override - public List rank(List aSplits) - { - for (DecompoundedWord split : aSplits) { - split.setWeight(calcRank(split)); - } - - return filterAndSort(aSplits); - } - - /** - * Calculates the weight for a split - */ - private double calcRank(DecompoundedWord aSplit) - { - SummaryStatistics stats = new SummaryStatistics(); - for (Fragment elem : aSplit.getSplits()) { - stats.addValue(freq(elem).doubleValue()); - } - return stats.getGeometricMean(); - } - - /** - * Searches a a path throw the tree - */ - @Override - public DecompoundedWord highestRank(ValueNode aParent, - List aPath) - { - if (aPath != null) { - aPath.add(aParent.getValue()); - } - - List children = aParent.getChildrenValues(); - if (children.size() == 0) { - return aParent.getValue(); - } - - children.add(aParent.getValue()); - List result = rank(children); - DecompoundedWord best = result.get(0); - - if (best.equals(aParent.getValue())) { - // None of the childs get a better score than the parent - return aParent.getValue(); - } - else { - // Find the child node that ranked best and recurse - for (ValueNode split : aParent.getChildren()) { - if (best.equals(split.getValue())) { - return highestRank(split, aPath); - } - } - } - - return null; - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/MutualInformationRanker.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/MutualInformationRanker.java deleted file mode 100644 index 448242484c..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/MutualInformationRanker.java +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; - -import java.math.BigInteger; -import java.util.List; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; - -/** - * Mutual informationen based ranking algorithm. See doc folder for more - * information - * - */ -public class MutualInformationRanker - extends AbstractRanker - implements RankerList -{ - /** - * Empty constructor - * - * Use {@link #setFinder(Finder)} before using this class - */ - public MutualInformationRanker() { - } - - public MutualInformationRanker(Finder aFinder) - { - super(aFinder); - } - - @Override - public DecompoundedWord highestRank(List aSplits) - { - return rank(aSplits).get(0); - } - - @Override - public List rank(List aSplits) - { - for (DecompoundedWord split : aSplits) { - double weight = calcRank(split); - if (Double.isInfinite(split.getWeight()) || Double.isNaN(split.getWeight())) { - weight = 0.0; - } - split.setWeight(weight); - } - - return filterAndSort(aSplits); - } - - /** - * Calculates the weight for a split - */ - private float calcRank(DecompoundedWord aSplit) - { - double total = 0; - double count = 0; - - BigInteger unigramCount = getFinder().getUnigramCount(); - - if (aSplit.getSplits().size() == 1) { - // Entropy for single words - Fragment w = aSplit.getSplits().get(0); - double p = freq(w).doubleValue() / unigramCount.doubleValue(); - - return (float) ((-1) * p * Math.log(p)); - } - - // Mutual Information for splits. - for (int i = 1; i < aSplit.getSplits().size(); i++) { - count++; - - Fragment w1 = aSplit.getSplits().get(i - 1); - Fragment w2 = aSplit.getSplits().get(i); - // Look up unigram frequencies first - this is fast and allows us to bail out early - BigInteger w1f = freq(w1); - if (w1f.equals(BigInteger.ZERO)) { - continue; - } - - BigInteger w2f = freq(w2); - if (w2f.equals(BigInteger.ZERO)) { - continue; - } - - // This is a slow lookup that we only do if the unigram frequencies are greate than 0 - double a = freq(w1, w2).multiply(unigramCount).doubleValue(); - if (a == 0d) { - continue; - } - - // Finally calculate - double b = w1f.multiply(w2f).doubleValue(); - total += Math.log(a / b); - } - - return (float) (total / count); - } - - - /** - * Searches a a path throw the tree - */ - @Override - public DecompoundedWord highestRank(ValueNode aParent, - List aPath) - { - if (aPath != null) { - aPath.add(aParent.getValue()); - } - - List children = aParent.getChildrenValues(); - if (children.size() == 0) { - return aParent.getValue(); - } - - children.add(aParent.getValue()); - List result = rank(children); - DecompoundedWord best = result.get(0); - - if (best.equals(aParent.getValue())) { - // None of the childs get a better score than the parent - return aParent.getValue(); - } - else { - // Find the child node that ranked best and recurse - for (ValueNode split : aParent.getChildren()) { - if (best.equals(split.getValue())) { - return highestRank(split, aPath); - } - } - } - - return null; - } - - - - - - - - - - - - - - - - - - - - - - - - -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/package-info.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/package-info.java deleted file mode 100644 index b10d30e597..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -/** - * Contains ranking algorithm. - */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/AsvToolboxSplitterAlgorithm.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/AsvToolboxSplitterAlgorithm.java deleted file mode 100644 index ac4a6c2b3d..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/AsvToolboxSplitterAlgorithm.java +++ /dev/null @@ -1,551 +0,0 @@ -/* - * Copyright 2013 - - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import static java.util.Arrays.asList; - -import java.io.File; -import java.util.StringTokenizer; -import java.util.Vector; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.uni_leipzig.asv.utils.Pretree; - -public class AsvToolboxSplitterAlgorithm -implements SplitterAlgorithm -{ - private final Zerleger2 splitter; - - private final Log logger; - - public AsvToolboxSplitterAlgorithm(File kompVVicTree, File kompVHic, File grfExt) - throws ResourceInitializationException - { - logger = LogFactory.getLog(this.getClass()); - splitter = new Zerleger2(); - splitter.init(kompVVicTree.getAbsolutePath(), kompVHic.getAbsolutePath(), - grfExt.getAbsolutePath()); - } - - @Override - public DecompoundingTree split(String aWord) - { - // splitter.kZerlegung("katalogleichen"); - // splitter.kZerlegung("nischenthemen"); - // splitter.kZerlegung("brennbaukästen"); - // splitter.kZerlegung("autokorrelationszeit"); - // splitter.kZerlegung("providerdaten"); - // splitter.kZerlegung("zahnärzten"); - - logger.debug("SPLITTING WORD: "+aWord); - Vector split = splitter.kZerlegung(aWord); - String joined = StringUtils.join(split, "").replace("(", "").replace(")", ""); - if (!joined.equals(aWord)) { - logger.error("Failed while splitting " + aWord + " into " + split); - } - - if (StringUtils.join(split, "").contains("()")) { - logger.error(aWord + " -> " + split); - throw new IllegalStateException("Failed while splitting " + aWord + " into " + split, null); - } - - StringBuilder splitStr = new StringBuilder(); - for (int i = 0; i < split.size(); i++) { - if ((splitStr.length() > 0) && !split.get(i).startsWith("(")) { - splitStr.append("+"); - } - splitStr.append(split.get(i)); - } - - return new DecompoundingTree(splitStr.toString()); - } - - @Override - public void setDictionary(Dictionary aDict) - { - // Nothing to do - } - - @Override - public void setLinkingMorphemes(LinkingMorphemes aMorphemes) - { - // Nothing to do - } - - @Override - public void setMaximalTreeDepth(int aDepth) - { - // Nothing to do - } - - public class Zerleger2 - { - Pretree kompvvTree = new Pretree(); - Pretree kompvhTree = new Pretree(); - Pretree grfTree = new Pretree(); - String anweisungGrf = new String(); - String anweisungKomp = new String(); - // boolean d = true; // debugguing - - String reverse(String torev) - { - String ret = new String(); - for (int i = torev.length(); i > 0; i--) { - ret += torev.substring(i - 1, i); - } - return ret; - } - - public Vector kZerlegung(String aAktwort) - { - // if (d) { - // logger.debug("grf: " + aAktwort + "->"); - // } - String aktwort = grundFormReduktion(aAktwort); - // if (d) { - // logger.debug(aktwort); - // } - Vector retvec = new Vector(); - String classvv = new String(); - String classvh = new String(); - String zahlStrvv = "", zahlStrvh = "", suffixvv = "", suffixvh = "", vvteil1 = "", vhteil1 = "", vvteil2 = "", vhteil2 = ""; - Vector zervh = new Vector(); - Vector zervv = new Vector(); - int zahlvv = 0, zahlvh = 0; - boolean vhOk, vvOk; - // if (d) { - // logger.debug("Zerlege " + aktwort); - // } - classvv = kompvvTree.classify(aktwort + "<"); - classvh = kompvhTree.classify(reverse(aktwort) + "<"); - // if (d) { - // logger.debug("VV liefert " + classvv); - // } - // if (d) { - // logger.debug("VH liefert " + classvh); - // } - - zervv = new Vector(); - zervh = new Vector(); - zervv.addElement(aktwort); - zervh.addElement(aktwort); - vvOk = true; - vhOk = true; - if (classvv.equals("undecided")) { - vvOk = false; - } - if (classvh.equals("undecided")) { - vhOk = false; - } - - if (vvOk) { - for (int i = 0; i < classvv.length(); i++) { - char c = classvv.charAt(i); - // if (d) { - // logger.debug("Parse: " + c + " " + (int) c); - // } - if ((c < 58) && (c > 47)) { - zahlStrvv += c; - } - else { - suffixvv += c; - } - } // rof i - } - if (vhOk) { - for (int i = 0; i < classvh.length(); i++) { - char c = classvh.charAt(i); - // if (d) { - // logger.info("Parse: " + c + " " + (int) c); - // } - if ((c < 58) && (c > 47)) { - zahlStrvh += c; - } - else { - suffixvh += c; - } - } // rof i - } - - if (vvOk) { - zahlvv = new Integer(zahlStrvv).intValue(); - } - if (vhOk) { - zahlvh = new Integer(zahlStrvh).intValue(); - } - - if (vvOk) { - if (zahlvv >= aktwort.length()) { - vvOk = false; - } - } - - if (vhOk) { - if (zahlvh >= aktwort.length()) { - vhOk = false; - } - } - - if (vvOk) { - for (int i = 0; i < suffixvv.length(); i++) { - // if (d) { - // logger.debug("VV matche " + suffixvv.charAt(i) + " und " - // + aktwort.charAt(zahlvv + i)); - // } - if (aktwort.length() > (zahlvv + i)) { - if (suffixvv.charAt(i) != aktwort.charAt(zahlvv + i)) { - vvOk = false; - } - } - else { - vvOk = false; - } - } - } - if (vhOk) { - for (int i = 0; i < suffixvh.length(); i++) { - if (suffixvh.charAt(i) != aktwort.charAt(zahlvh + 1 + i)) { - vvOk = false; - } - } - } - - // nun abschneiden durchf�hren - if (vvOk) { - zervv.removeElement(aktwort); - vvteil1 = aktwort.substring(0, zahlvv); - vvteil2 = aktwort.substring(zahlvv + suffixvv.length(), aktwort.length()); - zervv.addElement(vvteil1); - zervv.addElement(vvteil2); - // if (d) { - // logger.debug("VV zerlegt in " + vvteil1 + " " + vvteil2); - // } - if (vvteil2.length() <= 3) { - vvOk = false; - } - - } - if (vhOk) { - zervh.removeElement(aktwort); - vhteil1 = aktwort.substring(0, aktwort.length() - zahlvh); - vhteil2 = aktwort.substring(aktwort.length() - (zahlvh + suffixvh.length()), - aktwort.length()); - zervh.addElement(vhteil1); - zervh.addElement(vhteil2); - // if (d) { - // logger.debug("VH zerlegt in " + vhteil1 + " " + vhteil2); - // } - - if (vhteil1.length() <= 3) { - vhOk = false; - } - - } - if (vvOk && vhOk) { // beide ok - if (vvteil1.equals(vhteil1)) { - retvec.addElement(vvteil1); - if (vhteil2.length() < vvteil2.length()) { - retvec.addElement(vhteil2); - } - else if (vhteil2.length() > vvteil2.length()) { - retvec.addElement(vvteil2); - } - } - else if ((vhteil1.length() - vvteil1.length()) < 3) { - retvec.addElement(vvteil1); - if (vhteil2.length() < vvteil2.length()) { - retvec.addElement(vhteil2); - } - else if (vhteil2.length() > vvteil2.length()) { - retvec.addElement(vvteil2); - } - } - // sonst 3 teile - else { - retvec.addElement(vvteil1); - retvec.addElement(aktwort.substring(vvteil1.length() + suffixvv.length(), - aktwort.length() - zahlvh)); - retvec.addElement(vhteil2); - } - if (vvteil2.equals(vhteil2)) { - retvec.addElement(vvteil2); - } - - } - else if (vvOk && !vhOk) { // nur vvOK - retvec.addElement(vvteil1); - retvec.addElement(vvteil2); - } - else if (vhOk && !vvOk) { // nur vhOK - retvec.addElement(vhteil1); - retvec.addElement(vhteil2); - } - else { // keine Zerlegung gefunden -> lassen - retvec.addElement(aktwort); - } - - // if (d) { - // logger.debug("Pre-Ergebnis: [" + aAktwort + "] -> " + retvec); - // } - - if (retvec.size() == 1) { - // If there was no split, return verbatim - retvec.clear(); - retvec.add(aAktwort); - } - else if (retvec.size() == 2) { - String w1 = retvec.get(0); - String w2 = retvec.get(1); - retvec.clear(); - - if (!aAktwort.startsWith(w1)) { - // throw new - // IllegalStateException("Bad assumption: first split not changed by - // grundFormReduktion"); - logger.error("Unable to map split " + asList(w1, w2) - + " back to original " + aAktwort + "... no splitting"); - retvec.add(aAktwort); - } - else { - retvec.add(w1); - int restBegin = w1.length(); - handleLastSplit(aAktwort, restBegin, w2, retvec); - } - } - else if (retvec.size() == 3) { - String w1 = retvec.get(0); - String w2 = retvec.get(1); - String w3 = retvec.get(2); - retvec.clear(); - - if (!aAktwort.startsWith(w1)) { - // throw new - // IllegalStateException("Bad assumption: first split not changed by - // grundFormReduktion"); - logger.error("Unable to map split " + asList(w1, w2, w3) - + " back to original " + aAktwort + "... no splitting"); - retvec.add(aAktwort); - } - else { - retvec.add(w1); - int morphi = aAktwort.indexOf(w2, w1.length()); - if (morphi == -1) { - // throw new - // IllegalStateException("Bad assumption: second split not changed by - // grundFormReduktion"); - logger.error("Unable to map split " + asList(w1, w2, w3) - + " back to original " + aAktwort + "... no splitting"); - retvec.clear(); - retvec.add(aAktwort); - } - else { - if (morphi > w1.length()) { - retvec.add("(" + aAktwort.substring(w1.length(), morphi) + ")"); - } - retvec.add(w2); - int restBegin = w2.length() + morphi; - handleLastSplit(aAktwort, restBegin, w3, retvec); - } - } - } - - // if (d) { - // logger.debug("Ergebnis: " + retvec); - // } - - Vector retvec2 = new Vector(); - - if (retvec.size() > 1) { - for (String aktelement : retvec) { - if (aktelement.startsWith("(")) { - // This is a linking morpheme - retvec2.addElement(aktelement); - continue; - } - Vector zwischen = kZerlegung(aktelement); - for (String string : zwischen) { - retvec2.addElement(string); - } - } - } // rof if enum - else { - retvec2 = retvec; - } - - // if (d) { - // logger.debug("Ergebnis2: " + retvec2.toString()); - // } - - return retvec2; - } // end kZerlegung - - public void handleLastSplit(String aAktwort, int aSplitBegin, String aSplit, - Vector retvec) - { - boolean found = false; - for (int i = 0; i < (aSplit.length() - 1); i++) { - int restOffset = aSplitBegin + i; - String rest = aAktwort.substring(restOffset); - String restGrund = grundFormReduktion(rest); - boolean isEqual = aSplit.equals(restGrund) || aSplit.equals(rest); - boolean isStartsWith = aSplit.startsWith(restGrund) || aSplit.startsWith(rest); - boolean isInvStartsWith = rest.startsWith(aSplit) || restGrund.startsWith(aSplit); - - if (isEqual || isStartsWith || isInvStartsWith) { - if (i > 0) { - retvec.add("(" + aAktwort.substring(aSplitBegin, restOffset) + ")"); - } - } - - if (isEqual) { - retvec.add(aAktwort.substring(restOffset)); - found = true; - } - else if (aSplit.startsWith(rest)) { - retvec.add(rest); - found = true; - } - else if (aSplit.startsWith(restGrund)) { - retvec.add(restGrund); - retvec.add("(" + rest.substring(restGrund.length()) + ")"); - found = true; - } - else if (isInvStartsWith) { - retvec.add(aSplit); - retvec.add("(" + rest.substring(aSplit.length()) + ")"); - // retvec.add(restGrund); - found = true; - } - - if (found) { - break; - } - } - - if (!found) { - retvec.add(aAktwort.substring(aSplitBegin)); - // throw new - // IllegalStateException("Bad assumption: last split does not start a grundform of - // a suffix of aktwort"); - } - } - - public String grundFormReduktion(String wort) - { - String retwort = wort; - anweisungGrf = grfTree.classify(reverse(wort)); - // logger.info("Anweisung f�r "+wort+": "+anweisungGrf); - if (!anweisungGrf.equals("undecided")) { - StringTokenizer kommatok = new StringTokenizer(anweisungGrf, ","); - anweisungGrf = kommatok.nextToken(); // nehme bei - // mehreren - // nurerstes - // parsing anweisung - String zahlStr = new String(); - String suffix = new String(); - - for (int i = 0; i < anweisungGrf.length(); i++) { - char c = anweisungGrf.charAt(i); - // logger.info("Parse: "+c+" "+(int)c); - if ((c < 58) && (c > 47)) { - zahlStr += c; - } - else { - suffix += c; - } - } // rof i - - // logger.info(anweisungGrf+"->"+zahlStr+"-"+suffix+"'"); - - int cutpos = new Integer(zahlStr).intValue(); - if (cutpos > retwort.length()) { - cutpos = retwort.length(); - } - retwort = retwort.substring(0, retwort.length() - cutpos) + suffix; - } - - String[] alternatives = retwort.split(";"); - if (alternatives.length > 0) { - retwort = retwort.split(";")[0]; - } - else { - retwort = wort; - } - - return retwort; - } - - public void init(String kompvv, String kompvh, String gfred) - { - // B�ume initialisierung - // logger.info("Loading from "+grfFile); - logger.debug("Loading " + kompvv + " ..."); - kompvvTree.load(kompvv); - // logger.debug("loaded"); - kompvvTree.setIgnoreCase(true); - kompvvTree.setThresh(0.51); - - // Kompositazerlegung-Beum initialisieren - logger.debug("Loading " + kompvh + " ..."); - - kompvhTree.load(kompvh); - - // logger.debug("loaded"); - kompvhTree.setIgnoreCase(true); // Trainingsmenge in - // lowcase :( - kompvhTree.setThresh(0.51); // weiss nicht? - logger.debug("Loading " + gfred + " ..."); - - grfTree.load(gfred); - // logger.debug("loaded"); - grfTree.setIgnoreCase(true); // Trainingsmenge in lowcase - // :( - grfTree.setThresh(0.46); // weiss nicht? - - } - - // inititialisieren mit pretrees - public void init2(Pretree kompvv, Pretree kompvh, Pretree gfred) - { - // B�ume initialisierung - - kompvvTree = kompvv; - kompvvTree.setIgnoreCase(true); - kompvvTree.setThresh(0.51); - - // Kompositazerlegung-Beum initialisieren - kompvhTree = kompvh; - kompvhTree.setIgnoreCase(true); // Trainingsmenge in lowcase - // :( - kompvhTree.setThresh(0.51); // weiss nicht? - - grfTree = gfred; - grfTree.setIgnoreCase(true); // Trainingsmenge in lowcase :( - grfTree.setThresh(0.46); // weiss nicht? - } - - } // end class Zerleger -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundingTree.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundingTree.java deleted file mode 100644 index ec1018a1ee..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundingTree.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import static java.util.Arrays.asList; - -import java.util.ArrayList; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Set; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; - -/** - * A split tree. Holds all splits in a tree structure. This can help to see the - * how the split algorithm works - * - */ -public class DecompoundingTree -{ - - private ValueNode root; - - public DecompoundingTree(String aWord) - { - root = new ValueNode(DecompoundedWord.createFromString(aWord)); - } - - public DecompoundingTree(DecompoundedWord aSplit) - { - root = new ValueNode(aSplit); - } - - public ValueNode getRoot() - { - return root; - } - - public void setRoot(ValueNode aRoot) - { - root = aRoot; - } - - /** - * Converts the tree to a list. - * - * @return the splits. - */ - public List getAllSplits() - { - Set splits = new LinkedHashSet(); - getAllSplitsRecursive(splits, getRoot(), true); - - return new ArrayList(splits); - } - - /** - * Converts the tree to a list. If there are splits, then the root node, which contains the - * unsplit word, is not returned. - * - * @return the splits. - */ - public List getSplits() - { - Set splits = new LinkedHashSet(); - getAllSplitsRecursive(splits, getRoot(), false); - - if (!splits.isEmpty()) { - return new ArrayList(splits); - } - else { - return asList(getRoot().getValue()); - } - } - - protected void getAllSplitsRecursive(Set aSplits, - ValueNode aNode, boolean aAddNode) - { - if (aAddNode) { - aSplits.add(aNode.getValue()); - } - if (aNode.hasChildren()) { - for (ValueNode child : aNode.getChildren()) { - getAllSplitsRecursive(aSplits, child, true); - } - } - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/JWordSplitterAlgorithm.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/JWordSplitterAlgorithm.java deleted file mode 100644 index ea235e778e..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/JWordSplitterAlgorithm.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - -import de.abelssoft.wordtools.jwordsplitter.AbstractWordSplitter; -import de.abelssoft.wordtools.jwordsplitter.impl.GermanWordSplitter; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; - -/** - * Wrapper for the JWordSplitter algorithm. - * - */ -public class JWordSplitterAlgorithm - implements SplitterAlgorithm -{ - private AbstractWordSplitter splitterHiddenLinking; - private AbstractWordSplitter splitter; - private Dictionary dict; - - @Override - public DecompoundingTree split(String aWord) - { - if (splitter == null) { - try { - splitterHiddenLinking = new InternalGermanWordSplitter(true); - splitter = new InternalGermanWordSplitter(false); - } - catch (IOException e) { - throw new IllegalStateException("Unable to access dictionary", e); - } - } - - DecompoundingTree t = new DecompoundingTree(aWord); - - // Just append on child to the tree - String[] splits = splitter.splitWord(aWord).toArray(new String[0]); - String[] splitsNoLink = splitterHiddenLinking.splitWord(aWord).toArray(new String[0]); - - if (splits.length != splitsNoLink.length) { - throw new IllegalStateException( - "Something is fishy - more must have happened than just hiding the links"); - } - - if (splits.length > 1) { - StringBuilder splitStringMorph = new StringBuilder(); - for (int i = 0; i < splits.length; i++) { - String base = splitsNoLink[i]; - String full = splits[i]; - - if (!full.startsWith(base)) { - throw new IllegalStateException( - "Something is fishy - links should be at the end"); - } - String link = full.substring(base.length()); - - // Split with linking morphemes - splitStringMorph.append(base); - if (link.length() > 0) { - splitStringMorph.append("(").append(link).append(")"); - } - splitStringMorph.append("+"); - } - - String splitStringMorphStr = splitStringMorph.toString(); - t.getRoot().addChild(new ValueNode(DecompoundedWord.createFromString(splitStringMorphStr))); - } - - return t; - } - - @Override - public void setDictionary(Dictionary aDict) - { - dict = aDict; - splitter = null; - splitterHiddenLinking = null; - } - - @Override - public void setLinkingMorphemes(LinkingMorphemes aMorphemes) - { - // Not needed for this algorithm - } - - @Override - public void setMaximalTreeDepth(int aDepth) - { - // Not needed for this algorithm - } - - private class InternalGermanWordSplitter extends GermanWordSplitter - { - public InternalGermanWordSplitter(boolean aHideConnectingCharacters) - throws IOException - { - super(aHideConnectingCharacters); - } - - @Override - protected Set getWordList() - throws IOException - { - if (dict == null) { - return super.getWordList(); - } - else { - return new HashSet(dict.getAll()); - } - } - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/package-info.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/package-info.java deleted file mode 100644 index cefe1463e0..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -/** - * Contains splitting algorithm. - * - * To implement you own splitting algorithm you can - * use the {@link de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.SplitterAlgorithm} interface. - */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/package-info.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/package-info.java deleted file mode 100644 index 2bf2b6eaf0..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -/** - * Some base classes for trees and a simple Trie implementation - */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.trie; \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/package-info.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/package-info.java deleted file mode 100644 index 50bcd56b3a..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -/** - * This package contains UIMA annotation classes. - * - * The annotators iterate over all token in a CAS and - * try to split them. - */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.annotator; \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/AsvToolboxSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/AsvToolboxSplitterResource.java deleted file mode 100644 index 5c46f20d50..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/AsvToolboxSplitterResource.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright 2010 - - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import static org.apache.uima.util.Level.SEVERE; - -import java.io.IOException; -import java.util.Map; - -import org.apache.uima.fit.descriptor.ExternalResource; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; - -public class AsvToolboxSplitterResource - extends SplitterResource -{ - - /** - * - * This external resource wraps the patricia trie which shall be used by the ASV Toolbox splitter. - * - * */ - - public static final String PARAM_PATRICIA_TRIES_RESOURCE = "patriciaTriesResource"; - @ExternalResource(key = PARAM_PATRICIA_TRIES_RESOURCE) - private SharedPatriciaTries patriciaTriesResource; - - @Override - public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - return true; - } - - @Override - public void afterResourcesInitialized() throws RuntimeException { - try { - splitter = patriciaTriesResource.getSplitter(); - } - catch (IOException e) { - getLogger().log(SEVERE, "IOException caught when getting the patricia trie resource"); - getLogger().log(SEVERE, e.getLocalizedMessage()); - getLogger().log(SEVERE, e.getMessage()); - throw new RuntimeException(e); - } - catch (ResourceInitializationException e) { - getLogger().log(SEVERE, "RuntimeException caught when getting the patrica trie resource"); - getLogger().log(SEVERE, e.getLocalizedMessage()); - getLogger().log(SEVERE, e.getMessage()); - throw new RuntimeException(e); - } - } - - @Override - public DecompoundingTree split(String aWord) - { - return splitter.split(aWord); - } - - @Override - public void setDictionary(Dictionary aDict) - { - splitter.setDictionary(aDict); - } - - @Override - public void setLinkingMorphemes(LinkingMorphemes aMorphemes) - { - splitter.setLinkingMorphemes(aMorphemes); - } - - @Override - public void setMaximalTreeDepth(int aDepth) - { - splitter.setMaximalTreeDepth(aDepth); - } - -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/BananaSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/BananaSplitterResource.java deleted file mode 100644 index 4fa0f540ce..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/BananaSplitterResource.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import java.util.Map; - -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.BananaSplitterAlgorithm; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; - -public class BananaSplitterResource - extends SplitterResource -{ - - @SuppressWarnings({ "rawtypes" }) - @Override - public boolean initialize(ResourceSpecifier aSpecifier, - Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - splitter = new BananaSplitterAlgorithm(); - return true; - } - - @Override - public DecompoundingTree split(String aWord) - { - return splitter.split(aWord); - } - - @Override - public void setDictionary(Dictionary aDict) - { - splitter.setDictionary(aDict); - } - - @Override - public void setLinkingMorphemes(LinkingMorphemes aMorphemes) - { - splitter.setLinkingMorphemes(aMorphemes); - } - - @Override - public void setMaximalTreeDepth(int aDepth) - { - splitter.setMaximalTreeDepth(aDepth); - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/DataDrivenSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/DataDrivenSplitterResource.java deleted file mode 100644 index 404b68a5b5..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/DataDrivenSplitterResource.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import java.util.Map; - -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DataDrivenSplitterAlgorithm; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; - -public class DataDrivenSplitterResource - extends SplitterResource -{ - - @SuppressWarnings({ "rawtypes" }) - @Override - public boolean initialize(ResourceSpecifier aSpecifier, - Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - splitter = new DataDrivenSplitterAlgorithm(); - - return true; - } - - @Override - public DecompoundingTree split(String aWord) - { - return splitter.split(aWord); - } - - @Override - public void setDictionary(Dictionary aDict) - { - splitter.setDictionary(aDict); - } - - @Override - public void setLinkingMorphemes(LinkingMorphemes aMorphemes) - { - splitter.setLinkingMorphemes(aMorphemes); - } - - @Override - public void setMaximalTreeDepth(int aDepth) - { - splitter.setMaximalTreeDepth(aDepth); - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/FrequencyRankerResource.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/FrequencyRankerResource.java deleted file mode 100644 index 026e261d54..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/FrequencyRankerResource.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import java.util.Map; - -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.ranking.FrequencyGeometricMeanRanker; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; - -public class FrequencyRankerResource - extends RankerResource -{ - - @SuppressWarnings({ "rawtypes" }) - @Override - public boolean initialize(ResourceSpecifier aSpecifier, - Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - ranker = new FrequencyGeometricMeanRanker(); - return true; - } - - @Override - public DecompoundedWord highestRank(DecompoundingTree aSplitTree) - { - return ranker.highestRank(aSplitTree); - } - - @Override - public void setFinder(Finder aFinder) - { - ranker.setFinder(aFinder); - } - -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/JWordSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/JWordSplitterResource.java deleted file mode 100644 index aa8f613c7f..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/JWordSplitterResource.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import java.util.Map; - -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.JWordSplitterAlgorithm; - -public class JWordSplitterResource - extends SplitterResource -{ - - @SuppressWarnings({ "rawtypes" }) - @Override - public boolean initialize(ResourceSpecifier aSpecifier, - Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - splitter = new JWordSplitterAlgorithm(); - - return true; - } - - @Override - public DecompoundingTree split(String aWord) - { - return splitter.split(aWord); - } - - @Override - public void setDictionary(Dictionary aDict) - { - splitter.setDictionary(aDict); - } - - @Override - public void setLinkingMorphemes(LinkingMorphemes aMorphemes) - { - splitter.setLinkingMorphemes(aMorphemes); - } - - @Override - public void setMaximalTreeDepth(int aDepth) - { - splitter.setMaximalTreeDepth(aDepth); - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/LeftToRightSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/LeftToRightSplitterResource.java deleted file mode 100644 index a4ccb42072..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/LeftToRightSplitterResource.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import java.util.Map; - -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.LeftToRightSplitterAlgorithm; - -public class LeftToRightSplitterResource - extends SplitterResource -{ - - @SuppressWarnings({ "rawtypes" }) - @Override - public boolean initialize(ResourceSpecifier aSpecifier, - Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - splitter = new LeftToRightSplitterAlgorithm(); - - return true; - } - - @Override - public DecompoundingTree split(String aWord) - { - return splitter.split(aWord); - } - - @Override - public void setDictionary(Dictionary aDict) - { - splitter.setDictionary(aDict); - } - - @Override - public void setMaximalTreeDepth(int aDepth) - { - splitter.setMaximalTreeDepth(aDepth); - } - - @Override - public void setLinkingMorphemes(LinkingMorphemes aMorphemes) - { - splitter.setLinkingMorphemes(aMorphemes); - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/MutualInformationRankerResource.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/MutualInformationRankerResource.java deleted file mode 100644 index edd2dfe86b..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/MutualInformationRankerResource.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import java.util.Map; - -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.ranking.MutualInformationRanker; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; - -public class MutualInformationRankerResource - extends RankerResource -{ - - @SuppressWarnings({ "rawtypes" }) - @Override - public boolean initialize(ResourceSpecifier aSpecifier, - Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - ranker = new MutualInformationRanker(); - return true; - } - - @Override - public DecompoundedWord highestRank(DecompoundingTree aSplitTree) - { - return ranker.highestRank(aSplitTree); - } - - @Override - public void setFinder(Finder aFinder) - { - ranker.setFinder(aFinder); - } - -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/ProbabilityRankerResource.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/ProbabilityRankerResource.java deleted file mode 100644 index f2b811dfc1..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/ProbabilityRankerResource.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import java.util.Map; - -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.ranking.CompoundProbabilityRanker; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; - -public class ProbabilityRankerResource - extends RankerResource -{ - - @SuppressWarnings({ "rawtypes" }) - @Override - public boolean initialize(ResourceSpecifier aSpecifier, - Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - ranker = new CompoundProbabilityRanker(); - - return true; - } - - @Override - public DecompoundedWord highestRank(DecompoundingTree aSplitTree) - { - return ranker.highestRank(aSplitTree); - } - - @Override - public void setFinder(Finder aFinder) - { - ranker.setFinder(aFinder); - } - -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/package-info.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/package-info.java deleted file mode 100644 index c004b8602d..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/package-info.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -/** - * This package contains UIMA resources classes. Each splitter and each ranker needs to have a UIMA - * resource so the user can combine different strategies and choose the one which best suits its - * needs. Besides the splitters and rankers, there is also a resource for the Dictionary, for - * Finder and for the LinkingMorphemes. - * - */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/NGramModel.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/NGramModel.java deleted file mode 100644 index 65e8d26ea4..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/NGramModel.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t; - -/** - * N-gram model class. - * - * This is only a data container for the n-grams - * - */ -public class NGramModel -{ - - private String gram; - private int freq; - - public NGramModel(String aGram, int aFreq) - { - gram = aGram; - freq = aFreq; - } - - public String getGram() - { - return gram; - } - - public void setGram(String aGram) - { - gram = aGram; - } - - public int getFreq() - { - return freq; - } - - public void setFreq(int aFreq) - { - freq = aFreq; - } - - public int getN() - { - return gram.split(" ").length; - } - - @Override - public String toString() - { - return "[" + gram + "] (freq=" + freq + ")"; - } -} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/package-info.java b/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/package-info.java deleted file mode 100644 index 086f3a7723..0000000000 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/package-info.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -/** - * This package contains all classes that are needed access the Google web1T data set. - * - * The {@link de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexer} creates a Lucence index from the - * data set and the {@link de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder} can be used to search on the - * Lucence index. - */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t; \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/Dictionary.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/Dictionary.java similarity index 94% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/Dictionary.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/Dictionary.java index cf12bfdeb1..12bc0075f6 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/Dictionary.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/Dictionary.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; +package org.dkpro.core.decompounding.dictionary; import java.util.List; diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/German98Dictionary.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/German98Dictionary.java new file mode 100644 index 0000000000..0cd6732314 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/German98Dictionary.java @@ -0,0 +1,211 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.dictionary; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.dkpro.core.decompounding.dictionary.igerman98.Affix; + +/** + * The igerman98 dictionary from www.j3e.de/ispell/igerman98 + * + * A current version of the german dictionary de_DE can be found in + * /src/main/resources/de_DE.dic + * + * This class can also be used to read other ispell/hunspell dictionaries. + * + */ +public class German98Dictionary + extends SimpleDictionary +{ + private static final String PREFIX_KEY = "PFX"; + private static final String SUFFIX_KEY = "SFX"; + + private Map> affixes = new HashMap>(); + + public German98Dictionary(File aDict, File aAffix, String aEncoding) + throws IOException + { + try ( + BufferedReader dis = new BufferedReader( + new InputStreamReader(new FileInputStream(aDict), aEncoding)); + BufferedReader ais = new BufferedReader( + new InputStreamReader(new FileInputStream(aAffix), aEncoding)); + ) { + readAffixFile(ais); + setWords(readFileToSet(dis)); + } + } + + public German98Dictionary(InputStream aDictStream, InputStream aAffixStream, String aEncoding) + throws IOException + { + try ( + BufferedReader dis = new BufferedReader( + new InputStreamReader(aDictStream, aEncoding)); + BufferedReader ais = new BufferedReader( + new InputStreamReader(aAffixStream, aEncoding)); + ) { + readAffixFile(ais); + setWords(readFileToSet(dis)); + } + } + + @Override + protected Set readFileToSet(BufferedReader aReader) + throws IOException + { + Set words = new HashSet(); + + // First line contains number of entries -> skip + String line = aReader.readLine(); + while ((line = aReader.readLine()) != null) { + if (line.equals("") || line.substring(0, 1).equals("#") + || line.substring(0, 1).equals("\t")) { + // Ignore lines starting with hash of tab (comments) + continue; + } + String[] split = line.split("/"); + String word = split[0].toLowerCase(); + char[] flags = {}; + + if (split.length > 1) { + flags = split[1].toCharArray(); + } + + if (word.length() > 2) { + words.add(word); + } + + if (flags.length > 0) { + words.addAll(buildWords(word, flags)); + } + } + + return words; + } + + /** + * Reads the affix file and processes the data + * + * @param aReader + * a reader. + */ + protected void readAffixFile(BufferedReader aReader) + { + try { + String line; + while ((line = aReader.readLine()) != null) { + if (line.startsWith(PREFIX_KEY) || line.startsWith(SUFFIX_KEY)) { + parseAffix(line, aReader); + } + } + } + catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + /** + * Parse a affix in the affix file + * + * @param aHeader + * The header of the affix + * @param aReader + * The file reader to read the rest of the affix + * @throws IOException if an I/O error occurs. + */ + private void parseAffix(String aHeader, BufferedReader aReader) + throws IOException + { + String[] args = aHeader.split("\\s+"); + + boolean crossProduct = args[2].equals("Y"); + int numLines = Integer.parseInt(args[3]); + + for (int i = 0; i < numLines; i++) { + String line = aReader.readLine(); + if (line == null) { + throw new IOException("Unexpected end of file after reading [" + i + + "] lines. Expected were [" + numLines + "] lines."); + } + String[] ruleArgs = line.split("\\s+"); + Character flag = ruleArgs[1].toCharArray()[0]; + + Affix a = new Affix(args[0]); + a.setCrossProduct(crossProduct); + a.setFlag(flag); + a.setStripping(ruleArgs[2]); + a.setAffix(ruleArgs[3]); + a.setCondition(ruleArgs[4]); + + List list = affixes.get(flag); + if (list == null) { + list = new ArrayList(); + affixes.put(flag, list); + } + list.add(a); + } + } + + /** + * Uses affixes to build new words + * + * @param aWord + * a word. + * @param aFlags + * flags. + * @return inflected word forms. + */ + protected List buildWords(String aWord, char[] aFlags) + { + List words = new ArrayList(); + for (char c : aFlags) { + List aff = affixes.get(c); + if (aff == null) { + continue; + } + for (Affix affix : aff) { + String w = affix.handleWord(aWord); + if (w != null && w.length() > 2) { + words.add(w); + } + } + } + + return words; + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/JWordSplitterDictionary.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/JWordSplitterDictionary.java new file mode 100644 index 0000000000..e13e040523 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/JWordSplitterDictionary.java @@ -0,0 +1,88 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.dictionary; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import de.abelssoft.tools.persistence.FastObjectSaver; + +/** + * The simple dictionary reads a file in which each line is a new word. + * + * This can be used to create your own dictionary from a corpus + * + */ +public class JWordSplitterDictionary + implements Dictionary +{ + private static final String SERIALIZED_DICT = "/wordsGerman.ser"; // dict inside the JAR + + private Set words; + + /** + * Constructor for a simple dictionary + */ + public JWordSplitterDictionary() + { + try { + words = (HashSet) FastObjectSaver.load(SERIALIZED_DICT); + } + catch (IOException e) { + e.printStackTrace(); + } + } + + @Override + public boolean contains(String aWord) + { + return words.contains(aWord); + } + + /** + * Reads the dictionary to set + * + * @param aReader + * a reader. + * @return A set of words + * @throws IOException + * if an I/O problem occurs. + */ + protected Set readFileToSet(BufferedReader aReader) + throws IOException + { + Set words = new HashSet(); + String line; + while ((line = aReader.readLine()) != null) { + words.add(line.toLowerCase()); + } + + return words; + } + + @Override + public List getAll() + { + return new ArrayList(words); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/LinkingMorphemes.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/LinkingMorphemes.java similarity index 98% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/LinkingMorphemes.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/LinkingMorphemes.java index ddae0ffab3..5b0decee8b 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/LinkingMorphemes.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/LinkingMorphemes.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; +package org.dkpro.core.decompounding.dictionary; import java.io.BufferedReader; import java.io.File; diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/SimpleDictionary.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/SimpleDictionary.java similarity index 96% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/SimpleDictionary.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/SimpleDictionary.java index deb836452d..1f12548e71 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/SimpleDictionary.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/SimpleDictionary.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; +package org.dkpro.core.decompounding.dictionary; import java.io.BufferedReader; import java.io.File; @@ -36,9 +36,9 @@ * */ public class SimpleDictionary - implements Dictionary + implements Dictionary { - private Set words; + private Set words; /** * Constructor for a simple dictionary diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/Affix.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/Affix.java new file mode 100644 index 0000000000..7389e298ed --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/Affix.java @@ -0,0 +1,196 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.dictionary.igerman98; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Affix data model + * + */ +public class Affix +{ + + /** + * Key for prefixed in affix files + */ + private static final String PREFIX_KEY = "PFX"; + + /** + * Key for suffixes in the affix files + */ + private static final String SUFFIX_KEY = "SFX"; + + private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; + private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; + + private AffixType type; + private char flag; + private String stripping; + private String affix; + private String condition; + private Pattern conditionPattern; + private boolean crossProduct; + + public Affix(AffixType aType) + { + type = aType; + } + + public Affix(String aKey) + { + if (aKey.equals(PREFIX_KEY)) { + type = AffixType.PREFIX; + } + else if (aKey.equals(SUFFIX_KEY)) { + type = AffixType.SUFFIX; + } + else { + throw new RuntimeException(aKey + " do not exist"); + } + } + + public boolean isCrossProduct() + { + return crossProduct; + } + + public void setCrossProduct(boolean aCrossProduct) + { + crossProduct = aCrossProduct; + } + + public AffixType getType() + { + return type; + } + + public void setType(AffixType aType) + { + type = aType; + } + + public char getFlag() + { + return flag; + } + + public void setFlag(char aFlag) + { + flag = aFlag; + } + + public String getStripping() + { + return stripping; + } + + public void setStripping(String aStripping) + { + stripping = aStripping; + } + + public String getAffix() + { + return affix; + } + + public void setAffix(String aAffix) + { + affix = aAffix; + } + + public String getCondition() + { + return condition; + } + + public void setCondition(String aCondition) + { + condition = aCondition; + + String regExp; + + switch (type) { + case PREFIX: + regExp = String.format(PREFIX_CONDITION_REGEX_PATTERN, aCondition); + break; + case SUFFIX: + regExp = String.format(SUFFIX_CONDITION_REGEX_PATTERN, aCondition); + break; + default: + throw new RuntimeException(type.toString() + + " is not supported"); + } + + conditionPattern = Pattern.compile(regExp); + } + + /** + * Adopt this affix on a given word + * + * @param aWord + * a word. + * @return The word with a change prefix or affix + */ + public String handleWord(String aWord) + { + Matcher m = conditionPattern.matcher(aWord); + + if (m != null && m.matches()) { + if (type.equals(AffixType.PREFIX)) { + return handlePrefix(aWord); + } + else if (type.equals(AffixType.SUFFIX)) { + return handleSuffix(aWord); + } + } + + return null; + } + + private String handlePrefix(String aWord) + { + if (stripping.equals("0") || aWord.startsWith(stripping)) { + int start = 0; + if (!stripping.equals("0") && aWord.startsWith(stripping)) { + start = aWord.length() - stripping.length(); + } + + return affix + aWord.substring(start); + } + + return null; + } + + private String handleSuffix(String aWord) + { + if (stripping.equals("0") || aWord.endsWith(stripping)) { + int end = aWord.length(); + if (!stripping.equals("0") && aWord.endsWith(stripping)) { + end = aWord.length() - stripping.length(); + } + + return aWord.substring(0, end) + affix; + } + + return null; + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/AffixType.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/AffixType.java similarity index 88% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/AffixType.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/AffixType.java index daaf42c439..253485e285 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/igerman98/AffixType.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/AffixType.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.igerman98; +package org.dkpro.core.decompounding.dictionary.igerman98; /** * Affix type @@ -23,5 +23,5 @@ */ public enum AffixType { - PREFIX, SUFFIX + PREFIX, SUFFIX } diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/package-info.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/package-info.java new file mode 100644 index 0000000000..a04f34ba4c --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/igerman98/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +/** + * This package contains the Affix models used by the dictionary classes. + */ +package org.dkpro.core.decompounding.dictionary.igerman98; diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/package-info.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/package-info.java new file mode 100644 index 0000000000..f6be4c0960 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/dictionary/package-info.java @@ -0,0 +1,37 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +/** + * This package contains dictionary classes. Currently you have to options. You can work with your + * own dictionary or with popular IGerman98 Dictionary which is part of nearly all spell checkers. + * + * If you want to use your own dictionary you have to create a file that contains your words. Each + * word in one line. Then you can use the + * {@link org.dkpro.core.decompounding.dictionary.SimpleDictionary} class. + * + * If you want to use the IGerman98 dictionary you can use the + * {@link org.dkpro.core.decompounding.dictionary.German98Dictionary}. + * + * Additional this package contains the + * {@link org.dkpro.core.decompounding.dictionary.LinkingMorphemes} class. This is a + * simple dictionary and hold all possible morphemes. + * + * If you want to code you own dictionary use the + * {@link org.dkpro.core.decompounding.dictionary.Dictionary} interface. + */ +package org.dkpro.core.decompounding.dictionary; diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/AbstractRanker.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/AbstractRanker.java new file mode 100644 index 0000000000..3eb8cde5d7 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/AbstractRanker.java @@ -0,0 +1,159 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.ranking; + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.ivy.util.cli.CommandLine; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.splitter.Fragment; +import org.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.web1t.Finder; +import org.dkpro.core.decompounding.web1t.NGramModel; + +/** + * Contains base method for the ranking algorithms + * + */ +public abstract class AbstractRanker implements Ranker +{ + private Finder finder; + + /** + * Empty constructor + * + * Use setFinder before using this class + */ + public AbstractRanker() { + + } + + public AbstractRanker(Finder aFinder) + { + finder = aFinder; + } + + public Finder getFinder() + { + return finder; + } + + /** + * Gets the frequency of a Split Element + * + * @param aWord + * a fragment. + * @return the frequency. + */ + protected BigInteger freq(Fragment aWord) + { + return finder.freq(aWord.getWord()); + } + + /** + * Returns the frequency of n-grams that contain both split elements + * + * @param aWord1 + * a fragment. + * @param aWord2 + * another fragment. + * @return the n-gram frequency. + */ + protected BigInteger freq(Fragment aWord1, Fragment aWord2) + { + return freq(new String[] { aWord1.getWord(), aWord2.getWord() }); + } + + /** + * Returns the frequency for a array of words + * + * @param aWords + * the words. + * @return the frequency. + */ + protected BigInteger freq(String[] aWords) + { + BigInteger total = BigInteger.valueOf(0l); + + for (NGramModel gram : finder.find(aWords)) { + total = total.add(BigInteger.valueOf(gram.getFreq())); + } + + return total; + } + + public final static String INDEX_OPTION = "luceneIndex"; + public final static String LIMIT_OPTION = "limit"; + + public static int getLimitOption(CommandLine aCmd) + { + int i = Integer.MAX_VALUE; + if (aCmd.hasOption(LIMIT_OPTION)) { + i = Integer.valueOf(aCmd.getOptionValue(LIMIT_OPTION)); + } + + return i; + } + + public static String getIndexPathOption(CommandLine aCmd) + { + return aCmd.getOptionValue(INDEX_OPTION); + } + + @Override + public void setFinder(Finder aFinder) { + finder = aFinder; + } + + /** + * Expects that the splits list contains at least one element and that this is the unsplit word. + * + * @param aSplits + * the splits. + * @return the filtered splits. + */ + public static List filterAndSort(List aSplits) { + List filtered = new ArrayList(); + for (DecompoundedWord s : aSplits) { + if (!Double.isInfinite(s.getWeight()) && !Double.isInfinite(s.getWeight()) + && (s.getWeight() > 0.0)) { + filtered.add(s); + } + } + Collections.sort(filtered); + + if (filtered.isEmpty()) { + filtered.add(aSplits.get(0)); + } + + return filtered; + } + + @Override + public DecompoundedWord highestRank(DecompoundingTree aTree) { + return highestRank(aTree.getRoot(), null); + } + + public abstract DecompoundedWord highestRank(ValueNode aParent, + List aPath); +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/CompoundProbabilityRanker.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/CompoundProbabilityRanker.java new file mode 100644 index 0000000000..3d212c2bfa --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/CompoundProbabilityRanker.java @@ -0,0 +1,125 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.ranking; + +import java.util.Collections; +import java.util.List; + +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.Fragment; +import org.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.web1t.Finder; + +/** + * Probability based ranking method + * + */ +public class CompoundProbabilityRanker + extends AbstractRanker + implements RankerList +{ + /** + * Empty constructor + * + * Use {@link #setFinder(Finder)} before using this class + */ + public CompoundProbabilityRanker() { + } + + /** + * Constructor + * + * @param aFinder + * a finder. + */ + public CompoundProbabilityRanker(Finder aFinder) + { + super(aFinder); + } + + @Override + public DecompoundedWord highestRank(List aSplits) + { + return rank(aSplits).get(0); + } + + @Override + public List rank(List aSplits) + { + for (DecompoundedWord split : aSplits) { + split.setWeight(calcRank(split)); + } + + List result = filterAndSort(aSplits); + Collections.sort(result, Collections.reverseOrder()); + + return result; + } + + /** + * Calculates the weight for a split + */ + private float calcRank(DecompoundedWord aSplit) + { + float result = 0; + + for (Fragment elem : aSplit.getSplits()) { + result += -1 * Math + .log(freq(elem).doubleValue() / getFinder().getUnigramCount().doubleValue()); + } + + return result; + } + + /** + * Searches a a path throw the tree + */ + @Override + public DecompoundedWord highestRank(ValueNode aParent, + List aPath) + { + if (aPath != null) { + aPath.add(aParent.getValue()); + } + + List children = aParent.getChildrenValues(); + if (children.size() == 0) { + return aParent.getValue(); + } + + children.add(aParent.getValue()); + List result = rank(children); + DecompoundedWord best = result.get(0); + + if (best.equals(aParent.getValue())) { + // None of the children get a better score than the parent + return aParent.getValue(); + } + else { + // Find the child node that ranked best and recurse + for (ValueNode split : aParent.getChildren()) { + if (best.equals(split.getValue())) { + return highestRank(split, aPath); + } + } + } + + return null; + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/DummyRanker.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/DummyRanker.java similarity index 78% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/DummyRanker.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/DummyRanker.java index 8fbbb2f97d..d2526eaf43 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/DummyRanker.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/DummyRanker.java @@ -16,14 +16,15 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; +package org.dkpro.core.decompounding.ranking; import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.trie.ValueNode; -public class DummyRanker extends AbstractRanker +public class DummyRanker + extends AbstractRanker { @Override @@ -39,7 +40,6 @@ public DecompoundedWord highestRank(ValueNode aParent, return aParent.getValue(); } - return children.get(children.size()-1); + return children.get(children.size() - 1); } - } diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/FrequencyGeometricMeanRanker.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/FrequencyGeometricMeanRanker.java new file mode 100644 index 0000000000..b411b528c4 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/FrequencyGeometricMeanRanker.java @@ -0,0 +1,114 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.ranking; + +import java.util.List; + +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.Fragment; +import org.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.web1t.Finder; + +/** + * Frequency based ranking algorithm. See doc folder for more informations. + * + */ +public class FrequencyGeometricMeanRanker + extends AbstractRanker + implements RankerList +{ + /** + * Empty constructor + * + * Use {@link #setFinder(Finder)} before using this class + */ + public FrequencyGeometricMeanRanker() { + + } + + public FrequencyGeometricMeanRanker(Finder aFinder) + { + super(aFinder); + } + + @Override + public DecompoundedWord highestRank(List aSplits) + { + return rank(aSplits).get(0); + } + + @Override + public List rank(List aSplits) + { + for (DecompoundedWord split : aSplits) { + split.setWeight(calcRank(split)); + } + + return filterAndSort(aSplits); + } + + /** + * Calculates the weight for a split + */ + private double calcRank(DecompoundedWord aSplit) + { + SummaryStatistics stats = new SummaryStatistics(); + for (Fragment elem : aSplit.getSplits()) { + stats.addValue(freq(elem).doubleValue()); + } + return stats.getGeometricMean(); + } + + /** + * Searches a a path throw the tree + */ + @Override + public DecompoundedWord highestRank(ValueNode aParent, + List aPath) + { + if (aPath != null) { + aPath.add(aParent.getValue()); + } + + List children = aParent.getChildrenValues(); + if (children.size() == 0) { + return aParent.getValue(); + } + + children.add(aParent.getValue()); + List result = rank(children); + DecompoundedWord best = result.get(0); + + if (best.equals(aParent.getValue())) { + // None of the childs get a better score than the parent + return aParent.getValue(); + } + else { + // Find the child node that ranked best and recurse + for (ValueNode split : aParent.getChildren()) { + if (best.equals(split.getValue())) { + return highestRank(split, aPath); + } + } + } + + return null; + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/MutualInformationRanker.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/MutualInformationRanker.java new file mode 100644 index 0000000000..4e40267d2f --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/MutualInformationRanker.java @@ -0,0 +1,154 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.ranking; + +import java.math.BigInteger; +import java.util.List; + +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.Fragment; +import org.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.web1t.Finder; + +/** + * Mutual informationen based ranking algorithm. + */ +public class MutualInformationRanker + extends AbstractRanker + implements RankerList +{ + /** + * Empty constructor + * + * Use {@link #setFinder(Finder)} before using this class + */ + public MutualInformationRanker() { + } + + public MutualInformationRanker(Finder aFinder) + { + super(aFinder); + } + + @Override + public DecompoundedWord highestRank(List aSplits) + { + return rank(aSplits).get(0); + } + + @Override + public List rank(List aSplits) + { + for (DecompoundedWord split : aSplits) { + double weight = calcRank(split); + if (Double.isInfinite(split.getWeight()) || Double.isNaN(split.getWeight())) { + weight = 0.0; + } + split.setWeight(weight); + } + + return filterAndSort(aSplits); + } + + /** + * Calculates the weight for a split + */ + private float calcRank(DecompoundedWord aSplit) + { + double total = 0; + double count = 0; + + BigInteger unigramCount = getFinder().getUnigramCount(); + + if (aSplit.getSplits().size() == 1) { + // Entropy for single words + Fragment w = aSplit.getSplits().get(0); + double p = freq(w).doubleValue() / unigramCount.doubleValue(); + + return (float) ((-1) * p * Math.log(p)); + } + + // Mutual Information for splits. + for (int i = 1; i < aSplit.getSplits().size(); i++) { + count++; + + Fragment w1 = aSplit.getSplits().get(i - 1); + Fragment w2 = aSplit.getSplits().get(i); + // Look up unigram frequencies first - this is fast and allows us to bail out early + BigInteger w1f = freq(w1); + if (w1f.equals(BigInteger.ZERO)) { + continue; + } + + BigInteger w2f = freq(w2); + if (w2f.equals(BigInteger.ZERO)) { + continue; + } + + // This is a slow lookup that we only do if the unigram frequencies are greate than 0 + double a = freq(w1, w2).multiply(unigramCount).doubleValue(); + if (a == 0d) { + continue; + } + + // Finally calculate + double b = w1f.multiply(w2f).doubleValue(); + total += Math.log(a / b); + } + + return (float) (total / count); + } + + + /** + * Searches a a path throw the tree + */ + @Override + public DecompoundedWord highestRank(ValueNode aParent, + List aPath) + { + if (aPath != null) { + aPath.add(aParent.getValue()); + } + + List children = aParent.getChildrenValues(); + if (children.size() == 0) { + return aParent.getValue(); + } + + children.add(aParent.getValue()); + List result = rank(children); + DecompoundedWord best = result.get(0); + + if (best.equals(aParent.getValue())) { + // None of the childs get a better score than the parent + return aParent.getValue(); + } + else { + // Find the child node that ranked best and recurse + for (ValueNode split : aParent.getChildren()) { + if (best.equals(split.getValue())) { + return highestRank(split, aPath); + } + } + } + + return null; + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/Ranker.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/Ranker.java similarity index 80% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/Ranker.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/Ranker.java index 6ef2689229..7fcdc5ed4e 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/Ranker.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/Ranker.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; +package org.dkpro.core.decompounding.ranking; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.web1t.Finder; /** * The ranking algorithm interface diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/RankerList.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/RankerList.java similarity index 90% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/RankerList.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/RankerList.java index 8dba728aa0..c2e21bedd1 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/RankerList.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/RankerList.java @@ -16,11 +16,11 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; +package org.dkpro.core.decompounding.ranking; import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; /** * Ranking interface for list of splits diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/package-info.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/package-info.java new file mode 100644 index 0000000000..7584f63393 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/ranking/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +/** + * Contains ranking algorithm. + */ +package org.dkpro.core.decompounding.ranking; diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/AsvToolboxSplitterAlgorithm.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/AsvToolboxSplitterAlgorithm.java new file mode 100644 index 0000000000..b8a79f0d98 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/AsvToolboxSplitterAlgorithm.java @@ -0,0 +1,551 @@ +/* + * Copyright 2013 + + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.splitter; + +import static java.util.Arrays.asList; + +import java.io.File; +import java.util.StringTokenizer; +import java.util.Vector; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; + +import de.uni_leipzig.asv.utils.Pretree; + +public class AsvToolboxSplitterAlgorithm + implements SplitterAlgorithm +{ + private final Zerleger2 splitter; + + private final Log logger; + + public AsvToolboxSplitterAlgorithm(File kompVVicTree, File kompVHic, File grfExt) + throws ResourceInitializationException + { + logger = LogFactory.getLog(this.getClass()); + splitter = new Zerleger2(); + splitter.init(kompVVicTree.getAbsolutePath(), kompVHic.getAbsolutePath(), + grfExt.getAbsolutePath()); + } + + @Override + public DecompoundingTree split(String aWord) + { + // splitter.kZerlegung("katalogleichen"); + // splitter.kZerlegung("nischenthemen"); + // splitter.kZerlegung("brennbaukästen"); + // splitter.kZerlegung("autokorrelationszeit"); + // splitter.kZerlegung("providerdaten"); + // splitter.kZerlegung("zahnärzten"); + + logger.debug("SPLITTING WORD: " + aWord); + Vector split = splitter.kZerlegung(aWord); + String joined = StringUtils.join(split, "").replace("(", "").replace(")", ""); + if (!joined.equals(aWord)) { + logger.error("Failed while splitting " + aWord + " into " + split); + } + + if (StringUtils.join(split, "").contains("()")) { + logger.error(aWord + " -> " + split); + throw new IllegalStateException("Failed while splitting " + aWord + " into " + split); + } + + StringBuilder splitStr = new StringBuilder(); + for (int i = 0; i < split.size(); i++) { + if ((splitStr.length() > 0) && !split.get(i).startsWith("(")) { + splitStr.append("+"); + } + splitStr.append(split.get(i)); + } + + return new DecompoundingTree(splitStr.toString()); + } + + @Override + public void setDictionary(Dictionary aDict) + { + // Nothing to do + } + + @Override + public void setLinkingMorphemes(LinkingMorphemes aMorphemes) + { + // Nothing to do + } + + @Override + public void setMaximalTreeDepth(int aDepth) + { + // Nothing to do + } + + public class Zerleger2 + { + Pretree kompvvTree = new Pretree(); + Pretree kompvhTree = new Pretree(); + Pretree grfTree = new Pretree(); + String anweisungGrf = new String(); + String anweisungKomp = new String(); + // boolean d = true; // debugguing + + String reverse(String torev) + { + String ret = new String(); + for (int i = torev.length(); i > 0; i--) { + ret += torev.substring(i - 1, i); + } + return ret; + } + + public Vector kZerlegung(String aAktwort) + { + // if (d) { + // logger.debug("grf: " + aAktwort + "->"); + // } + String aktwort = grundFormReduktion(aAktwort); + // if (d) { + // logger.debug(aktwort); + // } + Vector retvec = new Vector(); + String classvv = new String(); + String classvh = new String(); + String zahlStrvv = "", zahlStrvh = "", suffixvv = "", suffixvh = "", vvteil1 = "", vhteil1 = "", vvteil2 = "", vhteil2 = ""; + Vector zervh = new Vector(); + Vector zervv = new Vector(); + int zahlvv = 0, zahlvh = 0; + boolean vhOk, vvOk; + // if (d) { + // logger.debug("Zerlege " + aktwort); + // } + classvv = kompvvTree.classify(aktwort + "<"); + classvh = kompvhTree.classify(reverse(aktwort) + "<"); + // if (d) { + // logger.debug("VV liefert " + classvv); + // } + // if (d) { + // logger.debug("VH liefert " + classvh); + // } + + zervv = new Vector(); + zervh = new Vector(); + zervv.addElement(aktwort); + zervh.addElement(aktwort); + vvOk = true; + vhOk = true; + if (classvv.equals("undecided")) { + vvOk = false; + } + if (classvh.equals("undecided")) { + vhOk = false; + } + + if (vvOk) { + for (int i = 0; i < classvv.length(); i++) { + char c = classvv.charAt(i); + // if (d) { + // logger.debug("Parse: " + c + " " + (int) c); + // } + if ((c < 58) && (c > 47)) { + zahlStrvv += c; + } + else { + suffixvv += c; + } + } // rof i + } + if (vhOk) { + for (int i = 0; i < classvh.length(); i++) { + char c = classvh.charAt(i); + // if (d) { + // logger.info("Parse: " + c + " " + (int) c); + // } + if ((c < 58) && (c > 47)) { + zahlStrvh += c; + } + else { + suffixvh += c; + } + } // rof i + } + + if (vvOk) { + zahlvv = new Integer(zahlStrvv).intValue(); + } + if (vhOk) { + zahlvh = new Integer(zahlStrvh).intValue(); + } + + if (vvOk) { + if (zahlvv >= aktwort.length()) { + vvOk = false; + } + } + + if (vhOk) { + if (zahlvh >= aktwort.length()) { + vhOk = false; + } + } + + if (vvOk) { + for (int i = 0; i < suffixvv.length(); i++) { + // if (d) { + // logger.debug("VV matche " + suffixvv.charAt(i) + " und " + // + aktwort.charAt(zahlvv + i)); + // } + if (aktwort.length() > (zahlvv + i)) { + if (suffixvv.charAt(i) != aktwort.charAt(zahlvv + i)) { + vvOk = false; + } + } + else { + vvOk = false; + } + } + } + if (vhOk) { + for (int i = 0; i < suffixvh.length(); i++) { + if (suffixvh.charAt(i) != aktwort.charAt(zahlvh + 1 + i)) { + vvOk = false; + } + } + } + + // nun abschneiden durchf�hren + if (vvOk) { + zervv.removeElement(aktwort); + vvteil1 = aktwort.substring(0, zahlvv); + vvteil2 = aktwort.substring(zahlvv + suffixvv.length(), aktwort.length()); + zervv.addElement(vvteil1); + zervv.addElement(vvteil2); + // if (d) { + // logger.debug("VV zerlegt in " + vvteil1 + " " + vvteil2); + // } + if (vvteil2.length() <= 3) { + vvOk = false; + } + + } + if (vhOk) { + zervh.removeElement(aktwort); + vhteil1 = aktwort.substring(0, aktwort.length() - zahlvh); + vhteil2 = aktwort.substring(aktwort.length() - (zahlvh + suffixvh.length()), + aktwort.length()); + zervh.addElement(vhteil1); + zervh.addElement(vhteil2); + // if (d) { + // logger.debug("VH zerlegt in " + vhteil1 + " " + vhteil2); + // } + + if (vhteil1.length() <= 3) { + vhOk = false; + } + + } + if (vvOk && vhOk) { // beide ok + if (vvteil1.equals(vhteil1)) { + retvec.addElement(vvteil1); + if (vhteil2.length() < vvteil2.length()) { + retvec.addElement(vhteil2); + } + else if (vhteil2.length() > vvteil2.length()) { + retvec.addElement(vvteil2); + } + } + else if ((vhteil1.length() - vvteil1.length()) < 3) { + retvec.addElement(vvteil1); + if (vhteil2.length() < vvteil2.length()) { + retvec.addElement(vhteil2); + } + else if (vhteil2.length() > vvteil2.length()) { + retvec.addElement(vvteil2); + } + } + // sonst 3 teile + else { + retvec.addElement(vvteil1); + retvec.addElement(aktwort.substring(vvteil1.length() + suffixvv.length(), + aktwort.length() - zahlvh)); + retvec.addElement(vhteil2); + } + if (vvteil2.equals(vhteil2)) { + retvec.addElement(vvteil2); + } + + } + else if (vvOk && !vhOk) { // nur vvOK + retvec.addElement(vvteil1); + retvec.addElement(vvteil2); + } + else if (vhOk && !vvOk) { // nur vhOK + retvec.addElement(vhteil1); + retvec.addElement(vhteil2); + } + else { // keine Zerlegung gefunden -> lassen + retvec.addElement(aktwort); + } + + // if (d) { + // logger.debug("Pre-Ergebnis: [" + aAktwort + "] -> " + retvec); + // } + + if (retvec.size() == 1) { + // If there was no split, return verbatim + retvec.clear(); + retvec.add(aAktwort); + } + else if (retvec.size() == 2) { + String w1 = retvec.get(0); + String w2 = retvec.get(1); + retvec.clear(); + + if (!aAktwort.startsWith(w1)) { + // throw new + // IllegalStateException("Bad assumption: first split not changed by + // grundFormReduktion"); + logger.error("Unable to map split " + asList(w1, w2) + + " back to original " + aAktwort + "... no splitting"); + retvec.add(aAktwort); + } + else { + retvec.add(w1); + int restBegin = w1.length(); + handleLastSplit(aAktwort, restBegin, w2, retvec); + } + } + else if (retvec.size() == 3) { + String w1 = retvec.get(0); + String w2 = retvec.get(1); + String w3 = retvec.get(2); + retvec.clear(); + + if (!aAktwort.startsWith(w1)) { + // throw new + // IllegalStateException("Bad assumption: first split not changed by + // grundFormReduktion"); + logger.error("Unable to map split " + asList(w1, w2, w3) + + " back to original " + aAktwort + "... no splitting"); + retvec.add(aAktwort); + } + else { + retvec.add(w1); + int morphi = aAktwort.indexOf(w2, w1.length()); + if (morphi == -1) { + // throw new + // IllegalStateException("Bad assumption: second split not changed by + // grundFormReduktion"); + logger.error("Unable to map split " + asList(w1, w2, w3) + + " back to original " + aAktwort + "... no splitting"); + retvec.clear(); + retvec.add(aAktwort); + } + else { + if (morphi > w1.length()) { + retvec.add("(" + aAktwort.substring(w1.length(), morphi) + ")"); + } + retvec.add(w2); + int restBegin = w2.length() + morphi; + handleLastSplit(aAktwort, restBegin, w3, retvec); + } + } + } + + // if (d) { + // logger.debug("Ergebnis: " + retvec); + // } + + Vector retvec2 = new Vector(); + + if (retvec.size() > 1) { + for (String aktelement : retvec) { + if (aktelement.startsWith("(")) { + // This is a linking morpheme + retvec2.addElement(aktelement); + continue; + } + Vector zwischen = kZerlegung(aktelement); + for (String string : zwischen) { + retvec2.addElement(string); + } + } + } // rof if enum + else { + retvec2 = retvec; + } + + // if (d) { + // logger.debug("Ergebnis2: " + retvec2.toString()); + // } + + return retvec2; + } // end kZerlegung + + public void handleLastSplit(String aAktwort, int aSplitBegin, String aSplit, + Vector retvec) + { + boolean found = false; + for (int i = 0; i < (aSplit.length() - 1); i++) { + int restOffset = aSplitBegin + i; + String rest = aAktwort.substring(restOffset); + String restGrund = grundFormReduktion(rest); + boolean isEqual = aSplit.equals(restGrund) || aSplit.equals(rest); + boolean isStartsWith = aSplit.startsWith(restGrund) || aSplit.startsWith(rest); + boolean isInvStartsWith = rest.startsWith(aSplit) || restGrund.startsWith(aSplit); + + if (isEqual || isStartsWith || isInvStartsWith) { + if (i > 0) { + retvec.add("(" + aAktwort.substring(aSplitBegin, restOffset) + ")"); + } + } + + if (isEqual) { + retvec.add(aAktwort.substring(restOffset)); + found = true; + } + else if (aSplit.startsWith(rest)) { + retvec.add(rest); + found = true; + } + else if (aSplit.startsWith(restGrund)) { + retvec.add(restGrund); + retvec.add("(" + rest.substring(restGrund.length()) + ")"); + found = true; + } + else if (isInvStartsWith) { + retvec.add(aSplit); + retvec.add("(" + rest.substring(aSplit.length()) + ")"); + // retvec.add(restGrund); + found = true; + } + + if (found) { + break; + } + } + + if (!found) { + retvec.add(aAktwort.substring(aSplitBegin)); + // throw new + // IllegalStateException("Bad assumption: last split does not start a grundform of + // a suffix of aktwort"); + } + } + + public String grundFormReduktion(String wort) + { + String retwort = wort; + anweisungGrf = grfTree.classify(reverse(wort)); + // logger.info("Anweisung f�r "+wort+": "+anweisungGrf); + if (!anweisungGrf.equals("undecided")) { + StringTokenizer kommatok = new StringTokenizer(anweisungGrf, ","); + anweisungGrf = kommatok.nextToken(); // nehme bei + // mehreren + // nurerstes + // parsing anweisung + String zahlStr = new String(); + String suffix = new String(); + + for (int i = 0; i < anweisungGrf.length(); i++) { + char c = anweisungGrf.charAt(i); + // logger.info("Parse: "+c+" "+(int)c); + if ((c < 58) && (c > 47)) { + zahlStr += c; + } + else { + suffix += c; + } + } // rof i + + // logger.info(anweisungGrf+"->"+zahlStr+"-"+suffix+"'"); + + int cutpos = new Integer(zahlStr).intValue(); + if (cutpos > retwort.length()) { + cutpos = retwort.length(); + } + retwort = retwort.substring(0, retwort.length() - cutpos) + suffix; + } + + String[] alternatives = retwort.split(";"); + if (alternatives.length > 0) { + retwort = retwort.split(";")[0]; + } + else { + retwort = wort; + } + + return retwort; + } + + public void init(String kompvv, String kompvh, String gfred) + { + // B�ume initialisierung + // logger.info("Loading from "+grfFile); + logger.debug("Loading " + kompvv + " ..."); + kompvvTree.load(kompvv); + // logger.debug("loaded"); + kompvvTree.setIgnoreCase(true); + kompvvTree.setThresh(0.51); + + // Kompositazerlegung-Beum initialisieren + logger.debug("Loading " + kompvh + " ..."); + + kompvhTree.load(kompvh); + + // logger.debug("loaded"); + kompvhTree.setIgnoreCase(true); // Trainingsmenge in + // lowcase :( + kompvhTree.setThresh(0.51); // weiss nicht? + logger.debug("Loading " + gfred + " ..."); + + grfTree.load(gfred); + // logger.debug("loaded"); + grfTree.setIgnoreCase(true); // Trainingsmenge in lowcase + // :( + grfTree.setThresh(0.46); // weiss nicht? + + } + + // inititialisieren mit pretrees + public void init2(Pretree kompvv, Pretree kompvh, Pretree gfred) + { + // B�ume initialisierung + + kompvvTree = kompvv; + kompvvTree.setIgnoreCase(true); + kompvvTree.setThresh(0.51); + + // Kompositazerlegung-Beum initialisieren + kompvhTree = kompvh; + kompvhTree.setIgnoreCase(true); // Trainingsmenge in lowcase + // :( + kompvhTree.setThresh(0.51); // weiss nicht? + + grfTree = gfred; + grfTree.setIgnoreCase(true); // Trainingsmenge in lowcase :( + grfTree.setThresh(0.46); // weiss nicht? + } + + } // end class Zerleger +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/BananaSplitterAlgorithm.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/BananaSplitterAlgorithm.java similarity index 95% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/BananaSplitterAlgorithm.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/BananaSplitterAlgorithm.java index 4d5a69470e..57394a1a68 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/BananaSplitterAlgorithm.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/BananaSplitterAlgorithm.java @@ -15,16 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; +package org.dkpro.core.decompounding.splitter; + +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.trie.ValueNode; import de.drni.bananasplit.BananaSplit; import de.drni.bananasplit.Compound; import de.drni.bananasplit.affix.Affix; import de.drni.bananasplit.simpledict.SimpleDictEntry; import de.drni.bananasplit.simpledict.SimpleDictionaryInterface; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; /** * Wrapper for the banana splitter algorithm diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DataDrivenSplitterAlgorithm.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DataDrivenSplitterAlgorithm.java similarity index 95% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DataDrivenSplitterAlgorithm.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DataDrivenSplitterAlgorithm.java index 9cda9c927b..9a38376469 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DataDrivenSplitterAlgorithm.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DataDrivenSplitterAlgorithm.java @@ -16,16 +16,16 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; +package org.dkpro.core.decompounding.splitter; import java.util.ArrayList; import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.SimpleDictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.TrieStructure; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.dictionary.SimpleDictionary; +import org.dkpro.core.decompounding.trie.TrieStructure; +import org.dkpro.core.decompounding.trie.ValueNode; /** * A data driven algorithm, that uses a TRIE to look for splits diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundedWord.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DecompoundedWord.java similarity index 99% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundedWord.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DecompoundedWord.java index c480e2816c..c9e321e505 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundedWord.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DecompoundedWord.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; +package org.dkpro.core.decompounding.splitter; import java.util.ArrayList; import java.util.List; diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DecompoundingTree.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DecompoundingTree.java new file mode 100644 index 0000000000..e3e3e151b7 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/DecompoundingTree.java @@ -0,0 +1,102 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.splitter; + +import static java.util.Arrays.asList; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +import org.dkpro.core.decompounding.trie.ValueNode; + +/** + * A split tree. Holds all splits in a tree structure. This can help to see the + * how the split algorithm works + */ +public class DecompoundingTree +{ + private ValueNode root; + + public DecompoundingTree(String aWord) + { + root = new ValueNode(DecompoundedWord.createFromString(aWord)); + } + + public DecompoundingTree(DecompoundedWord aSplit) + { + root = new ValueNode(aSplit); + } + + public ValueNode getRoot() + { + return root; + } + + public void setRoot(ValueNode aRoot) + { + root = aRoot; + } + + /** + * Converts the tree to a list. + * + * @return the splits. + */ + public List getAllSplits() + { + Set splits = new LinkedHashSet(); + getAllSplitsRecursive(splits, getRoot(), true); + + return new ArrayList(splits); + } + + /** + * Converts the tree to a list. If there are splits, then the root node, which contains the + * unsplit word, is not returned. + * + * @return the splits. + */ + public List getSplits() + { + Set splits = new LinkedHashSet(); + getAllSplitsRecursive(splits, getRoot(), false); + + if (!splits.isEmpty()) { + return new ArrayList(splits); + } + else { + return asList(getRoot().getValue()); + } + } + + protected void getAllSplitsRecursive(Set aSplits, + ValueNode aNode, boolean aAddNode) + { + if (aAddNode) { + aSplits.add(aNode.getValue()); + } + if (aNode.hasChildren()) { + for (ValueNode child : aNode.getChildren()) { + getAllSplitsRecursive(aSplits, child, true); + } + } + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/Fragment.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/Fragment.java similarity index 98% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/Fragment.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/Fragment.java index 5fb39877bd..f200d1c386 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/Fragment.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/Fragment.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; +package org.dkpro.core.decompounding.splitter; /** * Data container for a split element. A split element contains a word and optional a linking diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/JWordSplitterAlgorithm.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/JWordSplitterAlgorithm.java new file mode 100644 index 0000000000..c0afb6c3c9 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/JWordSplitterAlgorithm.java @@ -0,0 +1,134 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.splitter; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.trie.ValueNode; + +import de.abelssoft.wordtools.jwordsplitter.AbstractWordSplitter; +import de.abelssoft.wordtools.jwordsplitter.impl.GermanWordSplitter; + +/** + * Wrapper for the JWordSplitter algorithm. + * + */ +public class JWordSplitterAlgorithm + implements SplitterAlgorithm +{ + private AbstractWordSplitter splitterHiddenLinking; + private AbstractWordSplitter splitter; + private Dictionary dict; + + @Override + public DecompoundingTree split(String aWord) + { + if (splitter == null) { + try { + splitterHiddenLinking = new InternalGermanWordSplitter(true); + splitter = new InternalGermanWordSplitter(false); + } + catch (IOException e) { + throw new IllegalStateException("Unable to access dictionary", e); + } + } + + DecompoundingTree t = new DecompoundingTree(aWord); + + // Just append on child to the tree + String[] splits = splitter.splitWord(aWord).toArray(new String[0]); + String[] splitsNoLink = splitterHiddenLinking.splitWord(aWord).toArray(new String[0]); + + if (splits.length != splitsNoLink.length) { + throw new IllegalStateException( + "Something is fishy - more must have happened than just hiding the links"); + } + + if (splits.length > 1) { + StringBuilder splitStringMorph = new StringBuilder(); + for (int i = 0; i < splits.length; i++) { + String base = splitsNoLink[i]; + String full = splits[i]; + + if (!full.startsWith(base)) { + throw new IllegalStateException( + "Something is fishy - links should be at the end"); + } + String link = full.substring(base.length()); + + // Split with linking morphemes + splitStringMorph.append(base); + if (link.length() > 0) { + splitStringMorph.append("(").append(link).append(")"); + } + splitStringMorph.append("+"); + } + + String splitStringMorphStr = splitStringMorph.toString(); + t.getRoot().addChild(new ValueNode( + DecompoundedWord.createFromString(splitStringMorphStr))); + } + + return t; + } + + @Override + public void setDictionary(Dictionary aDict) + { + dict = aDict; + splitter = null; + splitterHiddenLinking = null; + } + + @Override + public void setLinkingMorphemes(LinkingMorphemes aMorphemes) + { + // Not needed for this algorithm + } + + @Override + public void setMaximalTreeDepth(int aDepth) + { + // Not needed for this algorithm + } + + private class InternalGermanWordSplitter extends GermanWordSplitter + { + public InternalGermanWordSplitter(boolean aHideConnectingCharacters) + throws IOException + { + super(aHideConnectingCharacters); + } + + @Override + protected Set getWordList() + throws IOException + { + if (dict == null) { + return super.getWordList(); + } + else { + return new HashSet(dict.getAll()); + } + } + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/LeftToRightSplitterAlgorithm.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/LeftToRightSplitterAlgorithm.java similarity index 95% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/LeftToRightSplitterAlgorithm.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/LeftToRightSplitterAlgorithm.java index ca40aa5635..957c307f24 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/LeftToRightSplitterAlgorithm.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/LeftToRightSplitterAlgorithm.java @@ -16,14 +16,14 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; +package org.dkpro.core.decompounding.splitter; import java.util.ArrayList; import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.trie.ValueNode; /** * Implements a simple left to right split algorithm. diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/SplitterAlgorithm.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/SplitterAlgorithm.java similarity index 84% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/SplitterAlgorithm.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/SplitterAlgorithm.java index 39e3e92f2d..ef29d836f6 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/SplitterAlgorithm.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/SplitterAlgorithm.java @@ -16,14 +16,13 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; +package org.dkpro.core.decompounding.splitter; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; /** * Interface for all splitting algorithms - * */ public interface SplitterAlgorithm { @@ -53,7 +52,7 @@ public interface SplitterAlgorithm public void setLinkingMorphemes(LinkingMorphemes aMorphemes); /** - * Set the maximal tree depth. Default: Integer.MaxValue + * Set the maximal tree depth. * * @param aDepth * maximal tree depth. diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/package-info.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/package-info.java new file mode 100644 index 0000000000..28539f0cdc --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/splitter/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +/** + * Contains splitting algorithm. + * + * To implement you own splitting algorithm you can use the + * {@link org.dkpro.core.decompounding.splitter.SplitterAlgorithm} interface. + */ +package org.dkpro.core.decompounding.splitter; diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/KeyValueNode.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/KeyValueNode.java similarity index 98% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/KeyValueNode.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/KeyValueNode.java index 10b772cbb8..2188b73d9c 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/KeyValueNode.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/KeyValueNode.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.trie; +package org.dkpro.core.decompounding.trie; import java.util.ArrayList; import java.util.List; diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TrieStructure.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/TrieStructure.java similarity index 96% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TrieStructure.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/TrieStructure.java index bde9d7facc..1d09d586a4 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TrieStructure.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/TrieStructure.java @@ -16,9 +16,9 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.trie; +package org.dkpro.core.decompounding.trie; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.Dictionary; /** * A trie datastructor which also stores the number of successor for each node diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/ValueNode.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/ValueNode.java similarity index 97% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/ValueNode.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/ValueNode.java index 6ea52552b7..585d35058f 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/ValueNode.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/ValueNode.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.trie; +package org.dkpro.core.decompounding.trie; import java.util.ArrayList; import java.util.List; diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/package-info.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/package-info.java new file mode 100644 index 0000000000..7772ee8993 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/trie/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +/** + * Some base classes for trees and a simple Trie implementation + */ +package org.dkpro.core.decompounding.trie; diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/CompoundAnnotator.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/annotator/CompoundAnnotator.java similarity index 84% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/CompoundAnnotator.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/annotator/CompoundAnnotator.java index ed3bec69c3..14b436dc5d 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/CompoundAnnotator.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/annotator/CompoundAnnotator.java @@ -1,142 +1,145 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.annotator; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ExternalResource; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.FSCollectionFactory; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.FSArray; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.decompounding.ranking.DummyRanker; -import de.tudarmstadt.ukp.dkpro.core.decompounding.ranking.Ranker; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.SplitterAlgorithm; - -/** - * Annotates compound parts and linking morphemes. - */ -@ResourceMetaData(name="Compound Annotator") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme" }) -public class CompoundAnnotator - extends JCasAnnotator_ImplBase -{ - - /** - * This component allows the user to create different strategies for decompounding words, - * combining different splitting algorithms with different ranking algorithms. This external - * resource wraps the splitter algorithm which shall be used by the annotator. - */ - public static final String PARAM_SPLITTING_ALGO = "splittingAlgorithm"; - @ExternalResource(key = PARAM_SPLITTING_ALGO) - private SplitterAlgorithm splitter; - - /** - * This external resource wraps the ranking algorithm which shall be used by the annotator. - */ - public static final String PARAM_RANKING_ALGO = "rankingAlgorithm"; - @ExternalResource(key = PARAM_RANKING_ALGO, mandatory = false) - private Ranker ranker; - - @Override - public void initialize(final UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - if (ranker == null) { - ranker = new DummyRanker(); - } - } - - @Override - public void process(final JCas aJCas) - throws AnalysisEngineProcessException - { - for (Token token : select(aJCas, Token.class)) { - final String coveredText = token.getText(); - DecompoundedWord result; - result = ranker.highestRank(splitter.split(coveredText)); - if (!result.isCompound()) { - continue; - } - final int beginIndex = token.getBegin(); - final Compound compound = new Compound(aJCas, beginIndex, token.getEnd()); - indexSplits(aJCas, result.getSplits(), beginIndex, token.getEnd(), null, compound); - compound.addToIndexes(); - } - } - - private void indexSplits(final JCas aJCas, final List splits, final int beginIndex, - final int tokenEndIndex, final Split parentSplit, final Compound compound) - { - if (splits.size() == 1) { - return; - } - final List splitChildren = new ArrayList(); - final Fragment element = splits.get(0); - int endIndex = beginIndex + element.getWord().length(); - final Split split = new CompoundPart(aJCas, beginIndex, endIndex); - split.addToIndexes(); - splitChildren.add(split); - int newBeginIndex = endIndex; - if (element.hasMorpheme()) { - endIndex = newBeginIndex + element.getMorpheme().length(); - final Split morpheme = new LinkingMorpheme(aJCas, newBeginIndex, endIndex); - morpheme.addToIndexes(); - splitChildren.add(morpheme); - newBeginIndex = endIndex; - } - final Split remainingSplit = new CompoundPart(aJCas, newBeginIndex, tokenEndIndex); - splitChildren.add(remainingSplit); - final FSArray childArray = FSCollectionFactory.createFSArray(aJCas, splitChildren); - if (parentSplit == null) { - compound.setSplits(childArray); - } - else { - parentSplit.setSplits(childArray); - } - indexSplits(aJCas, splits.subList(1, splits.size()), newBeginIndex, tokenEndIndex, - remainingSplit, compound); - remainingSplit.addToIndexes(); - - } -} +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.uima.annotator; + +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.FSCollectionFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.decompounding.ranking.DummyRanker; +import org.dkpro.core.decompounding.ranking.Ranker; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.Fragment; +import org.dkpro.core.decompounding.splitter.SplitterAlgorithm; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Annotates compound parts and linking morphemes. + */ +@Component(OperationType.ANNOTATION_OF_COMPOUNDING_FEATURES) +@ResourceMetaData(name = "Compound Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme" }) +public class CompoundAnnotator + extends JCasAnnotator_ImplBase +{ + /** + * This component allows the user to create different strategies for decompounding words, + * combining different splitting algorithms with different ranking algorithms. This external + * resource wraps the splitter algorithm which shall be used by the annotator. + */ + public static final String RES_SPLITTING_ALGO = "splittingAlgorithm"; + @ExternalResource(key = RES_SPLITTING_ALGO) + private SplitterAlgorithm splitter; + + /** + * This external resource wraps the ranking algorithm which shall be used by the annotator. + */ + public static final String RES_RANKING_ALGO = "rankingAlgorithm"; + @ExternalResource(key = RES_RANKING_ALGO, mandatory = false) + private Ranker ranker; + + @Override + public void initialize(final UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + if (ranker == null) { + ranker = new DummyRanker(); + } + } + + @Override + public void process(final JCas aJCas) + throws AnalysisEngineProcessException + { + for (Token token : select(aJCas, Token.class)) { + final String coveredText = token.getText(); + DecompoundedWord result; + result = ranker.highestRank(splitter.split(coveredText)); + if (!result.isCompound()) { + continue; + } + final int beginIndex = token.getBegin(); + final Compound compound = new Compound(aJCas, beginIndex, token.getEnd()); + indexSplits(aJCas, result.getSplits(), beginIndex, token.getEnd(), null, compound); + compound.addToIndexes(); + } + } + + private void indexSplits(final JCas aJCas, final List splits, final int beginIndex, + final int tokenEndIndex, final Split parentSplit, final Compound compound) + { + if (splits.size() == 1) { + return; + } + final List splitChildren = new ArrayList(); + final Fragment element = splits.get(0); + int endIndex = beginIndex + element.getWord().length(); + final Split split = new CompoundPart(aJCas, beginIndex, endIndex); + split.addToIndexes(); + splitChildren.add(split); + int newBeginIndex = endIndex; + if (element.hasMorpheme()) { + endIndex = newBeginIndex + element.getMorpheme().length(); + final Split morpheme = new LinkingMorpheme(aJCas, newBeginIndex, endIndex); + morpheme.addToIndexes(); + splitChildren.add(morpheme); + newBeginIndex = endIndex; + } + final Split remainingSplit = new CompoundPart(aJCas, newBeginIndex, tokenEndIndex); + splitChildren.add(remainingSplit); + final FSArray childArray = FSCollectionFactory.createFSArray(aJCas, splitChildren); + if (parentSplit == null) { + compound.setSplits(childArray); + } + else { + parentSplit.setSplits(childArray); + } + indexSplits(aJCas, splits.subList(1, splits.size()), newBeginIndex, tokenEndIndex, + remainingSplit, compound); + remainingSplit.addToIndexes(); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/annotator/package-info.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/annotator/package-info.java new file mode 100644 index 0000000000..d354fe4947 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/annotator/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +/** + * This package contains UIMA annotation classes. + * + * The annotators iterate over all token in a CAS and try to split them. + */ +package org.dkpro.core.decompounding.uima.annotator; diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/AsvToolboxSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/AsvToolboxSplitterResource.java new file mode 100644 index 0000000000..546488a473 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/AsvToolboxSplitterResource.java @@ -0,0 +1,104 @@ +/* + * Copyright 2010 + + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.uima.resource; + +import static org.apache.uima.util.Level.SEVERE; + +import java.io.IOException; +import java.util.Map; + +import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; + +public class AsvToolboxSplitterResource + extends SplitterResource +{ + + /** + * + * This external resource wraps the patricia trie which shall be used by the ASV Toolbox + * splitter. + * + */ + + public static final String PARAM_PATRICIA_TRIES_RESOURCE = "patriciaTriesResource"; + @ExternalResource(key = PARAM_PATRICIA_TRIES_RESOURCE) + private SharedPatriciaTries patriciaTriesResource; + + @Override + public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + return true; + } + + @Override + public void afterResourcesInitialized() throws RuntimeException + { + try { + splitter = patriciaTriesResource.getSplitter(); + } + catch (IOException e) { + getLogger().log(SEVERE, "IOException caught when getting the patricia trie resource"); + getLogger().log(SEVERE, e.getLocalizedMessage()); + getLogger().log(SEVERE, e.getMessage()); + throw new RuntimeException(e); + } + catch (ResourceInitializationException e) { + getLogger().log(SEVERE, + "RuntimeException caught when getting the patrica trie resource"); + getLogger().log(SEVERE, e.getLocalizedMessage()); + getLogger().log(SEVERE, e.getMessage()); + throw new RuntimeException(e); + } + } + + @Override + public DecompoundingTree split(String aWord) + { + return splitter.split(aWord); + } + + @Override + public void setDictionary(Dictionary aDict) + { + splitter.setDictionary(aDict); + } + + @Override + public void setLinkingMorphemes(LinkingMorphemes aMorphemes) + { + splitter.setLinkingMorphemes(aMorphemes); + } + + @Override + public void setMaximalTreeDepth(int aDepth) + { + splitter.setMaximalTreeDepth(aDepth); + } + +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/BananaSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/BananaSplitterResource.java new file mode 100644 index 0000000000..a5a9e131eb --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/BananaSplitterResource.java @@ -0,0 +1,69 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import java.util.Map; + +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.splitter.BananaSplitterAlgorithm; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; + +public class BananaSplitterResource + extends SplitterResource +{ + @SuppressWarnings({ "rawtypes" }) + @Override + public boolean initialize(ResourceSpecifier aSpecifier, + Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + splitter = new BananaSplitterAlgorithm(); + return true; + } + + @Override + public DecompoundingTree split(String aWord) + { + return splitter.split(aWord); + } + + @Override + public void setDictionary(Dictionary aDict) + { + splitter.setDictionary(aDict); + } + + @Override + public void setLinkingMorphemes(LinkingMorphemes aMorphemes) + { + splitter.setLinkingMorphemes(aMorphemes); + } + + @Override + public void setMaximalTreeDepth(int aDepth) + { + splitter.setMaximalTreeDepth(aDepth); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/DataDrivenSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/DataDrivenSplitterResource.java new file mode 100644 index 0000000000..231f3c5c77 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/DataDrivenSplitterResource.java @@ -0,0 +1,71 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import java.util.Map; + +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.splitter.DataDrivenSplitterAlgorithm; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; + +public class DataDrivenSplitterResource + extends SplitterResource +{ + + @SuppressWarnings({ "rawtypes" }) + @Override + public boolean initialize(ResourceSpecifier aSpecifier, + Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + splitter = new DataDrivenSplitterAlgorithm(); + + return true; + } + + @Override + public DecompoundingTree split(String aWord) + { + return splitter.split(aWord); + } + + @Override + public void setDictionary(Dictionary aDict) + { + splitter.setDictionary(aDict); + } + + @Override + public void setLinkingMorphemes(LinkingMorphemes aMorphemes) + { + splitter.setLinkingMorphemes(aMorphemes); + } + + @Override + public void setMaximalTreeDepth(int aDepth) + { + splitter.setMaximalTreeDepth(aDepth); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/FrequencyRankerResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/FrequencyRankerResource.java new file mode 100644 index 0000000000..a7ae680d16 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/FrequencyRankerResource.java @@ -0,0 +1,57 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import java.util.Map; + +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.ranking.FrequencyGeometricMeanRanker; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.web1t.Finder; + +public class FrequencyRankerResource + extends RankerResource +{ + @SuppressWarnings({ "rawtypes" }) + @Override + public boolean initialize(ResourceSpecifier aSpecifier, + Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + ranker = new FrequencyGeometricMeanRanker(); + return true; + } + + @Override + public DecompoundedWord highestRank(DecompoundingTree aSplitTree) + { + return ranker.highestRank(aSplitTree); + } + + @Override + public void setFinder(Finder aFinder) + { + ranker.setFinder(aFinder); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/JWordSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/JWordSplitterResource.java new file mode 100644 index 0000000000..9b1ca3554f --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/JWordSplitterResource.java @@ -0,0 +1,70 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import java.util.Map; + +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.splitter.JWordSplitterAlgorithm; + +public class JWordSplitterResource + extends SplitterResource +{ + @SuppressWarnings({ "rawtypes" }) + @Override + public boolean initialize(ResourceSpecifier aSpecifier, + Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + splitter = new JWordSplitterAlgorithm(); + + return true; + } + + @Override + public DecompoundingTree split(String aWord) + { + return splitter.split(aWord); + } + + @Override + public void setDictionary(Dictionary aDict) + { + splitter.setDictionary(aDict); + } + + @Override + public void setLinkingMorphemes(LinkingMorphemes aMorphemes) + { + splitter.setLinkingMorphemes(aMorphemes); + } + + @Override + public void setMaximalTreeDepth(int aDepth) + { + splitter.setMaximalTreeDepth(aDepth); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/LeftToRightSplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/LeftToRightSplitterResource.java new file mode 100644 index 0000000000..f9645210ee --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/LeftToRightSplitterResource.java @@ -0,0 +1,71 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import java.util.Map; + +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.splitter.LeftToRightSplitterAlgorithm; + +public class LeftToRightSplitterResource + extends SplitterResource +{ + + @SuppressWarnings({ "rawtypes" }) + @Override + public boolean initialize(ResourceSpecifier aSpecifier, + Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + splitter = new LeftToRightSplitterAlgorithm(); + + return true; + } + + @Override + public DecompoundingTree split(String aWord) + { + return splitter.split(aWord); + } + + @Override + public void setDictionary(Dictionary aDict) + { + splitter.setDictionary(aDict); + } + + @Override + public void setMaximalTreeDepth(int aDepth) + { + splitter.setMaximalTreeDepth(aDepth); + } + + @Override + public void setLinkingMorphemes(LinkingMorphemes aMorphemes) + { + splitter.setLinkingMorphemes(aMorphemes); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/MutualInformationRankerResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/MutualInformationRankerResource.java new file mode 100644 index 0000000000..4cbf21d0eb --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/MutualInformationRankerResource.java @@ -0,0 +1,57 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import java.util.Map; + +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.ranking.MutualInformationRanker; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.web1t.Finder; + +public class MutualInformationRankerResource + extends RankerResource +{ + @SuppressWarnings({ "rawtypes" }) + @Override + public boolean initialize(ResourceSpecifier aSpecifier, + Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + ranker = new MutualInformationRanker(); + return true; + } + + @Override + public DecompoundedWord highestRank(DecompoundingTree aSplitTree) + { + return ranker.highestRank(aSplitTree); + } + + @Override + public void setFinder(Finder aFinder) + { + ranker.setFinder(aFinder); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/ProbabilityRankerResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/ProbabilityRankerResource.java new file mode 100644 index 0000000000..a1dec3d294 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/ProbabilityRankerResource.java @@ -0,0 +1,57 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import java.util.Map; + +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.ranking.CompoundProbabilityRanker; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.web1t.Finder; + +public class ProbabilityRankerResource + extends RankerResource +{ + @SuppressWarnings({ "rawtypes" }) + @Override + public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + ranker = new CompoundProbabilityRanker(); + + return true; + } + + @Override + public DecompoundedWord highestRank(DecompoundingTree aSplitTree) + { + return ranker.highestRank(aSplitTree); + } + + @Override + public void setFinder(Finder aFinder) + { + ranker.setFinder(aFinder); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/RankerResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/RankerResource.java similarity index 82% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/RankerResource.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/RankerResource.java index 2e0181281c..be4bcfca53 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/RankerResource.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/RankerResource.java @@ -1,62 +1,59 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import java.util.Map; - -import org.apache.uima.fit.component.Resource_ImplBase; -import org.apache.uima.fit.descriptor.ExternalResource; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.ranking.Ranker; - -public abstract class RankerResource - extends Resource_ImplBase - implements Ranker -{ - - /** - * - * This external resource wraps the finder resource used by the ranker. - * - * */ - public static final String PARAM_FINDER_RESOURCE = "finderResource"; - @ExternalResource(key = PARAM_FINDER_RESOURCE) - private SharedFinder finderResource; - - protected Ranker ranker; - - @SuppressWarnings({ "rawtypes" }) - @Override - public boolean initialize(ResourceSpecifier aSpecifier, - Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - return true; - } - - @Override - public void afterResourcesInitialized(){ - ranker.setFinder(finderResource.getFinder()); - } - -} +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import java.util.Map; + +import org.apache.uima.fit.component.Resource_ImplBase; +import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.ranking.Ranker; + +public abstract class RankerResource + extends Resource_ImplBase + implements Ranker +{ + /** + * + * This external resource wraps the finder resource used by the ranker. + * + */ + public static final String PARAM_FINDER_RESOURCE = "finderResource"; + @ExternalResource(key = PARAM_FINDER_RESOURCE) + private SharedFinder finderResource; + + protected Ranker ranker; + + @SuppressWarnings({ "rawtypes" }) + @Override + public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + return true; + } + + @Override + public void afterResourcesInitialized() + { + ranker.setFinder(finderResource.getFinder()); + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedDictionary.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedDictionary.java similarity index 82% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedDictionary.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedDictionary.java index 6614e9e380..bd10fd421b 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedDictionary.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedDictionary.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; +package org.dkpro.core.decompounding.uima.resource; import java.io.BufferedReader; import java.io.ByteArrayInputStream; @@ -28,17 +28,15 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.German98Dictionary; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.German98Dictionary; public class SharedDictionary extends Resource_ImplBase { - /** * Use this language instead of the default language. */ @@ -53,6 +51,20 @@ public class SharedDictionary @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the model automatically. */ @@ -64,7 +76,8 @@ public class SharedDictionary * The character encoding used by the model. */ public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String modelEncoding; /** @@ -149,12 +162,11 @@ protected Dictionary produceResource(InputStream aStream) public Dictionary getDictionary() throws IOException { - if(this.dict == null){ + if (this.dict == null) { affixModelProvider.configure(); modelProvider.configure(); this.dict = modelProvider.getResource(); } return this.dict; } - } diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedFinder.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedFinder.java similarity index 88% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedFinder.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedFinder.java index cd2a745f31..0392f35d6c 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedFinder.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedFinder.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; +package org.dkpro.core.decompounding.uima.resource; import java.io.File; import java.io.IOException; @@ -27,20 +27,18 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.web1t.Finder; public class SharedFinder extends Resource_ImplBase { - public static final String PARAM_INDEX_PATH = "indexLocation"; - @ConfigurationParameter(name = PARAM_INDEX_PATH, mandatory=true) + @ConfigurationParameter(name = PARAM_INDEX_PATH, mandatory = true) private String indexLocation; public static final String PARAM_NGRAM_LOCATION = "ngramLocation"; - @ConfigurationParameter(name = PARAM_NGRAM_LOCATION, mandatory=true) + @ConfigurationParameter(name = PARAM_NGRAM_LOCATION, mandatory = true) private String ngramLocation; private Finder finder; @@ -73,5 +71,4 @@ public Finder getFinder() { return finder; } - } diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedLinkingMorphemes.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedLinkingMorphemes.java similarity index 76% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedLinkingMorphemes.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedLinkingMorphemes.java index f34bd17d81..c6e5a290f0 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedLinkingMorphemes.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedLinkingMorphemes.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; +package org.dkpro.core.decompounding.uima.resource; import java.io.IOException; import java.io.InputStream; @@ -25,16 +25,14 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; public class SharedLinkingMorphemes extends Resource_ImplBase { - /** * Use this language instead of the default language. */ @@ -49,6 +47,20 @@ public class SharedLinkingMorphemes @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the model automatically. */ @@ -100,11 +112,10 @@ protected LinkingMorphemes produceResource(InputStream aStream) public LinkingMorphemes getLinkingMorphemes() throws IOException { - if(morphemes == null){ + if (morphemes == null) { modelProvider.configure(); morphemes = modelProvider.getResource(); } return morphemes; } - } diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedPatriciaTries.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedPatriciaTries.java similarity index 92% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedPatriciaTries.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedPatriciaTries.java index cbc9ce382e..fa7a08dad6 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SharedPatriciaTries.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SharedPatriciaTries.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; +package org.dkpro.core.decompounding.uima.resource; import java.io.File; import java.io.IOException; @@ -26,13 +26,12 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.AsvToolboxSplitterAlgorithm; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.SplitterAlgorithm; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.splitter.AsvToolboxSplitterAlgorithm; +import org.dkpro.core.decompounding.splitter.SplitterAlgorithm; public class SharedPatriciaTries extends Resource_ImplBase diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SplitterResource.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SplitterResource.java similarity index 91% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SplitterResource.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SplitterResource.java index f2e245c678..b05acce7c5 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/resource/SplitterResource.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/SplitterResource.java @@ -1,83 +1,82 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource; - -import static org.apache.uima.util.Level.SEVERE; - -import java.io.IOException; -import java.util.Map; - -import org.apache.uima.fit.component.Resource_ImplBase; -import org.apache.uima.fit.descriptor.ExternalResource; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.SplitterAlgorithm; - -public abstract class SplitterResource - extends Resource_ImplBase - implements SplitterAlgorithm -{ - - /** - * - * This external resource wraps the dictionary which shall be used by the splitter. - * - * */ - - public static final String PARAM_DICT_RESOURCE = "dictionaryResource"; - @ExternalResource(key = PARAM_DICT_RESOURCE) - private SharedDictionary dictResource; - - /** - * - * This external resource wraps the morphemes list which shall be used by the splitter. - * - * */ - - public static final String PARAM_MORPHEME_RESOURCE = "linkingMorphemeResource"; - @ExternalResource(key = PARAM_MORPHEME_RESOURCE) - private SharedLinkingMorphemes morphemesResource; - - protected SplitterAlgorithm splitter; - - @Override - public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - return true; - } - - @Override - public void afterResourcesInitialized() throws RuntimeException { - try { - splitter.setDictionary(dictResource.getDictionary()); - splitter.setLinkingMorphemes(morphemesResource.getLinkingMorphemes()); - } - catch (IOException e) { - getLogger().log(SEVERE, "IOException caught when getting the dictionary resource"); - getLogger().log(SEVERE, e.getLocalizedMessage()); - getLogger().log(SEVERE, e.getMessage()); - throw new RuntimeException(e); - } - } - -} +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.resource; + +import static org.apache.uima.util.Level.SEVERE; + +import java.io.IOException; +import java.util.Map; + +import org.apache.uima.fit.component.Resource_ImplBase; +import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.decompounding.splitter.SplitterAlgorithm; + +public abstract class SplitterResource + extends Resource_ImplBase + implements SplitterAlgorithm +{ + + /** + * + * This external resource wraps the dictionary which shall be used by the splitter. + * + * */ + + public static final String PARAM_DICT_RESOURCE = "dictionaryResource"; + @ExternalResource(key = PARAM_DICT_RESOURCE) + private SharedDictionary dictResource; + + /** + * + * This external resource wraps the morphemes list which shall be used by the splitter. + * + * */ + + public static final String PARAM_MORPHEME_RESOURCE = "linkingMorphemeResource"; + @ExternalResource(key = PARAM_MORPHEME_RESOURCE) + private SharedLinkingMorphemes morphemesResource; + + protected SplitterAlgorithm splitter; + + @Override + public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + return true; + } + + @Override + public void afterResourcesInitialized() throws RuntimeException { + try { + splitter.setDictionary(dictResource.getDictionary()); + splitter.setLinkingMorphemes(morphemesResource.getLinkingMorphemes()); + } + catch (IOException e) { + getLogger().log(SEVERE, "IOException caught when getting the dictionary resource"); + getLogger().log(SEVERE, e.getLocalizedMessage()); + getLogger().log(SEVERE, e.getMessage()); + throw new RuntimeException(e); + } + } + +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/package-info.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/package-info.java new file mode 100644 index 0000000000..92359b10e5 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/uima/resource/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +/** + * This package contains UIMA resources classes. Each splitter and each ranker needs to have a UIMA + * resource so the user can combine different strategies and choose the one which best suits its + * needs. Besides the splitters and rankers, there is also a resource for the Dictionary, for Finder + * and for the LinkingMorphemes. + */ +package org.dkpro.core.decompounding.uima.resource; diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/Finder.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/Finder.java similarity index 99% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/Finder.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/Finder.java index a3a2f91a51..8d37eec5da 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/Finder.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/Finder.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t; +package org.dkpro.core.decompounding.web1t; import java.io.Closeable; import java.io.File; diff --git a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/LuceneIndexer.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/LuceneIndexer.java similarity index 98% rename from dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/LuceneIndexer.java rename to dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/LuceneIndexer.java index da79e7ef22..d5bea8648f 100644 --- a/dkpro-core-decompounding-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/LuceneIndexer.java +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/LuceneIndexer.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t; +package org.dkpro.core.decompounding.web1t; import java.io.BufferedReader; import java.io.File; @@ -39,8 +39,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.Dictionary; /** * Index the Google Web1T corpus in Lucene. diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/NGramModel.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/NGramModel.java new file mode 100644 index 0000000000..0c24ba759e --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/NGramModel.java @@ -0,0 +1,67 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.web1t; + +/** + * N-gram model class. + * + * This is only a data container for the n-grams. + */ +public class NGramModel +{ + private String gram; + private int freq; + + public NGramModel(String aGram, int aFreq) + { + gram = aGram; + freq = aFreq; + } + + public String getGram() + { + return gram; + } + + public void setGram(String aGram) + { + gram = aGram; + } + + public int getFreq() + { + return freq; + } + + public void setFreq(int aFreq) + { + freq = aFreq; + } + + public int getN() + { + return gram.split(" ").length; + } + + @Override + public String toString() + { + return "[" + gram + "] (freq=" + freq + ")"; + } +} diff --git a/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/package-info.java b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/package-info.java new file mode 100644 index 0000000000..c0a72ba0c2 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/main/java/org/dkpro/core/decompounding/web1t/package-info.java @@ -0,0 +1,26 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +/** + * This package contains all classes that are needed access the Google web1T data set. + * + * The {@link org.dkpro.core.decompounding.web1t.LuceneIndexer} creates a Lucence + * index from the data set and the {@link org.dkpro.core.decompounding.web1t.Finder} + * can be used to search on the Lucence index. + */ +package org.dkpro.core.decompounding.web1t; diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/German98DictionaryTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/German98DictionaryTest.java deleted file mode 100644 index 465c31c200..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/German98DictionaryTest.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.IOException; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; - -public class German98DictionaryTest -{ - @Test - public void testContains() throws IOException - { - final File affixFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-affix.aff"), false); - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false); - final German98Dictionary dict = new German98Dictionary(dictFile, affixFile, "UTF-8"); - assertEquals(298506, dict.getAll().size()); - - assertTrue(dict.contains("hallo")); - assertTrue(dict.contains("versuchen")); - assertTrue(dict.contains("arbeiten")); - assertTrue(dict.contains("arbeit")); - } -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/LinkingMorphemesTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/LinkingMorphemesTest.java deleted file mode 100644 index 741560d6c3..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/LinkingMorphemesTest.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -import junit.framework.Assert; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; - -public class LinkingMorphemesTest -{ - - @Test - public void testStringConstructor() - { - LinkingMorphemes l = new LinkingMorphemes("s", "ens"); - Assert.assertEquals(2, l.getAll().size()); - Assert.assertEquals("s", l.getAll().get(0)); - } - - @Test - public void testListConstructor() - { - List list = new ArrayList(); - list.add("s"); - list.add("ens"); - - LinkingMorphemes l = new LinkingMorphemes(list); - Assert.assertEquals(2, l.getAll().size()); - Assert.assertEquals("s", l.getAll().get(0)); - } - - @Test - public void testFileConstructor() throws Exception - { - LinkingMorphemes l = new LinkingMorphemes(new File( - "src/test/resources/dic/morphemes.txt")); - Assert.assertEquals(2, l.getAll().size()); - Assert.assertEquals("s", l.getAll().get(0)); - } -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/SimpleDictionaryTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/SimpleDictionaryTest.java deleted file mode 100644 index 9d69ae9a04..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/dictionary/SimpleDictionaryTest.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary; - -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.CoreMatchers.not; -import static org.junit.Assert.assertThat; - -import java.io.File; -import java.io.IOException; - -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; - - -public class SimpleDictionaryTest -{ - - private SimpleDictionary dict; - - @Before - public void setUp() throws IOException{ - - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false); - dict = new SimpleDictionary(dictFile, "UTF-8"); - } - - @Test - public void testContains() - { - Assert.assertEquals(72508, dict.getAll().size()); - - Assert.assertTrue(dict.contains("worauf")); - Assert.assertTrue(dict.contains("woraufhin")); - Assert.assertTrue(dict.contains("woraus")); - } - - @Test - public void testDictionary(){ - - assertThat(dict.getAll().size(), not(0)); - assertThat(dict.contains("zu"),is(true)); - } - -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/ProbabilityBasedTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/ProbabilityBasedTest.java deleted file mode 100644 index eaa4791f30..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/ProbabilityBasedTest.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import junit.framework.Assert; - -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexer; - -public class ProbabilityBasedTest -{ - - static File source = new File("src/test/resources/ranking/n-grams"); - static File index = new File("target/test/index"); - static File jWeb1T = new File("src/test/resources/web1t/de"); - - @BeforeClass - public static void createIndex() - throws Exception - { - index.mkdirs(); - - LuceneIndexer indexer = new LuceneIndexer(source, index); - indexer.index(); - } - - @Test - public void testRankList() throws IOException - { - CompoundProbabilityRanker ranker = new CompoundProbabilityRanker(new Finder(index, jWeb1T)); - - List list = new ArrayList(); - DecompoundedWord s1 = DecompoundedWord.createFromString("Aktionsplan"); - list.add(s1); - DecompoundedWord s2 = DecompoundedWord.createFromString("Akt+ion(s)+plan"); - list.add(s2); - DecompoundedWord s3 = DecompoundedWord.createFromString("Aktion(s)+plan"); - list.add(s3); - - List result = ranker.rank(list); - Assert.assertEquals(s1, result.get(0)); - - Assert.assertEquals(s1, ranker.highestRank(list)); - } - - @Test - public void testRankTree() throws IOException - { - CompoundProbabilityRanker ranker = new CompoundProbabilityRanker(new Finder(index, jWeb1T)); - - DecompoundedWord s1 = DecompoundedWord.createFromString("Aktionsplan"); - DecompoundedWord s2 = DecompoundedWord.createFromString("Akt+ion(s)+plan"); - DecompoundedWord s3 = DecompoundedWord.createFromString("Aktion(s)+plan"); - - DecompoundingTree tree = new DecompoundingTree(s1); - tree.getRoot().addChild(new ValueNode(s2)); - tree.getRoot().addChild(new ValueNode(s3)); - - DecompoundedWord result = ranker.highestRank(tree); - Assert.assertEquals(s1, result); - } - - - @AfterClass - public static void tearDown() - throws Exception - { - // Delete index again - for (File f : index.listFiles()) { - for (File _f : f.listFiles()) { - _f.delete(); - } - f.delete(); - } - - index.delete(); - } -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/BananaSplitterTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/BananaSplitterTest.java deleted file mode 100644 index 93299a8767..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/BananaSplitterTest.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.List; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.SimpleDictionary; - -public class BananaSplitterTest -{ - @Test - public void testSplitter() - throws IOException - { - BananaSplitterAlgorithm splitter = new BananaSplitterAlgorithm(); - splitter.setDictionary(new SimpleDictionary("Garage", "einfahrt")); - - List result = splitter.split("Garageneinfahrt").getAllSplits(); - assertEquals(2, result.size()); - assertEquals("Garageneinfahrt", result.get(0).toString()); - assertEquals("garage(n)+einfahrt", result.get(1).toString()); - } - - @Test - public void testSplitter2() - throws IOException - { - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false); - Dictionary dict = new SimpleDictionary(dictFile, "UTF-8"); - BananaSplitterAlgorithm splitter = new BananaSplitterAlgorithm(); - splitter.setDictionary(dict); - List result = splitter.split("geräteelektronik").getAllSplits(); - assertThat(result.size(), is(1)); - - } - -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DataDrivenAlgorithmTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DataDrivenAlgorithmTest.java deleted file mode 100644 index 8f9cbc210e..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DataDrivenAlgorithmTest.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.List; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.SimpleDictionary; - -public class DataDrivenAlgorithmTest -{ - - @Test - public void testSplit() - { - SimpleDictionary dict = new SimpleDictionary("friedens", "politik", "friedenspolitik", - "friedensverhaltungen", "friedenshaltung", "frittieren", "friseur", "außenpolitik", - "innenpolitik"); - LinkingMorphemes morphemes = new LinkingMorphemes("en", "s", "ens"); - - DataDrivenSplitterAlgorithm algo = new DataDrivenSplitterAlgorithm(dict, morphemes); - List result = algo.split("friedenspolitik").getAllSplits(); - - assertEquals(2, result.size()); - assertEquals("friedenspolitik", result.get(0).toString()); - assertEquals("friedens+politik", result.get(1).toString()); - } - - @Test - public void testSplit2() - throws IOException - { - - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false); - SimpleDictionary dict = new SimpleDictionary(dictFile, "UTF-8"); - final File morphemesFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-linking.linking"), false); - LinkingMorphemes morphemes = new LinkingMorphemes(morphemesFile); - DataDrivenSplitterAlgorithm splitter = new DataDrivenSplitterAlgorithm(dict, morphemes); - List result = splitter.split("geräteelektronik").getAllSplits(); - assertThat(result.size(), is(1)); - - } -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundedWordTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundedWordTest.java deleted file mode 100644 index f43c1eafc6..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/DecompoundedWordTest.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.assertThat; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import junit.framework.Assert; - -import org.junit.Test; - -public class DecompoundedWordTest -{ - - @Test - public void testCreate() - { - DecompoundedWord s = DecompoundedWord.createFromString("aktion(s)+plan"); - - Assert.assertEquals("aktion", s.getSplits().get(0).getWord()); - Assert.assertEquals("s", s.getSplits().get(0).getMorpheme()); - - Assert.assertEquals("plan", s.getSplits().get(1).getWord()); - Assert.assertEquals(null, s.getSplits().get(1).getMorpheme()); - } - - @Test - public void testToString() - { - Fragment e1 = new Fragment(); - e1.setWord("aktion"); - e1.setMorpheme("s"); - - Fragment e2 = new Fragment(); - e2.setWord("plan"); - - DecompoundedWord s = new DecompoundedWord(); - s.appendSplitElement(e1); - s.appendSplitElement(e2); - - Assert.assertEquals("aktion(s)+plan", s.toString()); - } - - @Test - public void testEquals() - { - Fragment e1 = new Fragment(); - e1.setWord("aktion"); - e1.setMorpheme("s"); - - Fragment e2 = new Fragment(); - e2.setWord("plan"); - - DecompoundedWord s1 = new DecompoundedWord(); - s1.appendSplitElement(e1); - s1.appendSplitElement(e2); - - Fragment e3 = new Fragment(); - e3.setWord("aktion"); - e3.setMorpheme("s"); - - Fragment e4 = new Fragment(); - e4.setWord("plan"); - - DecompoundedWord s2 = new DecompoundedWord(); - s2.appendSplitElement(e3); - s2.appendSplitElement(e4); - - Assert.assertTrue(s1.equals(s2)); - - e2.setMorpheme("e"); - Assert.assertFalse(s1.equals(s2)); - } - - @Test - public void testEqualsWithoutMorpheme() - { - DecompoundedWord e1 = DecompoundedWord.createFromString("zugang(s)+liste"); - DecompoundedWord e2 = DecompoundedWord.createFromString("zugangs+liste"); - - Assert.assertTrue(e1.equalWithoutMorpheme(e2)); - Assert.assertTrue(e2.equalWithoutMorpheme(e1)); - } - - @Test - public void testReplaceSplit() - { - DecompoundedWord s = DecompoundedWord.createFromString("Donau+dampfschiff+fahrt"); - s.replaceSplitElement(1, DecompoundedWord.createFromString("dampf+schiff")); - Assert.assertEquals("Donau+dampf+schiff+fahrt", s.toString()); - - s = DecompoundedWord.createFromString("Donau+dampfschiff+fahrten"); - s.replaceSplitElement(2, new Fragment("fahrt", "en")); - Assert.assertEquals("Donau+dampfschiff+fahrt(en)", s.toString()); - } - - @Test - public void testSort() - { - DecompoundedWord s1 = DecompoundedWord.createFromString("Aktion(s)+plan"); - DecompoundedWord s2 = DecompoundedWord.createFromString("Akt+ion(s)+plan"); - DecompoundedWord s3 = DecompoundedWord.createFromString("Aktionsplan"); - - List splits = new ArrayList(); - splits.add(s1); - splits.add(s2); - splits.add(s3); - - s1.setWeight(2); - s2.setWeight(3); - s3.setWeight(1); - - Collections.sort(splits); - - Assert.assertEquals(s2, splits.get(0)); - Assert.assertEquals(s1, splits.get(1)); - Assert.assertEquals(s3, splits.get(2)); - } - - @Test - public void testIsCompound() - { - DecompoundedWord s1 = DecompoundedWord.createFromString("Aktion(s)+plan"); - DecompoundedWord s2 = DecompoundedWord.createFromString("Aktionsplan"); - assertThat(s1.isCompound(), is(true)); - assertThat(s2.isCompound(), is(false)); - } - - @Test - public void testHasLastFragmentMorpheme() - { - DecompoundedWord s1 = DecompoundedWord.createFromString("Aktion(s)+plan"); - DecompoundedWord s2 = DecompoundedWord.createFromString("unter+flur+konvektor(en)"); - assertThat(s1.hasLastFragmentMorpheme(), is(false)); - assertThat(s2.hasLastFragmentMorpheme(), is(true)); - } - -} \ No newline at end of file diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/FragmentTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/FragmentTest.java deleted file mode 100644 index a34ce1ad86..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/FragmentTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import org.junit.Assert; - -import org.hamcrest.CoreMatchers; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment; - -public class FragmentTest -{ - - @Test - public void testCreate() - { - Fragment e = Fragment.createFromString("aktion(s)"); - - Assert.assertEquals("aktion", e.getWord()); - Assert.assertEquals("s", e.getMorpheme()); - - e = Fragment.createFromString("plan"); - Assert.assertEquals("plan", e.getWord()); - Assert.assertEquals(null, e.getMorpheme()); - } - - @Test - public void testToString() - { - Fragment e = new Fragment(); - e.setWord("aktion"); - e.setMorpheme("s"); - Assert.assertEquals("aktion(s)", e.toString()); - - e.setMorpheme(null); - Assert.assertEquals("aktion", e.toString()); - } - - @Test - public void testEquals() - { - Fragment e1 = new Fragment(); - e1.setWord("aktion"); - e1.setMorpheme("s"); - - Fragment e2 = new Fragment(); - e2.setWord("aktion"); - e2.setMorpheme("s"); - - Assert.assertTrue(e1.equals(e1)); - - e2.setMorpheme(null); - Assert.assertFalse(e1.equals(e2)); - } - - @Test - public void testCreateFromString() - { - Fragment fragm = Fragment.createFromString("("); - Assert.assertThat(fragm.getWord(),CoreMatchers.is("(")); - } - -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/JWordSplitterTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/JWordSplitterTest.java deleted file mode 100644 index a638da57a9..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/JWordSplitterTest.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.List; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.SimpleDictionary; - -public class JWordSplitterTest -{ - - @Test - public void testSplitter() - throws IOException - { - JWordSplitterAlgorithm splitter = new JWordSplitterAlgorithm(); - List result = splitter.split("Aktionsplan").getAllSplits(); - assertEquals(2, result.size()); - assertEquals("Aktionsplan", result.get(0).toString()); - assertEquals("Aktion(s)+plan", result.get(1).toString()); - } - - @Test - public void testSplliter2() throws IOException { - JWordSplitterAlgorithm splitter = new JWordSplitterAlgorithm(); - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false);; - - Dictionary dict = new SimpleDictionary(dictFile, "UTF-8"); - splitter.setDictionary(dict); - List result = splitter.split("geräteelektronik").getAllSplits(); - assertThat(result.size(),is(1)); - } - - @Test - public void testSplliter3() throws IOException { - JWordSplitterAlgorithm splitter = new JWordSplitterAlgorithm(); - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false);; - - Dictionary dict = new SimpleDictionary(dictFile, "UTF-8"); - splitter.setDictionary(dict); - List result = splitter.split("Schwerwiegend").getAllSplits(); - assertThat(result.size(),is(1)); - } - -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/LeftToRightSplitAlgorithmTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/LeftToRightSplitAlgorithmTest.java deleted file mode 100644 index 280c4ef439..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/splitter/LeftToRightSplitAlgorithmTest.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter; - -import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.List; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.SimpleDictionary; - -public class LeftToRightSplitAlgorithmTest -{ - - @Test - public void testSplit1() - { - Dictionary dict = new SimpleDictionary("Akt", "ion", "plan", "Aktion", - "Aktionsplan"); - LinkingMorphemes morphemes = new LinkingMorphemes("s"); - LeftToRightSplitterAlgorithm algo = new LeftToRightSplitterAlgorithm(dict, - morphemes); - - List result = algo.split("Aktionsplan").getAllSplits(); - assertEquals(6, result.size()); - assertEquals("aktionsplan", result.get(0).toString()); - assertEquals("akt+ionsplan", result.get(1).toString()); - assertEquals("akt+ion+splan", result.get(2).toString()); - assertEquals("akt+ion(s)+plan", result.get(3).toString()); - assertEquals("aktion+splan", result.get(4).toString()); - assertEquals("aktion(s)+plan", result.get(5).toString()); - } - - @Test - public void testSplit2() - { - Dictionary dict = new SimpleDictionary("Donau", "dampf", "schiff", - "fahrt", "dampfschiff", "schifffahrt"); - LinkingMorphemes morphemes = new LinkingMorphemes("s"); - LeftToRightSplitterAlgorithm algo = new LeftToRightSplitterAlgorithm(dict, - morphemes); - - List result = algo.split("Donaudampfschifffahrt").getAllSplits(); - assertEquals(6, result.size()); - } - - @Test - public void testSplit3() - { - Dictionary dict = new SimpleDictionary("Super", "mann", "anzug", - "Supermann", "anzug"); - LinkingMorphemes morphemes = new LinkingMorphemes("s"); - LeftToRightSplitterAlgorithm algo = new LeftToRightSplitterAlgorithm(dict, - morphemes); - - List result = algo.split("Supermannanzug").getAllSplits(); - // Super+mann+anzug, Supermann+anzug - assertEquals(4, result.size()); - } - - @Test - public void testMorphemes1() - { - Dictionary dict = new SimpleDictionary("alarm", "reaktion"); - LinkingMorphemes morphemes = new LinkingMorphemes("en"); - LeftToRightSplitterAlgorithm algo = new LeftToRightSplitterAlgorithm(dict, - morphemes); - - List result = algo.split("alarmreaktionen").getAllSplits(); - // Super+mann+anzug, Supermann+anzug - assertEquals(3, result.size()); - assertEquals("alarmreaktionen", result.get(0).toString()); - assertEquals("alarm+reaktionen", result.get(1).toString()); - assertEquals("alarm+reaktion(en)", result.get(2).toString()); - } - - @Test - public void testSplit4() throws IOException{ - - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false); - final File morphemesFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-linking.linking"), false); - - Dictionary dict = new SimpleDictionary(dictFile, "UTF-8"); - LinkingMorphemes morphemes = new LinkingMorphemes(morphemesFile); - - LeftToRightSplitterAlgorithm splitter = new LeftToRightSplitterAlgorithm(dict,morphemes); - - List result = splitter.split("geräteelektronik").getAllSplits(); - - assertThat(result.size(),is(1)); - - } - -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TrieTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TrieTest.java deleted file mode 100644 index c3fac3e2c5..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TrieTest.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.trie; - -import static org.junit.Assert.assertEquals; - -import java.io.File; -import java.io.IOException; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.German98Dictionary; - -public class TrieTest -{ - - @Test - public void testAddSorted() - { - TrieStructure t = new TrieStructure(); - - t.addWord("abc"); - assertEquals(new Integer(1), t.findWord("a").getValue()); - assertEquals(new Integer(1), t.findWord("ab").getValue()); - assertEquals(new Integer(0), t.findWord("abc").getValue()); - - t.addWord("abcde"); - assertEquals(new Integer(2), t.findWord("a").getValue()); - assertEquals(new Integer(2), t.findWord("ab").getValue()); - assertEquals(new Integer(1), t.findWord("abc").getValue()); - assertEquals(new Integer(1), t.findWord("abcd").getValue()); - assertEquals(new Integer(0), t.findWord("abcde").getValue()); - - t.addWord("abde"); - assertEquals(new Integer(3), t.findWord("a").getValue()); - assertEquals(new Integer(3), t.findWord("ab").getValue()); - assertEquals(new Integer(1), t.findWord("abd").getValue()); - assertEquals(new Integer(0), t.findWord("abde").getValue()); - } - - @Test - public void testAddUnsorted() - { - TrieStructure t = new TrieStructure(); - - t.addWord("abde"); - assertEquals(new Integer(1), t.findWord("a").getValue()); - assertEquals(new Integer(1), t.findWord("ab").getValue()); - assertEquals(new Integer(1), t.findWord("abd").getValue()); - assertEquals(new Integer(0), t.findWord("abde").getValue()); - - t.addWord("abc"); - assertEquals(new Integer(2), t.findWord("a").getValue()); - assertEquals(new Integer(2), t.findWord("ab").getValue()); - assertEquals(new Integer(0), t.findWord("abc").getValue()); - - t.addWord("abcde"); - assertEquals(new Integer(3), t.findWord("a").getValue()); - assertEquals(new Integer(3), t.findWord("ab").getValue()); - assertEquals(new Integer(1), t.findWord("abc").getValue()); - assertEquals(new Integer(1), t.findWord("abcd").getValue()); - assertEquals(new Integer(0), t.findWord("abcde").getValue()); - } - - @Test - public void testSimpleDict() throws IOException - { - final File affixFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-affix.aff"), false); - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false); - final German98Dictionary dict = new German98Dictionary(dictFile, affixFile, "UTF-8"); - TrieStructure t = TrieStructure.createForDict(dict); - - assertEquals(new Integer(14963), t.findWord("h").getValue()); - assertEquals(new Integer(257), t.findWord("hel").getValue()); - assertEquals(new Integer(0), t.findWord("hallo").getValue()); - - assertEquals(new Integer(8033), t.findWord("t").getValue()); - assertEquals(new Integer(2714), t.findWord("tr").getValue()); - assertEquals(new Integer(996), t.findWord("tra").getValue()); - assertEquals(new Integer(38), t.findWord("tram").getValue()); - - assertEquals(new Integer(11138), t.findWord("w").getValue()); - assertEquals(new Integer(178), t.findWord("wor").getValue()); - assertEquals(new Integer(160), t.findWord("wort").getValue()); - - } - - @Test - public void testSimpleDictReverse() throws IOException - { - final File affixFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-affix.aff"), false); - final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), false); - final German98Dictionary dict = new German98Dictionary(dictFile, affixFile, "UTF-8"); - TrieStructure t = TrieStructure.createForDict(dict); - assertEquals(new Integer(11121), t.findWord("d").getValue()); - assertEquals(new Integer(2494), t.findWord("de").getValue()); - assertEquals(new Integer(69), t.findWord("dei").getValue()); - - assertEquals(new Integer(13809), t.findWord("k").getValue()); - assertEquals(new Integer(2101), t.findWord("o").getValue()); - } -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/CompoundAnnotatorTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/CompoundAnnotatorTest.java deleted file mode 100644 index f117ed8c86..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/uima/annotator/CompoundAnnotatorTest.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.annotator; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; -import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.assertThat; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.fit.factory.AnalysisEngineFactory; -import org.apache.uima.fit.testing.factory.TokenBuilder; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.AsvToolboxSplitterResource; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.FrequencyRankerResource; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.LeftToRightSplitterResource; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.RankerResource; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SharedDictionary; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SharedFinder; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SharedLinkingMorphemes; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SharedPatriciaTries; -import de.tudarmstadt.ukp.dkpro.core.decompounding.uima.resource.SplitterResource; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexer; - -public class CompoundAnnotatorTest -{ - - static File source = new File("src/test/resources/ranking/n-grams"); - static File index = new File("target/test/index"); - static String jWeb1TPath = "src/test/resources/web1t/de"; - static String indexPath = "target/test/index"; - - - @BeforeClass - public static void createIndex() - throws Exception - { - index.mkdirs(); - - LuceneIndexer indexer = new LuceneIndexer(source, index); - indexer.index(); - } - - @Test - public void testWithoutRanking() throws CASException, UIMAException { - AnalysisEngineDescription aed = createEngineDescription( - CompoundAnnotator.class, - CompoundAnnotator.PARAM_SPLITTING_ALGO, - createExternalResourceDescription( - LeftToRightSplitterResource.class, - SplitterResource.PARAM_DICT_RESOURCE, - createExternalResourceDescription(SharedDictionary.class), - SplitterResource.PARAM_MORPHEME_RESOURCE, - createExternalResourceDescription(SharedLinkingMorphemes.class))); - String[] splits = new String[] { "Aktion", "s", "plan", "Doppel","prozessormaschine"}; - String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel", "prozessormaschine"}; - runAnnotator(aed, splits, compoundsParts); - } - - @Test - public void testWithAsvToolbox() throws CASException, UIMAException { - AnalysisEngineDescription aed = createEngineDescription( - CompoundAnnotator.class, - CompoundAnnotator.PARAM_SPLITTING_ALGO, - createExternalResourceDescription( - AsvToolboxSplitterResource.class, - AsvToolboxSplitterResource.PARAM_DICT_RESOURCE, - createExternalResourceDescription(SharedDictionary.class), - AsvToolboxSplitterResource.PARAM_MORPHEME_RESOURCE, - createExternalResourceDescription(SharedLinkingMorphemes.class), - AsvToolboxSplitterResource.PARAM_PATRICIA_TRIES_RESOURCE, - createExternalResourceDescription(SharedPatriciaTries.class)), - CompoundAnnotator.PARAM_RANKING_ALGO, - createExternalResourceDescription( - FrequencyRankerResource.class, - RankerResource.PARAM_FINDER_RESOURCE, - createExternalResourceDescription(SharedFinder.class, - SharedFinder.PARAM_INDEX_PATH, indexPath, - SharedFinder.PARAM_NGRAM_LOCATION, jWeb1TPath))); - String[] splits = new String[] { "Aktion", "s", "plan", "Doppel","prozessormaschine", - "prozessor","maschine"}; - String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel", "prozessormaschine", - "prozessor","maschine"}; - runAnnotator(aed, splits, compoundsParts); - } - - - @Test - public void testWithDefaults() throws CASException, UIMAException { - AnalysisEngineDescription aed = createEngineDescription( - CompoundAnnotator.class, - CompoundAnnotator.PARAM_SPLITTING_ALGO, - createExternalResourceDescription( - LeftToRightSplitterResource.class, - SplitterResource.PARAM_DICT_RESOURCE, - createExternalResourceDescription(SharedDictionary.class), - SplitterResource.PARAM_MORPHEME_RESOURCE, - createExternalResourceDescription(SharedLinkingMorphemes.class)), - CompoundAnnotator.PARAM_RANKING_ALGO, - createExternalResourceDescription( - FrequencyRankerResource.class, - RankerResource.PARAM_FINDER_RESOURCE, - createExternalResourceDescription(SharedFinder.class, - SharedFinder.PARAM_INDEX_PATH, indexPath, - SharedFinder.PARAM_NGRAM_LOCATION, jWeb1TPath))); - String[] splits = new String[] { "Aktion", "s", "plan", "Doppel","prozessormaschine", - "prozessor","maschine"}; - String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel", "prozessormaschine", - "prozessor","maschine"}; - runAnnotator(aed, splits, compoundsParts); - } - - private void runAnnotator(AnalysisEngineDescription aed, String[] splits, - String[] compoundsParts) - throws CASException, UIMAException{ - // Create Analysis Engine - AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed); - - // Create cas with token - CAS cas = ae.newCAS(); - TokenBuilder builder = new TokenBuilder(Token.class, - Annotation.class); - builder.buildTokens(cas.getJCas(), "Aktionsplan im Doppelprozessormaschine"); - ae.typeSystemInit(cas.getTypeSystem()); - ae.process(cas); - - String[] compounds = new String[] {"Aktionsplan", "Doppelprozessormaschine"}; - String[] linkingMorphemes = new String[] {"s"}; - - // Check if splits and morphemes are equal - assertThat(getAnnotation(cas.getJCas(), Compound.class), is(compounds)); - assertThat(getAnnotation(cas.getJCas(), Split.class), is(splits)); - assertThat(getAnnotation(cas.getJCas(), CompoundPart.class), is(compoundsParts)); - assertThat(getAnnotation(cas.getJCas(), LinkingMorpheme.class), is(linkingMorphemes)); - } - - protected String[] getAnnotation(JCas aCas, Class aClass) - { - List result = new ArrayList(); - for (T s : JCasUtil.select(aCas, aClass)) { - result.add(s.getCoveredText()); - } - - return result.toArray(new String[] {}); - } - - @AfterClass - public static void tearDown() - throws Exception - { - // Delete index again - for (File f : index.listFiles()) { - for (File _f : f.listFiles()) { - _f.delete(); - } - f.delete(); - } - - index.delete(); - } - -} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/FinderPerformanceTest.java b/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/FinderPerformanceTest.java deleted file mode 100644 index 44befca63e..0000000000 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/FinderPerformanceTest.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ - -package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t; - -import java.io.File; -import java.io.IOException; - -import junit.framework.Assert; - -import org.junit.Test; - -public class FinderPerformanceTest -{ - - /** - * Test the finder performace on the generated web1t index Time is printed - * on the console. - */ - @Test - public void testPerformance1() throws IOException - { - File file = new File("/home/jens/Desktop/web1tIndex4"); - if (!file.exists()) { - return; - } - File jWeb1T = new File("/home/likewise-open/UKP/santos/UKP/Library/DKPro/web1t/de"); - Finder f = new Finder(file, jWeb1T); - String[] words = new String[] { "hallo welt", "wie geht es euch", - "alpha", "zutun", "lasst uns nach hause gehen", "rennen" }; - long time = 0; - - for (String word : words) { - long start = System.currentTimeMillis(); - Assert.assertTrue(f.find(word).size() > 0); - long end = System.currentTimeMillis(); - time += end - start; - System.out.println("Time for '" + word + "' (ms): " - + (end - start)); - } - - System.out.println(" -> Average time (ms): " - + ((float) time / (float) words.length)); - } - - @Test - public void testPerformance2() throws IOException - { - File file = new File("/home/jens/Desktop/web1tIndex4"); - File jWeb1T = new File("/home/likewise-open/UKP/santos/UKP/Library/DKPro/web1t/de"); - if (!file.exists()) { - return; - } - - Finder f = new Finder(file, jWeb1T); - - String[] words = { "filmtauscher", "minimalanforderungen", - "berufungsinstanz" }; - - long time = 0; - long count = 0; - for (String word : words) { - for (int i = 1; i < word.length(); i++) { - String searchFor = word.substring(0, i); - - long start = System.currentTimeMillis(); - f.contains(searchFor); - long end = System.currentTimeMillis(); - - time += end - start; - count++; - - System.out.println("Time for '" + searchFor + "' (ms): " - + (end - start)); - } - } - - System.out.println("Average time (ms): " - + ((float) time / (float) count)); - } -} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/German98DictionaryTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/German98DictionaryTest.java new file mode 100644 index 0000000000..fe007c4f27 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/German98DictionaryTest.java @@ -0,0 +1,51 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.dictionary; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; + +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.dictionary.German98Dictionary; +import org.junit.Test; + +public class German98DictionaryTest +{ + @Test + public void testContains() throws IOException + { + final File affixFile = ResourceUtils.getUrlAsFile( + getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-affix.aff"), + false); + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + final German98Dictionary dict = new German98Dictionary(dictFile, affixFile, "UTF-8"); + assertEquals(298506, dict.getAll().size()); + + assertTrue(dict.contains("hallo")); + assertTrue(dict.contains("versuchen")); + assertTrue(dict.contains("arbeiten")); + assertTrue(dict.contains("arbeit")); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/LinkingMorphemesTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/LinkingMorphemesTest.java new file mode 100644 index 0000000000..5b61acd784 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/LinkingMorphemesTest.java @@ -0,0 +1,60 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.dictionary; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.junit.Test; + +public class LinkingMorphemesTest +{ + + @Test + public void testStringConstructor() + { + LinkingMorphemes l = new LinkingMorphemes("s", "ens"); + assertEquals(2, l.getAll().size()); + assertEquals("s", l.getAll().get(0)); + } + + @Test + public void testListConstructor() + { + List list = new ArrayList(); + list.add("s"); + list.add("ens"); + + LinkingMorphemes l = new LinkingMorphemes(list); + assertEquals(2, l.getAll().size()); + assertEquals("s", l.getAll().get(0)); + } + + @Test + public void testFileConstructor() throws Exception + { + LinkingMorphemes l = new LinkingMorphemes(new File("src/test/resources/dic/morphemes.txt")); + assertEquals(2, l.getAll().size()); + assertEquals("s", l.getAll().get(0)); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/SimpleDictionaryTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/SimpleDictionaryTest.java new file mode 100644 index 0000000000..21faa318e6 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/dictionary/SimpleDictionaryTest.java @@ -0,0 +1,62 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.dictionary; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; + +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.dictionary.SimpleDictionary; +import org.junit.Before; +import org.junit.Test; + +public class SimpleDictionaryTest +{ + private SimpleDictionary dict; + + @Before + public void setUp() throws IOException + { + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + dict = new SimpleDictionary(dictFile, "UTF-8"); + } + + @Test + public void testContains() + { + assertEquals(72508, dict.getAll().size()); + + assertTrue(dict.contains("worauf")); + assertTrue(dict.contains("woraufhin")); + assertTrue(dict.contains("woraus")); + } + + @Test + public void testDictionary() + { + assertThat(dict.getAll()).isNotEmpty(); + assertThat(dict.contains("zu")).isTrue(); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/DummyRankerTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/DummyRankerTest.java similarity index 76% rename from dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/DummyRankerTest.java rename to dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/DummyRankerTest.java index de37bc5f42..0442cb105f 100644 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/DummyRankerTest.java +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/DummyRankerTest.java @@ -16,24 +16,23 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; +package org.dkpro.core.decompounding.ranking; -import java.io.IOException; +import static org.junit.Assert.assertEquals; -import junit.framework.Assert; +import java.io.IOException; +import org.dkpro.core.decompounding.ranking.DummyRanker; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.trie.ValueNode; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; - public class DummyRankerTest { @Test - public void testRankTree() - throws IOException + public void testRankTree() throws IOException { DummyRanker ranker = new DummyRanker(); @@ -46,7 +45,6 @@ public void testRankTree() tree.getRoot().addChild(new ValueNode(s3)); DecompoundedWord result = ranker.highestRank(tree); - Assert.assertEquals(s3, result); + assertEquals(s3, result); } - } diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/FrequencyBasedTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/FrequencyBasedTest.java similarity index 88% rename from dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/FrequencyBasedTest.java rename to dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/FrequencyBasedTest.java index 68e83cdd6d..6b6fddad43 100644 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/FrequencyBasedTest.java +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/FrequencyBasedTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; +package org.dkpro.core.decompounding.ranking; import static org.junit.Assert.assertEquals; @@ -24,18 +24,18 @@ import java.util.ArrayList; import java.util.List; +import org.dkpro.core.decompounding.ranking.FrequencyGeometricMeanRanker; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.web1t.Finder; +import org.dkpro.core.decompounding.web1t.LuceneIndexer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexer; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class FrequencyBasedTest { private static File source; diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/MutualInformationBasedTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/MutualInformationBasedTest.java similarity index 89% rename from dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/MutualInformationBasedTest.java rename to dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/MutualInformationBasedTest.java index 9ad411a30e..43fb57bd3f 100644 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/ranking/MutualInformationBasedTest.java +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/MutualInformationBasedTest.java @@ -16,7 +16,7 @@ * limitations under the License. **/ -package de.tudarmstadt.ukp.dkpro.core.decompounding.ranking; +package org.dkpro.core.decompounding.ranking; import static org.junit.Assert.assertEquals; @@ -25,16 +25,16 @@ import java.util.ArrayList; import java.util.List; +import org.dkpro.core.decompounding.ranking.MutualInformationRanker; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.web1t.Finder; +import org.dkpro.core.decompounding.web1t.LuceneIndexer; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord; -import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundingTree; -import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.Finder; -import de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexer; - public class MutualInformationBasedTest { private static File source; diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/ProbabilityBasedTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/ProbabilityBasedTest.java new file mode 100644 index 0000000000..48bc59abec --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/ranking/ProbabilityBasedTest.java @@ -0,0 +1,103 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.ranking; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.dkpro.core.decompounding.ranking.CompoundProbabilityRanker; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.DecompoundingTree; +import org.dkpro.core.decompounding.trie.ValueNode; +import org.dkpro.core.decompounding.web1t.Finder; +import org.dkpro.core.decompounding.web1t.LuceneIndexer; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class ProbabilityBasedTest +{ + + static File source = new File("src/test/resources/ranking/n-grams"); + static File index = new File("target/test/index"); + static File jWeb1T = new File("src/test/resources/web1t/de"); + + @BeforeClass + public static void createIndex() throws Exception + { + index.mkdirs(); + + LuceneIndexer indexer = new LuceneIndexer(source, index); + indexer.index(); + } + + @Test + public void testRankList() throws IOException + { + CompoundProbabilityRanker ranker = new CompoundProbabilityRanker(new Finder(index, jWeb1T)); + + List list = new ArrayList(); + DecompoundedWord s1 = DecompoundedWord.createFromString("Aktionsplan"); + list.add(s1); + DecompoundedWord s2 = DecompoundedWord.createFromString("Akt+ion(s)+plan"); + list.add(s2); + DecompoundedWord s3 = DecompoundedWord.createFromString("Aktion(s)+plan"); + list.add(s3); + + List result = ranker.rank(list); + assertEquals(s1, result.get(0)); + + assertEquals(s1, ranker.highestRank(list)); + } + + @Test + public void testRankTree() throws IOException + { + CompoundProbabilityRanker ranker = new CompoundProbabilityRanker(new Finder(index, jWeb1T)); + + DecompoundedWord s1 = DecompoundedWord.createFromString("Aktionsplan"); + DecompoundedWord s2 = DecompoundedWord.createFromString("Akt+ion(s)+plan"); + DecompoundedWord s3 = DecompoundedWord.createFromString("Aktion(s)+plan"); + + DecompoundingTree tree = new DecompoundingTree(s1); + tree.getRoot().addChild(new ValueNode(s2)); + tree.getRoot().addChild(new ValueNode(s3)); + + DecompoundedWord result = ranker.highestRank(tree); + assertEquals(s1, result); + } + + @AfterClass + public static void tearDown() throws Exception + { + // Delete index again + for (File f : index.listFiles()) { + for (File _f : f.listFiles()) { + _f.delete(); + } + f.delete(); + } + + index.delete(); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/BananaSplitterTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/BananaSplitterTest.java new file mode 100644 index 0000000000..3429590190 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/BananaSplitterTest.java @@ -0,0 +1,63 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.splitter; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.SimpleDictionary; +import org.dkpro.core.decompounding.splitter.BananaSplitterAlgorithm; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.junit.Test; + +public class BananaSplitterTest +{ + @Test + public void testSplitter() throws IOException + { + BananaSplitterAlgorithm splitter = new BananaSplitterAlgorithm(); + splitter.setDictionary(new SimpleDictionary("Garage", "einfahrt")); + + List result = splitter.split("Garageneinfahrt").getAllSplits(); + + assertEquals(2, result.size()); + assertEquals("Garageneinfahrt", result.get(0).toString()); + assertEquals("garage(n)+einfahrt", result.get(1).toString()); + } + + @Test + public void testSplitter2() throws IOException + { + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + Dictionary dict = new SimpleDictionary(dictFile, "UTF-8"); + BananaSplitterAlgorithm splitter = new BananaSplitterAlgorithm(); + splitter.setDictionary(dict); + + List result = splitter.split("geräteelektronik").getAllSplits(); + + assertThat(result).hasSize(1); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/DataDrivenAlgorithmTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/DataDrivenAlgorithmTest.java new file mode 100644 index 0000000000..fb64f155f9 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/DataDrivenAlgorithmTest.java @@ -0,0 +1,67 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.splitter; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.dictionary.SimpleDictionary; +import org.junit.Test; + +public class DataDrivenAlgorithmTest +{ + @Test + public void testSplit() + { + SimpleDictionary dict = new SimpleDictionary("friedens", "politik", "friedenspolitik", + "friedensverhaltungen", "friedenshaltung", "frittieren", "friseur", "außenpolitik", + "innenpolitik"); + LinkingMorphemes morphemes = new LinkingMorphemes("en", "s", "ens"); + + DataDrivenSplitterAlgorithm algo = new DataDrivenSplitterAlgorithm(dict, morphemes); + List result = algo.split("friedenspolitik").getAllSplits(); + + assertEquals(2, result.size()); + assertEquals("friedenspolitik", result.get(0).toString()); + assertEquals("friedens+politik", result.get(1).toString()); + } + + @Test + public void testSplit2() throws IOException + { + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + SimpleDictionary dict = new SimpleDictionary(dictFile, "UTF-8"); + final File morphemesFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-linking.linking"), + false); + LinkingMorphemes morphemes = new LinkingMorphemes(morphemesFile); + DataDrivenSplitterAlgorithm splitter = new DataDrivenSplitterAlgorithm(dict, morphemes); + List result = splitter.split("geräteelektronik").getAllSplits(); + + assertThat(result).hasSize(1); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/DecompoundedWordTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/DecompoundedWordTest.java new file mode 100644 index 0000000000..2775c27aea --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/DecompoundedWordTest.java @@ -0,0 +1,161 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.splitter; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.Fragment; +import org.junit.Test; + +public class DecompoundedWordTest +{ + + @Test + public void testCreate() + { + DecompoundedWord s = DecompoundedWord.createFromString("aktion(s)+plan"); + + assertEquals("aktion", s.getSplits().get(0).getWord()); + assertEquals("s", s.getSplits().get(0).getMorpheme()); + + assertEquals("plan", s.getSplits().get(1).getWord()); + assertEquals(null, s.getSplits().get(1).getMorpheme()); + } + + @Test + public void testToString() + { + Fragment e1 = new Fragment(); + e1.setWord("aktion"); + e1.setMorpheme("s"); + + Fragment e2 = new Fragment(); + e2.setWord("plan"); + + DecompoundedWord s = new DecompoundedWord(); + s.appendSplitElement(e1); + s.appendSplitElement(e2); + + assertEquals("aktion(s)+plan", s.toString()); + } + + @Test + public void testEquals() + { + Fragment e1 = new Fragment(); + e1.setWord("aktion"); + e1.setMorpheme("s"); + + Fragment e2 = new Fragment(); + e2.setWord("plan"); + + DecompoundedWord s1 = new DecompoundedWord(); + s1.appendSplitElement(e1); + s1.appendSplitElement(e2); + + Fragment e3 = new Fragment(); + e3.setWord("aktion"); + e3.setMorpheme("s"); + + Fragment e4 = new Fragment(); + e4.setWord("plan"); + + DecompoundedWord s2 = new DecompoundedWord(); + s2.appendSplitElement(e3); + s2.appendSplitElement(e4); + + assertTrue(s1.equals(s2)); + + e2.setMorpheme("e"); + assertFalse(s1.equals(s2)); + } + + @Test + public void testEqualsWithoutMorpheme() + { + DecompoundedWord e1 = DecompoundedWord.createFromString("zugang(s)+liste"); + DecompoundedWord e2 = DecompoundedWord.createFromString("zugangs+liste"); + + assertTrue(e1.equalWithoutMorpheme(e2)); + assertTrue(e2.equalWithoutMorpheme(e1)); + } + + @Test + public void testReplaceSplit() + { + DecompoundedWord s = DecompoundedWord.createFromString("Donau+dampfschiff+fahrt"); + s.replaceSplitElement(1, DecompoundedWord.createFromString("dampf+schiff")); + assertEquals("Donau+dampf+schiff+fahrt", s.toString()); + + s = DecompoundedWord.createFromString("Donau+dampfschiff+fahrten"); + s.replaceSplitElement(2, new Fragment("fahrt", "en")); + assertEquals("Donau+dampfschiff+fahrt(en)", s.toString()); + } + + @Test + public void testSort() + { + DecompoundedWord s1 = DecompoundedWord.createFromString("Aktion(s)+plan"); + DecompoundedWord s2 = DecompoundedWord.createFromString("Akt+ion(s)+plan"); + DecompoundedWord s3 = DecompoundedWord.createFromString("Aktionsplan"); + + List splits = new ArrayList(); + splits.add(s1); + splits.add(s2); + splits.add(s3); + + s1.setWeight(2); + s2.setWeight(3); + s3.setWeight(1); + + Collections.sort(splits); + + assertEquals(s2, splits.get(0)); + assertEquals(s1, splits.get(1)); + assertEquals(s3, splits.get(2)); + } + + @Test + public void testIsCompound() + { + DecompoundedWord s1 = DecompoundedWord.createFromString("Aktion(s)+plan"); + DecompoundedWord s2 = DecompoundedWord.createFromString("Aktionsplan"); + + assertThat(s1.isCompound()).isTrue(); + assertThat(s2.isCompound()).isFalse(); + } + + @Test + public void testHasLastFragmentMorpheme() + { + DecompoundedWord s1 = DecompoundedWord.createFromString("Aktion(s)+plan"); + DecompoundedWord s2 = DecompoundedWord.createFromString("unter+flur+konvektor(en)"); + + assertThat(s1.hasLastFragmentMorpheme()).isFalse(); + assertThat(s2.hasLastFragmentMorpheme()).isTrue(); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/FragmentTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/FragmentTest.java new file mode 100644 index 0000000000..a08e27e65a --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/FragmentTest.java @@ -0,0 +1,79 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.splitter; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.dkpro.core.decompounding.splitter.Fragment; +import org.junit.Assert; +import org.junit.Test; + +public class FragmentTest +{ + + @Test + public void testCreate() + { + Fragment e = Fragment.createFromString("aktion(s)"); + + Assert.assertEquals("aktion", e.getWord()); + Assert.assertEquals("s", e.getMorpheme()); + + e = Fragment.createFromString("plan"); + Assert.assertEquals("plan", e.getWord()); + Assert.assertEquals(null, e.getMorpheme()); + } + + @Test + public void testToString() + { + Fragment e = new Fragment(); + e.setWord("aktion"); + e.setMorpheme("s"); + Assert.assertEquals("aktion(s)", e.toString()); + + e.setMorpheme(null); + Assert.assertEquals("aktion", e.toString()); + } + + @Test + public void testEquals() + { + Fragment e1 = new Fragment(); + e1.setWord("aktion"); + e1.setMorpheme("s"); + + Fragment e2 = new Fragment(); + e2.setWord("aktion"); + e2.setMorpheme("s"); + + Assert.assertTrue(e1.equals(e1)); + + e2.setMorpheme(null); + Assert.assertFalse(e1.equals(e2)); + } + + @Test + public void testCreateFromString() + { + Fragment fragm = Fragment.createFromString("("); + + assertThat(fragm.getWord()).isEqualTo("("); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/JWordSplitterTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/JWordSplitterTest.java new file mode 100644 index 0000000000..9c79fd8cf5 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/JWordSplitterTest.java @@ -0,0 +1,75 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.splitter; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.SimpleDictionary; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.JWordSplitterAlgorithm; +import org.junit.Test; + +public class JWordSplitterTest +{ + @Test + public void testSplitter() + throws IOException + { + JWordSplitterAlgorithm splitter = new JWordSplitterAlgorithm(); + List result = splitter.split("Aktionsplan").getAllSplits(); + + assertEquals(2, result.size()); + assertEquals("Aktionsplan", result.get(0).toString()); + assertEquals("Aktion(s)+plan", result.get(1).toString()); + } + + @Test + public void testSplliter2() throws IOException { + JWordSplitterAlgorithm splitter = new JWordSplitterAlgorithm(); + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + + Dictionary dict = new SimpleDictionary(dictFile, "UTF-8"); + splitter.setDictionary(dict); + List result = splitter.split("geräteelektronik").getAllSplits(); + + assertThat(result).hasSize(1); + } + + @Test + public void testSplliter3() throws IOException { + JWordSplitterAlgorithm splitter = new JWordSplitterAlgorithm(); + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + + Dictionary dict = new SimpleDictionary(dictFile, "UTF-8"); + splitter.setDictionary(dict); + List result = splitter.split("Schwerwiegend").getAllSplits(); + + assertThat(result).hasSize(1); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/LeftToRightSplitAlgorithmTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/LeftToRightSplitAlgorithmTest.java new file mode 100644 index 0000000000..aeee010c7d --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/splitter/LeftToRightSplitAlgorithmTest.java @@ -0,0 +1,120 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.splitter; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.dictionary.Dictionary; +import org.dkpro.core.decompounding.dictionary.LinkingMorphemes; +import org.dkpro.core.decompounding.dictionary.SimpleDictionary; +import org.dkpro.core.decompounding.splitter.DecompoundedWord; +import org.dkpro.core.decompounding.splitter.LeftToRightSplitterAlgorithm; +import org.junit.Test; + +public class LeftToRightSplitAlgorithmTest +{ + @Test + public void testSplit1() + { + Dictionary dict = new SimpleDictionary("Akt", "ion", "plan", "Aktion", + "Aktionsplan"); + LinkingMorphemes morphemes = new LinkingMorphemes("s"); + LeftToRightSplitterAlgorithm algo = new LeftToRightSplitterAlgorithm(dict, + morphemes); + + List result = algo.split("Aktionsplan").getAllSplits(); + assertEquals(6, result.size()); + assertEquals("aktionsplan", result.get(0).toString()); + assertEquals("akt+ionsplan", result.get(1).toString()); + assertEquals("akt+ion+splan", result.get(2).toString()); + assertEquals("akt+ion(s)+plan", result.get(3).toString()); + assertEquals("aktion+splan", result.get(4).toString()); + assertEquals("aktion(s)+plan", result.get(5).toString()); + } + + @Test + public void testSplit2() + { + Dictionary dict = new SimpleDictionary("Donau", "dampf", "schiff", + "fahrt", "dampfschiff", "schifffahrt"); + LinkingMorphemes morphemes = new LinkingMorphemes("s"); + LeftToRightSplitterAlgorithm algo = new LeftToRightSplitterAlgorithm(dict, + morphemes); + + List result = algo.split("Donaudampfschifffahrt").getAllSplits(); + assertEquals(6, result.size()); + } + + @Test + public void testSplit3() + { + Dictionary dict = new SimpleDictionary("Super", "mann", "anzug", + "Supermann", "anzug"); + LinkingMorphemes morphemes = new LinkingMorphemes("s"); + LeftToRightSplitterAlgorithm algo = new LeftToRightSplitterAlgorithm(dict, + morphemes); + + List result = algo.split("Supermannanzug").getAllSplits(); + // Super+mann+anzug, Supermann+anzug + assertEquals(4, result.size()); + } + + @Test + public void testMorphemes1() + { + Dictionary dict = new SimpleDictionary("alarm", "reaktion"); + LinkingMorphemes morphemes = new LinkingMorphemes("en"); + LeftToRightSplitterAlgorithm algo = new LeftToRightSplitterAlgorithm(dict, + morphemes); + + List result = algo.split("alarmreaktionen").getAllSplits(); + + // Super+mann+anzug, Supermann+anzug + assertEquals(3, result.size()); + assertEquals("alarmreaktionen", result.get(0).toString()); + assertEquals("alarm+reaktionen", result.get(1).toString()); + assertEquals("alarm+reaktion(en)", result.get(2).toString()); + } + + @Test + public void testSplit4() throws IOException + { + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + final File morphemesFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-linking.linking"), + false); + + Dictionary dict = new SimpleDictionary(dictFile, "UTF-8"); + LinkingMorphemes morphemes = new LinkingMorphemes(morphemesFile); + + LeftToRightSplitterAlgorithm splitter = new LeftToRightSplitterAlgorithm(dict,morphemes); + + List result = splitter.split("geräteelektronik").getAllSplits(); + + assertThat(result).hasSize(1); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TreeNodeTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/trie/TreeNodeTest.java similarity index 94% rename from dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TreeNodeTest.java rename to dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/trie/TreeNodeTest.java index eeace06fee..0a7d89e99f 100644 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/trie/TreeNodeTest.java +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/trie/TreeNodeTest.java @@ -15,12 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.trie; +package org.dkpro.core.decompounding.trie; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import org.dkpro.core.decompounding.trie.KeyValueNode; import org.junit.Test; public class TreeNodeTest diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/trie/TrieTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/trie/TrieTest.java new file mode 100644 index 0000000000..a8c5d7bae2 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/trie/TrieTest.java @@ -0,0 +1,125 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.trie; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; + +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.decompounding.dictionary.German98Dictionary; +import org.dkpro.core.decompounding.trie.TrieStructure; +import org.junit.Test; + +public class TrieTest +{ + + @Test + public void testAddSorted() + { + TrieStructure t = new TrieStructure(); + + t.addWord("abc"); + assertEquals(new Integer(1), t.findWord("a").getValue()); + assertEquals(new Integer(1), t.findWord("ab").getValue()); + assertEquals(new Integer(0), t.findWord("abc").getValue()); + + t.addWord("abcde"); + assertEquals(new Integer(2), t.findWord("a").getValue()); + assertEquals(new Integer(2), t.findWord("ab").getValue()); + assertEquals(new Integer(1), t.findWord("abc").getValue()); + assertEquals(new Integer(1), t.findWord("abcd").getValue()); + assertEquals(new Integer(0), t.findWord("abcde").getValue()); + + t.addWord("abde"); + assertEquals(new Integer(3), t.findWord("a").getValue()); + assertEquals(new Integer(3), t.findWord("ab").getValue()); + assertEquals(new Integer(1), t.findWord("abd").getValue()); + assertEquals(new Integer(0), t.findWord("abde").getValue()); + } + + @Test + public void testAddUnsorted() + { + TrieStructure t = new TrieStructure(); + + t.addWord("abde"); + assertEquals(new Integer(1), t.findWord("a").getValue()); + assertEquals(new Integer(1), t.findWord("ab").getValue()); + assertEquals(new Integer(1), t.findWord("abd").getValue()); + assertEquals(new Integer(0), t.findWord("abde").getValue()); + + t.addWord("abc"); + assertEquals(new Integer(2), t.findWord("a").getValue()); + assertEquals(new Integer(2), t.findWord("ab").getValue()); + assertEquals(new Integer(0), t.findWord("abc").getValue()); + + t.addWord("abcde"); + assertEquals(new Integer(3), t.findWord("a").getValue()); + assertEquals(new Integer(3), t.findWord("ab").getValue()); + assertEquals(new Integer(1), t.findWord("abc").getValue()); + assertEquals(new Integer(1), t.findWord("abcd").getValue()); + assertEquals(new Integer(0), t.findWord("abcde").getValue()); + } + + @Test + public void testSimpleDict() throws IOException + { + final File affixFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-affix.aff"), false); + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + final German98Dictionary dict = new German98Dictionary(dictFile, affixFile, "UTF-8"); + TrieStructure t = TrieStructure.createForDict(dict); + + assertEquals(new Integer(14963), t.findWord("h").getValue()); + assertEquals(new Integer(257), t.findWord("hel").getValue()); + assertEquals(new Integer(0), t.findWord("hallo").getValue()); + + assertEquals(new Integer(8033), t.findWord("t").getValue()); + assertEquals(new Integer(2714), t.findWord("tr").getValue()); + assertEquals(new Integer(996), t.findWord("tra").getValue()); + assertEquals(new Integer(38), t.findWord("tram").getValue()); + + assertEquals(new Integer(11138), t.findWord("w").getValue()); + assertEquals(new Integer(178), t.findWord("wor").getValue()); + assertEquals(new Integer(160), t.findWord("wort").getValue()); + + } + + @Test + public void testSimpleDictReverse() throws IOException + { + final File affixFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-affix.aff"), false); + final File dictFile = ResourceUtils.getUrlAsFile(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling-de-igerman98.dic"), + false); + final German98Dictionary dict = new German98Dictionary(dictFile, affixFile, "UTF-8"); + TrieStructure t = TrieStructure.createForDict(dict); + assertEquals(new Integer(11121), t.findWord("d").getValue()); + assertEquals(new Integer(2494), t.findWord("de").getValue()); + assertEquals(new Integer(69), t.findWord("dei").getValue()); + + assertEquals(new Integer(13809), t.findWord("k").getValue()); + assertEquals(new Integer(2101), t.findWord("o").getValue()); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/uima/annotator/CompoundAnnotatorTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/uima/annotator/CompoundAnnotatorTest.java new file mode 100644 index 0000000000..2a1168b0c0 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/uima/annotator/CompoundAnnotatorTest.java @@ -0,0 +1,196 @@ +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package org.dkpro.core.decompounding.uima.annotator; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.decompounding.uima.resource.AsvToolboxSplitterResource; +import org.dkpro.core.decompounding.uima.resource.FrequencyRankerResource; +import org.dkpro.core.decompounding.uima.resource.LeftToRightSplitterResource; +import org.dkpro.core.decompounding.uima.resource.RankerResource; +import org.dkpro.core.decompounding.uima.resource.SharedDictionary; +import org.dkpro.core.decompounding.uima.resource.SharedFinder; +import org.dkpro.core.decompounding.uima.resource.SharedLinkingMorphemes; +import org.dkpro.core.decompounding.uima.resource.SharedPatriciaTries; +import org.dkpro.core.decompounding.uima.resource.SplitterResource; +import org.dkpro.core.decompounding.web1t.LuceneIndexer; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class CompoundAnnotatorTest +{ + + static File source = new File("src/test/resources/ranking/n-grams"); + static File index = new File("target/test/index"); + static String jWeb1TPath = "src/test/resources/web1t/de"; + static String indexPath = "target/test/index"; + + + @BeforeClass + public static void createIndex() + throws Exception + { + index.mkdirs(); + + LuceneIndexer indexer = new LuceneIndexer(source, index); + indexer.index(); + } + + @Test + public void testWithoutRanking() throws CASException, UIMAException { + AnalysisEngineDescription aed = createEngineDescription( + CompoundAnnotator.class, + CompoundAnnotator.RES_SPLITTING_ALGO, + createResourceDescription( + LeftToRightSplitterResource.class, + SplitterResource.PARAM_DICT_RESOURCE, + createResourceDescription(SharedDictionary.class), + SplitterResource.PARAM_MORPHEME_RESOURCE, + createResourceDescription(SharedLinkingMorphemes.class))); + String[] splits = new String[] { "Aktion", "s", "plan", "Doppel","prozessormaschine"}; + String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel", "prozessormaschine"}; + runAnnotator(aed, splits, compoundsParts); + } + + @Test + public void testWithAsvToolbox() throws CASException, UIMAException { + AnalysisEngineDescription aed = createEngineDescription( + CompoundAnnotator.class, + CompoundAnnotator.RES_SPLITTING_ALGO, + createResourceDescription( + AsvToolboxSplitterResource.class, + AsvToolboxSplitterResource.PARAM_DICT_RESOURCE, + createResourceDescription(SharedDictionary.class), + AsvToolboxSplitterResource.PARAM_MORPHEME_RESOURCE, + createResourceDescription(SharedLinkingMorphemes.class), + AsvToolboxSplitterResource.PARAM_PATRICIA_TRIES_RESOURCE, + createResourceDescription(SharedPatriciaTries.class)), + CompoundAnnotator.RES_RANKING_ALGO, + createResourceDescription( + FrequencyRankerResource.class, + RankerResource.PARAM_FINDER_RESOURCE, + createResourceDescription(SharedFinder.class, + SharedFinder.PARAM_INDEX_PATH, indexPath, + SharedFinder.PARAM_NGRAM_LOCATION, jWeb1TPath))); + String[] splits = new String[] { "Aktion", "s", "plan", "Doppel","prozessormaschine", + "prozessor","maschine"}; + String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel", "prozessormaschine", + "prozessor","maschine"}; + runAnnotator(aed, splits, compoundsParts); + } + + + @Test + public void testWithDefaults() throws CASException, UIMAException { + AnalysisEngineDescription aed = createEngineDescription( + CompoundAnnotator.class, + CompoundAnnotator.RES_SPLITTING_ALGO, + createResourceDescription( + LeftToRightSplitterResource.class, + SplitterResource.PARAM_DICT_RESOURCE, + createResourceDescription(SharedDictionary.class), + SplitterResource.PARAM_MORPHEME_RESOURCE, + createResourceDescription(SharedLinkingMorphemes.class)), + CompoundAnnotator.RES_RANKING_ALGO, + createResourceDescription( + FrequencyRankerResource.class, + RankerResource.PARAM_FINDER_RESOURCE, + createResourceDescription(SharedFinder.class, + SharedFinder.PARAM_INDEX_PATH, indexPath, + SharedFinder.PARAM_NGRAM_LOCATION, jWeb1TPath))); + String[] splits = new String[] { "Aktion", "s", "plan", "Doppel","prozessormaschine", + "prozessor","maschine"}; + String[] compoundsParts = new String[] { "Aktion", "plan", "Doppel", "prozessormaschine", + "prozessor","maschine"}; + runAnnotator(aed, splits, compoundsParts); + } + + private void runAnnotator(AnalysisEngineDescription aed, String[] splits, + String[] compoundsParts) + throws CASException, UIMAException + { + // Create Analysis Engine + AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed); + + // Create cas with token + CAS cas = ae.newCAS(); + TokenBuilder builder = new TokenBuilder(Token.class, + Annotation.class); + builder.buildTokens(cas.getJCas(), "Aktionsplan im Doppelprozessormaschine"); + ae.typeSystemInit(cas.getTypeSystem()); + ae.process(cas); + + // Check if splits and morphemes are equal + assertThat(getAnnotation(cas.getJCas(), Compound.class)) + .containsExactly("Aktionsplan", "Doppelprozessormaschine"); + assertThat(getAnnotation(cas.getJCas(), Split.class)) + .containsExactly(splits); + assertThat(getAnnotation(cas.getJCas(), CompoundPart.class)) + .containsExactly(compoundsParts); + assertThat(getAnnotation(cas.getJCas(), LinkingMorpheme.class)) + .containsExactly("s"); + } + + protected String[] getAnnotation(JCas aCas, Class aClass) + { + List result = new ArrayList(); + for (T s : JCasUtil.select(aCas, aClass)) { + result.add(s.getCoveredText()); + } + + return result.toArray(new String[] {}); + } + + @AfterClass + public static void tearDown() + throws Exception + { + // Delete index again + for (File f : index.listFiles()) { + for (File _f : f.listFiles()) { + _f.delete(); + } + f.delete(); + } + + index.delete(); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/FinderPerformanceTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/FinderPerformanceTest.java new file mode 100644 index 0000000000..2ff0e6cb02 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/FinderPerformanceTest.java @@ -0,0 +1,97 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package org.dkpro.core.decompounding.web1t; + +import java.io.File; +import java.io.IOException; + +import org.dkpro.core.decompounding.web1t.Finder; +import org.junit.Test; + +import junit.framework.Assert; + +public class FinderPerformanceTest +{ + + /** + * Test the finder performace on the generated web1t index Time is printed + * on the console. + */ + @Test + public void testPerformance1() throws IOException + { + File file = new File("/home/jens/Desktop/web1tIndex4"); + if (!file.exists()) { + return; + } + File jWeb1T = new File("/home/likewise-open/UKP/santos/UKP/Library/DKPro/web1t/de"); + Finder f = new Finder(file, jWeb1T); + String[] words = new String[] { "hallo welt", "wie geht es euch", + "alpha", "zutun", "lasst uns nach hause gehen", "rennen" }; + long time = 0; + + for (String word : words) { + long start = System.currentTimeMillis(); + Assert.assertTrue(f.find(word).size() > 0); + long end = System.currentTimeMillis(); + time += end - start; + System.out.println("Time for '" + word + "' (ms): " + + (end - start)); + } + + System.out.println(" -> Average time (ms): " + + ((float) time / (float) words.length)); + } + + @Test + public void testPerformance2() throws IOException + { + File file = new File("/home/jens/Desktop/web1tIndex4"); + File jWeb1T = new File("/home/likewise-open/UKP/santos/UKP/Library/DKPro/web1t/de"); + if (!file.exists()) { + return; + } + + Finder f = new Finder(file, jWeb1T); + + String[] words = { "filmtauscher", "minimalanforderungen", + "berufungsinstanz" }; + + long time = 0; + long count = 0; + for (String word : words) { + for (int i = 1; i < word.length(); i++) { + String searchFor = word.substring(0, i); + + long start = System.currentTimeMillis(); + f.contains(searchFor); + long end = System.currentTimeMillis(); + + time += end - start; + count++; + + System.out.println("Time for '" + searchFor + "' (ms): " + + (end - start)); + } + } + + System.out.println("Average time (ms): " + + ((float) time / (float) count)); + } +} diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/FinderTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/FinderTest.java similarity index 93% rename from dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/FinderTest.java rename to dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/FinderTest.java index 0dc80f581c..977c5f1f06 100644 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/FinderTest.java +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/FinderTest.java @@ -15,18 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t; +package org.dkpro.core.decompounding.web1t; import static org.junit.Assert.assertEquals; import java.io.File; import java.util.List; +import org.dkpro.core.decompounding.web1t.Finder; +import org.dkpro.core.decompounding.web1t.LuceneIndexer; +import org.dkpro.core.decompounding.web1t.NGramModel; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class FinderTest { private File source = new File("src/test/resources/n-grams"); @@ -112,7 +114,7 @@ public void testFinder2() throws Exception } index.delete(); - } + } @Rule public DkproTestContext testContext = new DkproTestContext(); diff --git a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/LuceneIndexerTest.java b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/LuceneIndexerTest.java similarity index 80% rename from dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/LuceneIndexerTest.java rename to dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/LuceneIndexerTest.java index 8d9c3c7bcb..12d189e05e 100644 --- a/dkpro-core-decompounding-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/decompounding/web1t/LuceneIndexerTest.java +++ b/dkpro-core-decompounding-asl/src/test/java/org/dkpro/core/decompounding/web1t/LuceneIndexerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.decompounding.web1t; +package org.dkpro.core.decompounding.web1t; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -25,26 +25,27 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.store.FSDirectory; +import org.dkpro.core.decompounding.web1t.Finder; +import org.dkpro.core.decompounding.web1t.LuceneIndexer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.After; import org.junit.Before; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class LuceneIndexerTest { private File testOutput; - private File source; - private File index; - private File targetIndex0; - private File targetIndex1; - private File jWeb1T; + private File source; + private File index; + private File targetIndex0; + private File targetIndex1; + private File jWeb1T; - @Before - public void setUp() - throws Exception - { + @Before + public void setUp() + throws Exception + { source = new File("src/test/resources/n-grams"); jWeb1T = new File("src/test/resources/web1t/de"); @@ -59,7 +60,7 @@ public void setUp() // Create index LuceneIndexer indexer = new LuceneIndexer(source, index, 2); indexer.index(); - } + } @Test public void testSearch() throws Exception @@ -95,21 +96,21 @@ public void testData() throws Exception } } - @After - public void tearDown() - throws Exception - { - // Delete index again - for (File f : index.listFiles()) { - for (File _f : f.listFiles()) { - _f.delete(); - } - f.delete(); - } + @After + public void tearDown() + throws Exception + { + // Delete index again + for (File f : index.listFiles()) { + for (File _f : f.listFiles()) { + _f.delete(); + } + f.delete(); + } - index.delete(); - } - + index.delete(); + } + @Rule public DkproTestContext testContext = new DkproTestContext(); } diff --git a/dkpro-core-decompounding-asl/src/test/resources/log4j.properties b/dkpro-core-decompounding-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-decompounding-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-decompounding-asl/src/test/resources/log4j2.xml b/dkpro-core-decompounding-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-decompounding-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-dictionaryannotator-asl/pom.xml b/dkpro-core-dictionaryannotator-asl/pom.xml index 5a8121cd02..d16e72488f 100644 --- a/dkpro-core-dictionaryannotator-asl/pom.xml +++ b/dkpro-core-dictionaryannotator-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.dictionaryannotator-asl + dkpro-core-dictionaryannotator-asl jar DKPro Core ASL - Dictionary Annotator + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -49,20 +50,24 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.semantics-asl + org.dkpro.core + dkpro-core-api-semantics-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -70,19 +75,36 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-ner-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl test + + + + eu.openminted.share.annotations + omtd-share-annotations-maven-plugin + + + + **/SemanticFieldAnnotator.xml + + + + + diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/DictionaryAnnotator.java b/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/DictionaryAnnotator.java deleted file mode 100644 index 8dd529012f..0000000000 --- a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/DictionaryAnnotator.java +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator; - -import static org.apache.uima.fit.util.CasUtil.getType; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.io.IOUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Takes a plain text file with phrases as input and annotates the phrases in the CAS file. The - * annotation type defaults to {@link NGram}, but can be changed. - * - * The component requires that {@link Token}s and {@link Sentence}es are annotated in the CAS. - * - * The format of the phrase file is one phrase per line, tokens are separated by space: - * - *
- * this is a phrase
- * another phrase
- * 
- */ -@ResourceMetaData(name="Dictionary Annotator") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class DictionaryAnnotator - extends JCasAnnotator_ImplBase -{ - /** - * The file must contain one phrase per line - phrases will be split at " " - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) - private String phraseFile; - - /** - * The character encoding used by the model. - */ - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue="UTF-8") - private String modelEncoding; - - /** - * The annotation to create on matching phases. If nothing is specified, this defaults to - * {@link NGram}. - */ - public static final String PARAM_ANNOTATION_TYPE = "annotationType"; - @ConfigurationParameter(name = PARAM_ANNOTATION_TYPE, mandatory = false) - private String annotationType; - - /** - * Set this feature on the created annotations. - */ - public static final String PARAM_VALUE_FEATURE = "valueFeature"; - @ConfigurationParameter(name = PARAM_VALUE_FEATURE, mandatory = false, defaultValue = "value") - private String valueFeature; - - /** - * The value to set the feature configured in {@link #PARAM_VALUE_FEATURE} to. - */ - public static final String PARAM_VALUE = "value"; - @ConfigurationParameter(name = PARAM_VALUE, mandatory = false) - private String value; - - private PhraseTree phrases; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - if (annotationType == null) { - annotationType = NGram.class.getName(); - } - - phrases = new PhraseTree(); - - InputStream is = null; - try { - URL phraseFileUrl = ResourceUtils.resolveLocation(phraseFile, aContext); - is = phraseFileUrl.openStream(); - for (String inputLine : IOUtils.readLines(is, modelEncoding)) { - String[] phraseSplit = inputLine.split(" "); - phrases.addPhrase(phraseSplit); - } - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - finally { - IOUtils.closeQuietly(is); - } - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - Type type = getType(jcas.getCas(), annotationType); - - Feature f = null; - if ((valueFeature != null) && (value != null)) { - f = type.getFeatureByBaseName(valueFeature); - if (f == null) { - throw new IllegalArgumentException("Undeclared feature [" + valueFeature - + "] in type [" + annotationType + "]"); - } - } - - for (Sentence currSentence : select(jcas, Sentence.class)) { - ArrayList tokens = new ArrayList(selectCovered(Token.class, currSentence)); - - for (int i = 0; i < tokens.size(); i++) { - List tokensToSentenceEnd = tokens.subList(i, tokens.size() - 1); - String[] sentenceToEnd = new String[tokens.size()]; - - for (int j = 0; j < tokensToSentenceEnd.size(); j++) { - sentenceToEnd[j] = tokensToSentenceEnd.get(j).getText(); - } - - String[] longestMatch = phrases.getLongestMatch(sentenceToEnd); - - if (longestMatch != null) { - Token beginToken = tokens.get(i); - Token endToken = tokens.get(i + longestMatch.length - 1); - - AnnotationFS newFound = jcas.getCas().createAnnotation(type, - beginToken.getBegin(), endToken.getEnd()); - - if (f != null) { - newFound.setFeatureValueFromString(f, value); - } - - jcas.getCas().addFsToIndexes(newFound); - } - } - } - } -} diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTreeElement.java b/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTreeElement.java deleted file mode 100644 index 854356c9b6..0000000000 --- a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTreeElement.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator; - -import java.util.HashMap; -import java.util.Map; - -/** - */ -public class PhraseTreeElement -{ - private String word; - - private boolean endElement; - - private Map children; - - public PhraseTreeElement(String aWord) - { - word = aWord; - - children = new HashMap(); - } - - public String getWord() - { - return word; - } - - public PhraseTreeElement addChild(String aWord) - { - // do not add if it exists - PhraseTreeElement child = getChild(aWord); - - if (child == null) { - child = new PhraseTreeElement(aWord); - children.put(aWord, child); - } - - return child; - } - - public PhraseTreeElement getChild(String aWord) - { - return children.get(aWord); - } - - public boolean isEndElement() - { - return endElement; - } - - public void setEndElement(boolean aEndElement) - { - endElement = aEndElement; - } -} diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/package-info.java b/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/package-info.java deleted file mode 100644 index 48b699087a..0000000000 --- a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Annotate words and multi-words using a dictionary. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator; diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticTagResource.java b/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticTagResource.java deleted file mode 100644 index 07a835d467..0000000000 --- a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticTagResource.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator.semantictagging; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.URL; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.fit.component.Resource_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.resource.ResourceAccessException; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - - -/** - * - * This shared resource can be added as ExternalResource in Analysis Engines - * that annotate tokens with semantic tags looked up in a key-value map - * e.g., to annotate common nouns with semantic field information from WordNet. - * - * - */ - -public class SemanticTagResource - extends Resource_ImplBase - implements SemanticTagProvider -{ - - public final static String PARAM_RESOURCE_PATH = "resourcePath"; - @ConfigurationParameter(name = PARAM_RESOURCE_PATH, mandatory = true) - // TODO add default like: defaultValue = "classpath:de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling/de/igerman98/de_DE_igerman98.dic" - private String resourcePath; - - private Map keySemanticTagMap= new HashMap(); - - @Override - public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - try { - final URL uri = ResourceUtils.resolveLocation(resourcePath, this, null); - readFileToMap(new BufferedReader(new InputStreamReader(uri.openStream()))); - - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - - return true; - - } - - - @Override - public String getSemanticTag(Token token) throws ResourceAccessException { - - try { - if (keySemanticTagMap.containsKey(token.getLemma().getValue())) { - return keySemanticTagMap.get(token.getLemma().getValue()); - } else { - return "UNKNOWN"; - } - } catch (Exception e) { - throw new ResourceAccessException(e); - } - } - - @Override - public String getSemanticTag(List tokens) throws ResourceAccessException { - - List lemmas = new ArrayList(); - for (Token token : tokens) { - lemmas.add(token.getLemma().getValue()); - } - String lemmaString = StringUtils.join(lemmas, " "); - - try { - if (keySemanticTagMap.containsKey(lemmaString)) { - return keySemanticTagMap.get(lemmaString); - } else { - return "UNKNOWN"; - } - } catch (Exception e) { - throw new ResourceAccessException(e); - } - } - - - - private void readFileToMap(BufferedReader bufferedReader) throws IOException { - String line; - - while((line = bufferedReader.readLine())!=null){ - String temp[] = line.split("\t"); - String key = temp[0]; - String semField = temp[1]; - System.out.println(line); - keySemanticTagMap.put(key, semField); - } - } - - -} diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/package-info.java b/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/package-info.java deleted file mode 100644 index 9eea7fc3a7..0000000000 --- a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * - * Interface that provides access to semantic tags from various resources (UBY, key-value maps ...) - * - * - */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator.semantictagging; \ No newline at end of file diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/DictionaryAnnotator.java b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/DictionaryAnnotator.java new file mode 100644 index 0000000000..77c343d766 --- /dev/null +++ b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/DictionaryAnnotator.java @@ -0,0 +1,186 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.dictionaryannotator; + +import static org.apache.uima.fit.util.CasUtil.getType; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ResourceUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Takes a plain text file with phrases as input and annotates the phrases in the CAS file. The + * annotation type defaults to {@link NGram}, but can be changed. + * + * The component requires that {@link Token}s and {@link Sentence}es are annotated in the CAS. + * + * The format of the phrase file is one phrase per line, tokens are separated by space: + * + *
+ * this is a phrase
+ * another phrase
+ * 
+ */ +@Component(OperationType.GAZETEER_BASED_MATCHER) +@ResourceMetaData(name = "Dictionary Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) +public class DictionaryAnnotator + extends JCasAnnotator_ImplBase +{ + /** + * The file must contain one phrase per line - phrases will be split at " " + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) + private String phraseFile; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") + private String modelEncoding; + + /** + * The annotation to create on matching phases. If nothing is specified, this defaults to + * {@link NGram}. + */ + public static final String PARAM_ANNOTATION_TYPE = "annotationType"; + @ConfigurationParameter(name = PARAM_ANNOTATION_TYPE, mandatory = false) + private String annotationType; + + /** + * Set this feature on the created annotations. + */ + public static final String PARAM_VALUE_FEATURE = "valueFeature"; + @ConfigurationParameter(name = PARAM_VALUE_FEATURE, mandatory = false, defaultValue = "value") + private String valueFeature; + + /** + * The value to set the feature configured in {@link #PARAM_VALUE_FEATURE} to. + */ + public static final String PARAM_VALUE = "value"; + @ConfigurationParameter(name = PARAM_VALUE, mandatory = false) + private String value; + + private PhraseTree phrases; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + if (annotationType == null) { + annotationType = NGram.class.getName(); + } + + phrases = new PhraseTree(); + + InputStream is = null; + try { + URL phraseFileUrl = ResourceUtils.resolveLocation(phraseFile, aContext); + is = phraseFileUrl.openStream(); + for (String inputLine : IOUtils.readLines(is, modelEncoding)) { + String[] phraseSplit = inputLine.split(" "); + phrases.addPhrase(phraseSplit); + } + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + finally { + IOUtils.closeQuietly(is); + } + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + Type type = getType(jcas.getCas(), annotationType); + + Feature f = null; + if ((valueFeature != null) && (value != null)) { + f = type.getFeatureByBaseName(valueFeature); + if (f == null) { + throw new IllegalArgumentException("Undeclared feature [" + valueFeature + + "] in type [" + annotationType + "]"); + } + } + + for (Sentence currSentence : select(jcas, Sentence.class)) { + List tokens = new ArrayList<>(selectCovered(Token.class, currSentence)); + + for (int i = 0; i < tokens.size(); i++) { + List tokensToSentenceEnd = tokens.subList(i, tokens.size()); + String[] sentenceToEnd = new String[tokens.size()]; + + for (int j = 0; j < tokensToSentenceEnd.size(); j++) { + sentenceToEnd[j] = tokensToSentenceEnd.get(j).getText(); + } + + String[] longestMatch = phrases.getLongestMatch(sentenceToEnd); + + if (longestMatch != null) { + Token beginToken = tokens.get(i); + Token endToken = tokens.get(i + longestMatch.length - 1); + + AnnotationFS newFound = jcas.getCas().createAnnotation(type, + beginToken.getBegin(), endToken.getEnd()); + + if (f != null) { + newFound.setFeatureValueFromString(f, value); + } + + jcas.getCas().addFsToIndexes(newFound); + } + } + } + } +} diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTree.java b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/PhraseTree.java similarity index 98% rename from dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTree.java rename to dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/PhraseTree.java index 722449c7f6..4979da9025 100644 --- a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTree.java +++ b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/PhraseTree.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator; +package org.dkpro.core.dictionaryannotator; import java.util.Arrays; diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/PhraseTreeElement.java b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/PhraseTreeElement.java new file mode 100644 index 0000000000..055c1ae70a --- /dev/null +++ b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/PhraseTreeElement.java @@ -0,0 +1,72 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.dictionaryannotator; + +import java.util.HashMap; +import java.util.Map; + +/** + */ +public class PhraseTreeElement +{ + private String word; + + private boolean endElement; + + private Map children; + + public PhraseTreeElement(String aWord) + { + word = aWord; + + children = new HashMap(); + } + + public String getWord() + { + return word; + } + + public PhraseTreeElement addChild(String aWord) + { + // do not add if it exists + PhraseTreeElement child = getChild(aWord); + + if (child == null) { + child = new PhraseTreeElement(aWord); + children.put(aWord, child); + } + + return child; + } + + public PhraseTreeElement getChild(String aWord) + { + return children.get(aWord); + } + + public boolean isEndElement() + { + return endElement; + } + + public void setEndElement(boolean aEndElement) + { + endElement = aEndElement; + } +} diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/package-info.java b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/package-info.java new file mode 100644 index 0000000000..97e7d2b03a --- /dev/null +++ b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Annotate words and multi-words using a dictionary. + * + * @since 1.1.0 + */ +package org.dkpro.core.dictionaryannotator; diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotator.java b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotator.java similarity index 85% rename from dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotator.java rename to dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotator.java index 87042bf3b0..29f2f8a5f1 100644 --- a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotator.java +++ b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotator.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator.semantictagging; +package org.dkpro.core.dictionaryannotator.semantictagging; import java.util.Collection; import java.util.Collections; @@ -36,26 +36,34 @@ import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NamedEntity" }) /** * This Analysis Engine annotates * English single words with semantic field information retrieved from an ExternalResource. * This could be a lexical resource such as WordNet or a simple key-value map. * The annotation is stored in the SemanticField annotation type. */ -@ResourceMetaData(name="Semantic Field Annotator") +@Component(OperationType.MATCHER) +@ResourceMetaData(name = "Semantic Field Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) public class SemanticFieldAnnotator extends JCasAnnotator_ImplBase { - public static final String PARAM_SEMANTIC_FIELD_RESOURCE = "semanticFieldResource"; - @ExternalResource(key = PARAM_SEMANTIC_FIELD_RESOURCE) + /** + * The semantic resource to use. + */ + public static final String RES_SEMANTIC_FIELD_RESOURCE = "semanticFieldResource"; + @ExternalResource(key = RES_SEMANTIC_FIELD_RESOURCE) private SemanticTagResource semanticFieldResource; // TODO a parameter for the language would be good @@ -70,8 +78,8 @@ public class SemanticFieldAnnotator /** * A constraint on the annotations that should be considered in form of a JXPath statement. * Example: set {@link #PARAM_ANNOTATION_TYPE} to a {@code NamedEntity} type and set the - * {@link #PARAM_CONSTRAINT} to {@code ".[value = 'LOCATION']"} to annotate only tokens with semantic fields that are - * part of a location named entity. + * {@link #PARAM_CONSTRAINT} to {@code ".[value = 'LOCATION']"} to annotate only tokens with + * semantic fields that are part of a location named entity. */ public static final String PARAM_CONSTRAINT = "constraint"; @ConfigurationParameter(name = PARAM_CONSTRAINT, mandatory = false) diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticTagProvider.java b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticTagProvider.java similarity index 96% rename from dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticTagProvider.java rename to dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticTagProvider.java index fd942d2ac8..0e7f1085a6 100644 --- a/dkpro-core-dictionaryannotator-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticTagProvider.java +++ b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticTagProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator.semantictagging; +package org.dkpro.core.dictionaryannotator.semantictagging; import java.util.List; diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticTagResource.java b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticTagResource.java new file mode 100644 index 0000000000..5c1a44697d --- /dev/null +++ b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticTagResource.java @@ -0,0 +1,129 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.dictionaryannotator.semantictagging; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.fit.component.Resource_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.resource.ResourceAccessException; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.api.resources.ResourceUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + + +/** + * + * This shared resource can be added as ExternalResource in Analysis Engines + * that annotate tokens with semantic tags looked up in a key-value map + * e.g., to annotate common nouns with semantic field information from WordNet. + */ +public class SemanticTagResource + extends Resource_ImplBase + implements SemanticTagProvider +{ + + public final static String PARAM_RESOURCE_PATH = "resourcePath"; + @ConfigurationParameter(name = PARAM_RESOURCE_PATH, mandatory = true) + // TODO add default like: defaultValue = "classpath:de/tudarmstadt/ukp/dkpro/core/decompounding/lib/spelling/de/igerman98/de_DE_igerman98.dic" + private String resourcePath; + + private Map keySemanticTagMap = new HashMap(); + + @Override + public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + try { + final URL uri = ResourceUtils.resolveLocation(resourcePath, this, null); + readFileToMap(new BufferedReader(new InputStreamReader(uri.openStream()))); + + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + + return true; + + } + + + @Override + public String getSemanticTag(Token token) throws ResourceAccessException { + + try { + if (keySemanticTagMap.containsKey(token.getLemma().getValue())) { + return keySemanticTagMap.get(token.getLemma().getValue()); + } else { + return "UNKNOWN"; + } + } catch (Exception e) { + throw new ResourceAccessException(e); + } + } + + @Override + public String getSemanticTag(List tokens) throws ResourceAccessException { + + List lemmas = new ArrayList(); + for (Token token : tokens) { + lemmas.add(token.getLemma().getValue()); + } + String lemmaString = StringUtils.join(lemmas, " "); + + try { + if (keySemanticTagMap.containsKey(lemmaString)) { + return keySemanticTagMap.get(lemmaString); + } else { + return "UNKNOWN"; + } + } catch (Exception e) { + throw new ResourceAccessException(e); + } + } + + + + private void readFileToMap(BufferedReader bufferedReader) throws IOException { + String line; + + while ((line = bufferedReader.readLine()) != null) { + String[] temp = line.split("\t"); + String key = temp[0]; + String semField = temp[1]; + System.out.println(line); + keySemanticTagMap.put(key, semField); + } + } + + +} diff --git a/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/package-info.java b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/package-info.java new file mode 100644 index 0000000000..2092d515e9 --- /dev/null +++ b/dkpro-core-dictionaryannotator-asl/src/main/java/org/dkpro/core/dictionaryannotator/semantictagging/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Interface that provides access to semantic tags from various resources (UBY, key-value maps ...) + */ +package org.dkpro.core.dictionaryannotator.semantictagging; diff --git a/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/DictionaryAnnotatorTest.java b/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/DictionaryAnnotatorTest.java deleted file mode 100644 index 218493c0df..0000000000 --- a/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/DictionaryAnnotatorTest.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.selectSingle; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import org.apache.commons.lang3.exception.ExceptionUtils; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.fit.testing.factory.TokenBuilder; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -public class DictionaryAnnotatorTest -{ - @Test - public void test() throws Exception - { - AnalysisEngine ae = createEngine(DictionaryAnnotator.class, - DictionaryAnnotator.PARAM_ANNOTATION_TYPE, NamedEntity.class, - DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); - - JCas jcas = JCasFactory.createJCas(); - TokenBuilder tb = new TokenBuilder(Token.class, Sentence.class); - tb.buildTokens(jcas, "I am John Silver 's ghost ."); - - ae.process(jcas); - - NamedEntity ne = selectSingle(jcas, NamedEntity.class); - assertEquals("John Silver", ne.getCoveredText()); - } - - @Test - public void testWithValue() throws Exception - { - AnalysisEngine ae = createEngine(DictionaryAnnotator.class, - DictionaryAnnotator.PARAM_ANNOTATION_TYPE, NamedEntity.class, - DictionaryAnnotator.PARAM_VALUE, "PERSON", - DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); - - JCas jcas = JCasFactory.createJCas(); - TokenBuilder tb = new TokenBuilder(Token.class, Sentence.class); - tb.buildTokens(jcas, "I am John Silver 's ghost ."); - - ae.process(jcas); - - NamedEntity ne = selectSingle(jcas, NamedEntity.class); - assertEquals("PERSON", ne.getValue()); - assertEquals("John Silver", ne.getCoveredText()); - } - - @Test - public void testWithWrongType() throws Exception - { - try { - AnalysisEngine ae = createEngine(DictionaryAnnotator.class, - DictionaryAnnotator.PARAM_ANNOTATION_TYPE, "lala", - DictionaryAnnotator.PARAM_VALUE, "PERSON", - DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); - - JCas jcas = JCasFactory.createJCas(); - TokenBuilder tb = new TokenBuilder(Token.class, Sentence.class); - tb.buildTokens(jcas, "I am John Silver 's ghost ."); - - ae.process(jcas); - fail("An exception for an undeclared type should have been thrown"); - } - catch (AnalysisEngineProcessException e) { - assertTrue(ExceptionUtils.getRootCauseMessage(e).contains("Undeclared type")); - } - } - - @Test - public void testWithWrongValueFeature() throws Exception - { - try { - AnalysisEngine ae = createEngine(DictionaryAnnotator.class, - DictionaryAnnotator.PARAM_ANNOTATION_TYPE, NamedEntity.class, - DictionaryAnnotator.PARAM_VALUE_FEATURE, "lala", - DictionaryAnnotator.PARAM_VALUE, "PERSON", - DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); - - JCas jcas = JCasFactory.createJCas(); - TokenBuilder tb = new TokenBuilder(Token.class, Sentence.class); - tb.buildTokens(jcas, "I am John Silver 's ghost ."); - - ae.process(jcas); - fail("An exception for an undeclared type should have been thrown"); - } - catch (AnalysisEngineProcessException e) { - assertTrue(ExceptionUtils.getRootCauseMessage(e).contains("Undeclared feature")); - } - } -} diff --git a/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTreeTest.java b/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTreeTest.java deleted file mode 100644 index 66c9fc28be..0000000000 --- a/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/PhraseTreeTest.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator; - - -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; - -import org.junit.Before; -import org.junit.Test; - -public class PhraseTreeTest -{ - private PhraseTree phrases; - - @Before - public void setUp() - throws Exception - { - phrases = new PhraseTree(); - - phrases.addPhrase("the red dog".split(" ")); - phrases.addPhrase("the red".split(" ")); - phrases.addPhrase("the new kid".split(" ")); - phrases.addPhrase("a".split(" ")); - } - - @Test - public void containsTest() - throws Exception - { - assertFalse(phrases.contains("the".split(" "))); - assertFalse(phrases.contains("the new".split(" "))); - assertFalse(phrases.contains("the new BUNNY".split(" "))); - assertFalse(phrases.contains("the red dog barks".split(" "))); - assertTrue(phrases.contains("a".split(" "))); - assertTrue(phrases.contains("the red dog".split(" "))); - assertTrue(phrases.contains("the red".split(" "))); - assertTrue(phrases.contains("the new kid".split(" "))); - } - - @Test - public void matchTest() - throws Exception - { - String[] sentence = "the red dog whines".split(" "); - String[] longestMatch = phrases.getLongestMatch(sentence); - - assertArrayEquals(longestMatch, "the red dog".split(" ")); - - sentence = "the".split(" "); - assertNull(phrases.getLongestMatch(sentence)); - - sentence = "red dog".split(" "); - assertNull(phrases.getLongestMatch(sentence)); - - sentence = "the new".split(" "); - assertNull(phrases.getLongestMatch(sentence)); - } -} diff --git a/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotatorTest.java b/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotatorTest.java deleted file mode 100644 index 1602982b49..0000000000 --- a/dkpro-core-dictionaryannotator-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotatorTest.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator.semantictagging; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.testing.factory.TokenBuilder; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; - - - -/** - * - */ -public class SemanticFieldAnnotatorTest { - - @Test - public void test() - throws Exception - { - runTest("en", "Vanilla in the sky prefers braveness over jumpiness .", - new String[] { "vanilla", "in", "the", "sky", "prefer", "braveness", "over", "jumpiness", "." }, - new String[] { "NN", "NOT_RELEVANT", "NOT_RELEVANT", "NN", "NOT_RELEVANT", "NN", "NOT_RELEVANT", "NN", "$." }, - new String[] { "plant", "object", "attribute", "feeling" }); - - runTest("en", "Vanilla in the distantGalaxyBehindJupiter prefers braveness over jumpiness .", - new String[] { "vanilla", "in", "the", "distantGalaxyBehindJupiter", "prefer", "braveness", "over", "jumpiness", "." }, - new String[] { "NN", "NOT_RELEVANT", "NOT_RELEVANT", "NN", "NOT_RELEVANT", "NN", "NOT_RELEVANT", "NN", "$." }, - new String[] { "plant", "UNKNOWN", "attribute", "feeling" }); - } - - private void runTest(String language, String testDocument, String[] documentLemmas, - String[] documentPosTags, String[] documentNounSemanticFields) - throws UIMAException - { - - AnalysisEngineDescription processor = createEngineDescription( - - createEngineDescription( - SemanticFieldAnnotator.class, - SemanticFieldAnnotator.PARAM_ANNOTATION_TYPE, Token.class, - SemanticFieldAnnotator.PARAM_CONSTRAINT, ".[pos/posValue = 'NN']", - SemanticFieldAnnotator.PARAM_SEMANTIC_FIELD_RESOURCE, - createExternalResourceDescription(SemanticTagResource.class, - SemanticTagResource.PARAM_RESOURCE_PATH, - "src/test/resources/nounSemanticFieldMapTest.txt"))); - - AnalysisEngine engine = createEngine(processor); - JCas aJCas = engine.newJCas(); - aJCas.setDocumentLanguage(language); - - TokenBuilder tb = new TokenBuilder(Token.class, - Sentence.class); - tb.buildTokens(aJCas, testDocument); - - int offset = 0; - for (Token token : JCasUtil.select(aJCas, Token.class)) { - - if (documentPosTags[offset].matches("NN")) { - POS_NOUN nn = new POS_NOUN(aJCas, token.getBegin(), token.getEnd()); - nn.setPosValue(documentPosTags[offset]); - nn.addToIndexes(); - token.setPos(nn); - } - else { - POS pos = new POS(aJCas, token.getBegin(), token.getEnd()); - pos.setPosValue(documentPosTags[offset]); - pos.addToIndexes(); - token.setPos(pos); - } - - Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(documentLemmas[offset]); - lemma.addToIndexes(); - token.setLemma(lemma); - - offset++; - } - engine.process(aJCas); - - AssertAnnotations.assertSemanticField(documentNounSemanticFields, - select(aJCas, SemanticField.class)); - } -} diff --git a/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/DictionaryAnnotatorTest.java b/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/DictionaryAnnotatorTest.java new file mode 100644 index 0000000000..94efc7b972 --- /dev/null +++ b/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/DictionaryAnnotatorTest.java @@ -0,0 +1,138 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.dictionaryannotator; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.apache.commons.lang3.exception.ExceptionUtils; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.dictionaryannotator.DictionaryAnnotator; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class DictionaryAnnotatorTest +{ + @Test + public void test() throws Exception + { + AnalysisEngine ae = createEngine(DictionaryAnnotator.class, + DictionaryAnnotator.PARAM_ANNOTATION_TYPE, NamedEntity.class, + DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); + + JCas jcas = JCasFactory.createJCas(); + TokenBuilder tb = new TokenBuilder<>(Token.class, Sentence.class); + tb.buildTokens(jcas, "I am John Silver 's ghost ."); + + ae.process(jcas); + + NamedEntity ne = selectSingle(jcas, NamedEntity.class); + assertEquals("John Silver", ne.getCoveredText()); + } + + @Test + public void testWithValue() throws Exception + { + AnalysisEngine ae = createEngine(DictionaryAnnotator.class, + DictionaryAnnotator.PARAM_ANNOTATION_TYPE, NamedEntity.class, + DictionaryAnnotator.PARAM_VALUE, "PERSON", + DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); + + JCas jcas = JCasFactory.createJCas(); + TokenBuilder tb = new TokenBuilder<>(Token.class, Sentence.class); + tb.buildTokens(jcas, "I am John Silver 's ghost ."); + + ae.process(jcas); + + NamedEntity ne = selectSingle(jcas, NamedEntity.class); + assertEquals("PERSON", ne.getValue()); + assertEquals("John Silver", ne.getCoveredText()); + } + + @Test + public void testWithWrongType() throws Exception + { + try { + AnalysisEngine ae = createEngine(DictionaryAnnotator.class, + DictionaryAnnotator.PARAM_ANNOTATION_TYPE, "lala", + DictionaryAnnotator.PARAM_VALUE, "PERSON", + DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); + + JCas jcas = JCasFactory.createJCas(); + TokenBuilder tb = new TokenBuilder<>(Token.class, Sentence.class); + tb.buildTokens(jcas, "I am John Silver 's ghost ."); + + ae.process(jcas); + fail("An exception for an undeclared type should have been thrown"); + } + catch (AnalysisEngineProcessException e) { + assertTrue(ExceptionUtils.getRootCauseMessage(e).contains("Undeclared type")); + } + } + + @Test + public void testWithWrongValueFeature() throws Exception + { + try { + AnalysisEngine ae = createEngine(DictionaryAnnotator.class, + DictionaryAnnotator.PARAM_ANNOTATION_TYPE, NamedEntity.class, + DictionaryAnnotator.PARAM_VALUE_FEATURE, "lala", + DictionaryAnnotator.PARAM_VALUE, "PERSON", + DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); + + JCas jcas = JCasFactory.createJCas(); + TokenBuilder tb = new TokenBuilder<>(Token.class, Sentence.class); + tb.buildTokens(jcas, "I am John Silver 's ghost ."); + + ae.process(jcas); + fail("An exception for an undeclared type should have been thrown"); + } + catch (AnalysisEngineProcessException e) { + assertTrue(ExceptionUtils.getRootCauseMessage(e).contains("Undeclared feature")); + } + } + + @Test + public void testMatchesAtEndOfSentence() throws Exception + { + AnalysisEngine ae = createEngine(DictionaryAnnotator.class, + DictionaryAnnotator.PARAM_ANNOTATION_TYPE, NamedEntity.class, + DictionaryAnnotator.PARAM_VALUE, "PERSON", + DictionaryAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/persons.txt"); + + JCas jcas = JCasFactory.createJCas(); + TokenBuilder tb = new TokenBuilder<>(Token.class, Sentence.class); + tb.buildTokens(jcas, "I am John Silver"); + + ae.process(jcas); + + NamedEntity ne = selectSingle(jcas, NamedEntity.class); + assertEquals("PERSON", ne.getValue()); + assertEquals("John Silver", ne.getCoveredText()); + } +} diff --git a/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/PhraseTreeTest.java b/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/PhraseTreeTest.java new file mode 100644 index 0000000000..6de6147305 --- /dev/null +++ b/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/PhraseTreeTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.dictionaryannotator; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import org.dkpro.core.dictionaryannotator.PhraseTree; +import org.junit.Before; +import org.junit.Test; + +public class PhraseTreeTest +{ + private PhraseTree phrases; + + @Before + public void setUp() + throws Exception + { + phrases = new PhraseTree(); + + phrases.addPhrase("the red dog".split(" ")); + phrases.addPhrase("the red".split(" ")); + phrases.addPhrase("the new kid".split(" ")); + phrases.addPhrase("a".split(" ")); + } + + @Test + public void containsTest() + throws Exception + { + assertFalse(phrases.contains("the".split(" "))); + assertFalse(phrases.contains("the new".split(" "))); + assertFalse(phrases.contains("the new BUNNY".split(" "))); + assertFalse(phrases.contains("the red dog barks".split(" "))); + assertTrue(phrases.contains("a".split(" "))); + assertTrue(phrases.contains("the red dog".split(" "))); + assertTrue(phrases.contains("the red".split(" "))); + assertTrue(phrases.contains("the new kid".split(" "))); + } + + @Test + public void matchTest() + throws Exception + { + String[] sentence = "the red dog whines".split(" "); + String[] longestMatch = phrases.getLongestMatch(sentence); + + assertArrayEquals(longestMatch, "the red dog".split(" ")); + + sentence = "the".split(" "); + assertNull(phrases.getLongestMatch(sentence)); + + sentence = "red dog".split(" "); + assertNull(phrases.getLongestMatch(sentence)); + + sentence = "the new".split(" "); + assertNull(phrases.getLongestMatch(sentence)); + } +} diff --git a/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotatorTest.java b/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotatorTest.java new file mode 100644 index 0000000000..66a232797c --- /dev/null +++ b/dkpro-core-dictionaryannotator-asl/src/test/java/org/dkpro/core/dictionaryannotator/semantictagging/SemanticFieldAnnotatorTest.java @@ -0,0 +1,114 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.dictionaryannotator.semantictagging; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField; + +public class SemanticFieldAnnotatorTest +{ + + @Test + public void test() + throws Exception + { + runTest("en", "Vanilla in the sky prefers braveness over jumpiness .", + new String[] { "vanilla", "in", "the", "sky", "prefer", "braveness", "over", + "jumpiness", "." }, + new String[] { "NN", "NOT_RELEVANT", "NOT_RELEVANT", "NN", "NOT_RELEVANT", "NN", + "NOT_RELEVANT", "NN", "$." }, + new String[] { "plant", "object", "attribute", "feeling" }); + + runTest("en", "Vanilla in the distantGalaxyBehindJupiter prefers braveness over jumpiness .", + new String[] { "vanilla", "in", "the", "distantGalaxyBehindJupiter", "prefer", + "braveness", "over", "jumpiness", "." }, + new String[] { "NN", "NOT_RELEVANT", "NOT_RELEVANT", "NN", "NOT_RELEVANT", "NN", + "NOT_RELEVANT", "NN", "$." }, + new String[] { "plant", "UNKNOWN", "attribute", "feeling" }); + } + + private void runTest(String language, String testDocument, String[] documentLemmas, + String[] documentPosTags, String[] documentNounSemanticFields) + throws UIMAException + { + + AnalysisEngineDescription processor = createEngineDescription( + createEngineDescription( + SemanticFieldAnnotator.class, + SemanticFieldAnnotator.PARAM_ANNOTATION_TYPE, Token.class, + SemanticFieldAnnotator.PARAM_CONSTRAINT, ".[pos/posValue = 'NN']", + SemanticFieldAnnotator.RES_SEMANTIC_FIELD_RESOURCE, + createResourceDescription(SemanticTagResource.class, + SemanticTagResource.PARAM_RESOURCE_PATH, + "src/test/resources/nounSemanticFieldMapTest.txt"))); + + AnalysisEngine engine = createEngine(processor); + JCas aJCas = engine.newJCas(); + aJCas.setDocumentLanguage(language); + + TokenBuilder tb = new TokenBuilder(Token.class, + Sentence.class); + tb.buildTokens(aJCas, testDocument); + + int offset = 0; + for (Token token : JCasUtil.select(aJCas, Token.class)) { + + if (documentPosTags[offset].matches("NN")) { + POS_NOUN nn = new POS_NOUN(aJCas, token.getBegin(), token.getEnd()); + nn.setPosValue(documentPosTags[offset]); + nn.addToIndexes(); + token.setPos(nn); + } + else { + POS pos = new POS(aJCas, token.getBegin(), token.getEnd()); + pos.setPosValue(documentPosTags[offset]); + pos.addToIndexes(); + token.setPos(pos); + } + + Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); + lemma.setValue(documentLemmas[offset]); + lemma.addToIndexes(); + token.setLemma(lemma); + + offset++; + } + engine.process(aJCas); + + AssertAnnotations.assertSemanticField(documentNounSemanticFields, + select(aJCas, SemanticField.class)); + } +} diff --git a/dkpro-core-doc/pom.xml b/dkpro-core-doc/pom.xml index 64015cc03e..1a60835509 100644 --- a/dkpro-core-doc/pom.xml +++ b/dkpro-core-doc/pom.xml @@ -18,65 +18,54 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl jar - de.tudarmstadt.ukp.dkpro.core.doc-asl + dkpro-core-doc-asl DKPro Core ASL - Documentation + https://dkpro.github.io/dkpro-core/ true - 2.4.7 - 1.5.4.1 - 1.5.0-alpha.11 - org.dkpro.meta - dkpro-meta-core - 0.1.0 + org.codehaus.groovy + groovy + + + org.codehaus.groovy + groovy-xml org.codehaus.groovy - groovy-all + groovy-templates org.apache.commons commons-lang3 + + org.yaml + snakeyaml + + + org.dkpro.meta + dkpro-meta-core + 0.3.0 + - - - - org.codehaus.groovy - groovy-all - ${groovy.version} - - - org.codehaus.groovy - groovy - ${groovy.version} - - - org.codehaus.groovy - groovy-json - ${groovy.version} - - - org.codehaus.groovy - groovy-templates - ${groovy.version} - - - - org.codehaus.gmavenplus - gmavenplus-plugin - 1.5 + org.codehaus.gmaven + groovy-maven-plugin + 2.1.1 + + compile + generate-documentation @@ -85,12 +74,14 @@ execute - - - + + + @@ -100,15 +91,24 @@ execute - - - + + + + + + org.yaml + snakeyaml + ${snakeyaml.version} + + org.asciidoctor @@ -122,7 +122,6 @@ html5 - coderay user-guide.adoc ./user-guide/images book @@ -143,7 +142,6 @@ pdf - coderay user-guide.adoc ./user-guide/images @@ -161,7 +159,6 @@ html5 - coderay developer-guide.adoc ./developer-guide/images book @@ -181,7 +178,6 @@ html5 - coderay typesystem-reference.adoc ./typesystem-reference/images book @@ -201,7 +197,6 @@ html5 - coderay tagset-reference.adoc ./tagset-reference/images book @@ -221,7 +216,6 @@ html5 - coderay component-reference.adoc ./component-reference/images book @@ -241,7 +235,6 @@ html5 - coderay format-reference.adoc ./format-reference/images book @@ -261,7 +254,6 @@ html5 - coderay dataset-reference.adoc ./dataset-reference/images book @@ -281,7 +273,6 @@ html5 - coderay model-reference.adoc ./model-reference/images book @@ -301,7 +292,6 @@ html5 - coderay language-reference.adoc ./language-reference/images book @@ -321,7 +311,6 @@ org.asciidoctor asciidoctor-maven-plugin - 1.5.3 8 @@ -332,18 +321,6 @@ font - - - org.asciidoctor - asciidoctorj - ${asciidoctor.version} - - - org.asciidoctor - asciidoctorj-pdf - ${asciidoctor.pdf.version} - - org.apache.maven.plugins @@ -371,7 +348,7 @@ org.codehaus.gmavenplus gmavenplus-plugin - 1.5 + 1.6.3 build @@ -380,8 +357,8 @@ addTestSources generateStubs compile - testGenerateStubs - testCompile + generateTestStubs + compileTests removeStubs removeTestStubs @@ -391,7 +368,7 @@ com.bluetrainsoftware.maven groovydoc-maven-plugin - 1.3 + 2.1 attach-docs diff --git a/dkpro-core-doc/src/main/asciidoc/component-reference/sectionIntroSegmenter.adoc b/dkpro-core-doc/src/main/asciidoc/component-reference/sectionIntroSegmenter.adoc index 107feed7ed..2f4561abec 100644 --- a/dkpro-core-doc/src/main/asciidoc/component-reference/sectionIntroSegmenter.adoc +++ b/dkpro-core-doc/src/main/asciidoc/component-reference/sectionIntroSegmenter.adoc @@ -1,3 +1,19 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + Segmenter components identify sentence boundaries and tokens. The order in which sentence splitting and tokenization are done differs between the integrated the NLP libraries. Thus, we chose to integrate both steps into a segmenter component to avoid the need to diff --git a/dkpro-core-doc/src/main/asciidoc/component-reference/sectionIntroTopic_Model.adoc b/dkpro-core-doc/src/main/asciidoc/component-reference/sectionIntroTopic_Model.adoc index 9e07cd00fe..f819600ed9 100644 --- a/dkpro-core-doc/src/main/asciidoc/component-reference/sectionIntroTopic_Model.adoc +++ b/dkpro-core-doc/src/main/asciidoc/component-reference/sectionIntroTopic_Model.adoc @@ -1,3 +1,19 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + Topic modeling is a statistical approach to discover abstract _topics_ in a collection of documents. A topic is characterized by a probability distribution of the words in the document collection. Once a topic model has been generated, it can be used to analyze unseen documents. The result of the diff --git a/dkpro-core-doc/src/main/asciidoc/developer-guide/models.adoc b/dkpro-core-doc/src/main/asciidoc/developer-guide/models.adoc index e13aae7ca5..377addcde4 100644 --- a/dkpro-core-doc/src/main/asciidoc/developer-guide/models.adoc +++ b/dkpro-core-doc/src/main/asciidoc/developer-guide/models.adoc @@ -388,7 +388,7 @@ a model produces, not those that it consumes. | `pos.tagset` | -| `morph.tagset` +| `morph.tagset` | |==== @@ -413,16 +413,16 @@ change in the future. [options="header"] |==== |Entry|Description -| `DC.title` +| `DC.title` | -| `DC.creator` +| `DC.creator` | -| `DC.identifier` +| `DC.identifier` | -| `DC.rights` +| `DC.rights` | |==== @@ -431,16 +431,16 @@ change in the future. [options="header"] |==== |Entry|Description -| `mstparser.param.order` +| `mstparser.param.order` | Used by the MstParser component to indicate the type of model -| `flushSequence` +| `flushSequence` | Used by the TreeTagger components to mark the boundary between two documents. | `pos.tagset.tagSplitPattern` | -| `pos.tag.map.XXX` +| `pos.tag.map.XXX` | |==== diff --git a/dkpro-core-doc/src/main/asciidoc/developer-guide/testing.adoc b/dkpro-core-doc/src/main/asciidoc/developer-guide/testing.adoc index 0a0a398d2c..3f2f79b4a6 100644 --- a/dkpro-core-doc/src/main/asciidoc/developer-guide/testing.adoc +++ b/dkpro-core-doc/src/main/asciidoc/developer-guide/testing.adoc @@ -92,31 +92,39 @@ of annotation supported by DKPro Core, e.g.: == Testing I/O componets -The IOTestRunner class offers convenient methods to test I/O components: +The `ReaderAssert` and `WriterAssert` classes can be used to text I/O components. They allow building +AssertJ-style unit tests with DKPro Core reader and writer components. -* `testRoundTrip` can be used to test converting a format to CAS, converting it back and comparing - it to the original -* `testOneWay` instead is useful to read data and compare it to a reference file in a different - format (e.g. CasDumpWriter format). It can also be used if there a full round-trip is not possible - because some information is lost or cannot be exported exactly as ingested from the original file. +One of the simplest tests is a *round-trip test* where an input file is read using a reader for a +particular format, then written out again using a writer for the same format. -The input file and reference file path given to these methods is always considered relative to -`src/test/resources`. - -.Example using `testRoundTrip` with extra parameters (Conll2006ReaderWriterTest) +.Example of a round-trip test [source,java,indent=0] ---- include::{source-dir}dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006ReaderWriterTest.java[tags=testRoundTrip] ---- -.Example using `testOneWay` with extra parameters (Conll2006ReaderWriterTest) +The reader is set up to reader the test input file. Instead of setting `PARAM_SOURCE_LOCATION`, it is +also possible to set the input location using `readingFrom()`. The writer automatically makes use of +a test output folder provided by a `DkproTestContext` - therefore a target location does not need to +be configured explicitly. + +Assuming the writer produces only a single output file, this file can be accessed for +assertions using `outputAsString()`. If multiple output files are created, an argument can be passed +to that method, e.g. `outputAsString("output.txt")`. This will look for a at the target location whose +name ends in `output.txt`. If there is none or more than one matching file, the test will fail. + +If the original input file is in a different format or cannot be fully reproduced by the writer, +then it is easy to set up a *one way test*, simply by changing the final comparison. The following +example also shows how to specify additional parameters on the reader or writer. + +.Example of a one-way test [source,java,indent=0] ---- include::{source-dir}dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006ReaderWriterTest.java[tags=testOneWay] ---- -.Example using `testRoundTrip` with extra parameters (BratReaderWriterTest) -[source,java,indent=0]] ----- -include::{source-dir}dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratReaderWriterTest.java[tags=testOneWay] ----- +In order to test the ability of readers to read multiple files, the `asJCasList()` method can be used. +While pipelines typically re-use a single CAS which is repeatedly reset and refilled, this method +generates a list of separate CAS instances which can be individually validated after the test. To +access elements of the list use `element(n)`. \ No newline at end of file diff --git a/dkpro-core-doc/src/main/asciidoc/docinfo.html b/dkpro-core-doc/src/main/asciidoc/docinfo.html index 2ec764d524..a5886ccd74 100644 --- a/dkpro-core-doc/src/main/asciidoc/docinfo.html +++ b/dkpro-core-doc/src/main/asciidoc/docinfo.html @@ -1,3 +1,21 @@ + + diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroAclAnthology.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroAclAnthology.adoc index 2d215060fc..71b79e6700 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroAclAnthology.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroAclAnthology.adoc @@ -1,2 +1,18 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Known corpora in this format * link:http://acl-arc.comp.nus.edu.sg[ACL Anthology Reference Corpus (ACL ARC)] diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBinaryCas.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBinaryCas.adoc index f41cd2f529..f57a7c7d65 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBinaryCas.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBinaryCas.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The CAS is the native data model used by UIMA. There are various ways of saving CAS data, using XMI, XCAS, or binary formats. This module supports the binary formats. diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBlikiWikipedia.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBlikiWikipedia.adoc index 88745c9e4d..9396f5a5e8 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBlikiWikipedia.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBlikiWikipedia.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + Access the online Wikipedia and extract its contents using the Bliki engine. .See also diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBnc.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBnc.adoc index ca57d26d8d..ca74368630 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBnc.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBnc.adoc @@ -1,2 +1,18 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Known corpora in this format * link:http://www.natcorp.ox.ac.uk[British National Corpus] diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBrat.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBrat.adoc new file mode 100644 index 0000000000..f5abb7717a --- /dev/null +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroBrat.adoc @@ -0,0 +1,309 @@ +// Copyright 2019 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +This format is the native format of the link:https://brat.nlplab.org[brat rapid annotation tool]. +Its official documentation can be found link:https://brat.nlplab.org/standoff.html[here]. + +In general, the format consists of two files for each document: + +* an `.ann` file containing the annotations. These are the files you need to point the + `PARAM_SOURCE_LOCATION` parameter of the `BratReader` to. +* a plain text file (`.txt`) containing the document text in UTF-8. These files need to be next to + the corresponding `.ann` files and have the same name, just with the `.txt` extension instead of + `.ann` extension. + +The brat format supports different types of annotations which start with different letters in the +`.ann` file: + +.brat annotation types +|==== +| Type | Letter | Comment + +| Text annotations +| `T` +| + +| Event annotations +| `E` +| + +| Relation annotations +| `R` +| + +| Note annotations +| `#` +| + +| Normalization annotations +| `N` +| currently not supported by DKPro Core +|==== + +.Attributes +Additionally, attributes (`A`) can be attached to annotations. Note that DKPro Core supports +attributes on relations, but the brat tool itself can only deal with attributes on text annotations +and events. The `BratReader` will try to store the values of attributes in correspondingly named +features on the target UIMA types. + +.Reading the brat format +The DKPro Core `BratReader` tries its best to map a given brat file into the UIMA type system of the +CAS it is given. Thus, the `BratReader` is not bound strictly to the pre-defined DKPro Core types, +but supports any custom types as well. Since the type names in UIMA are typically long (e.g. +`de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location`) and the names used in brat tend to be short +(e.g. `LOC`), an explicit mapping is usually required. This mapping can be provided as JSON which +needs to be passed to the `PARAM_MAPPINGS` parameter of the `BratReader`. Note that the parameter +takes actual JSON, not the path to a JSON file. + +The mapping file consists of five sections: *text type mapping*, *relation type mapping*, *span mapping*, *relation mapping* and *comment mapping*. + +.Mappings JSON file: high-level structure +[source,json] +---- +{ + 'textTypeMapppings': [ ... ], + 'relationTypeMapppings': [ ... ], + 'spans': [ ... ], + 'relations': [ ... ], + 'comments': [ ... ] +} +---- + +.Type mappings +The *type mappings* (span and relation) indicates how to find the UIMA type for a given brat annotation. Each type mapping contains two mandatory fields: + +* `from`: this field is a regular expression which matches the annotation name used by brat. Note that dashes (`-`) in the brat name must be replaced by dots (`.`) or escaped dots (`\.`) to match here! It is also possible to match multiple brat annotations at once using regular expressions such as `(PER|LOC)` or `.*-LOC`. +* `to`: this is the UIMA type to map to. + +The order of the mappings matter - brat annotations are matched to them in the order they are +defined in the mappings file. This allows e.g. to put a *catch-all* mapping at the end with +`'from': '.*'` which would match all brat annotations not matched by a previous mapping. + +.Mapping text annotations +For the purpose of mapping, brat event (`E`) and text (`T`) annotations are both considered +*text type annotations*. + +.Example: Mapping brat text-type annotations to UIMA types +[source,json] +---- +{ + 'textTypeMapppings': [ + { + 'from': 'LOC', + 'to': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location' + }, + { + 'from': 'PER', + 'to': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person' + }, + ... + ], + ... +} +---- + +In addition to the `textTypeMapppings` section, there is a `spans` section. This can be used +to further configure any annotations of a given UIMA type that are created by the reader. In +addition to the `defaultFeatureValues` (see futher below) option, there is the option to store original +brat annotation name in a feature indicated by `subCatFeature`. The example below stores the name of +the brat annotation into the `value` feature of the `NamedEntity` type. + +.Example: Span mappings +[source,json] +---- +{ + 'spans': [ + {", + 'type': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity', + 'subCatFeature': 'value', + 'defaultFeatureValues': { + 'identity': 'none' + }", + }", + ... + ], + ... +} +---- + +.Mapping events +Event annotations (`E`) from brat are basically treated like text annotations (`T`). However, events +can have multiple arguments in brat and these arguments point to other annotations. The `BratReader` +will try to store these argument values in the target UIMA type in corresponding feature values. + +For example if the brat file contains an event annotation as shown below, the target UIMA type for +the brat `pred` annotation should have a feature `subject` and a feature `object` which would be able +to accept the type of annotation to which the brat `entity` annotation is mapped. + +[source,brat] +---- +T1 pred 5 10 likes +T2 entity 0 4 John +T3 entity 11 16 pizza +E1 pred:T1 subject:T2 object:T3 +---- + +.Mapping relations +Relation annotations can be mapped in the same way. + +.Example: Mapping brat relation annotations to UIMA types +[source,json] +---- +{ + 'relationTypeMapppings': [ + { + 'from': 'nsubj|obj|iobj', + 'to': 'de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency' + }, + ... + ], + ... +} +---- + +In addition to the `textTypeMapppings` section, there is a `relations` section. Here, the features used +to represent the relation end points can be configured. The example matches all brat relation +annotations which have been mapped to the `Dependency` UIMA type. The first argument from the brat +relation is mapped to the `source` feature while the second argument is mapped to the `target` +feature. The option `flags1` or `flags2` can be set to `A` to indicate that either the offsets of the +first or second argument are used as the offsets of the created UIMA annotation. Also, the +`subCatFeature` and `defaultFeatureValues` already mentioned for the span mappings are supported. + +.Example: Mapping brat relation annotations to UIMA types +[source,json] +---- +{ + 'relations': [ + { + 'type': 'de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency', + 'arg1': 'source', + 'arg2': 'target', + 'flags2': 'A', + 'subCatFeature': 'DependencyType', + 'defaultFeatureValues': { + 'flavour': 'basic' + } + }, + ... + ], + ... +} +---- + +.Mapping brat comments to UIMA +The *comment* field of annotations is the only free text field in brat (all others have a +controlled vocabulary). Sometimes the field is indeed used for comments. But sometimes, the +field is also used to store actual tags. In order to map comments to UIMA, a `comments` +section needs to be added to the mapping file. A comment mapping then consists of these +items: + +* `type`: the name of a UIMA type to which the brat annotation was matched. +* `feature`: the feature of the UIMA type where the comment value should be stored +* `match` (optional): a regular expression indicating when to use this mapping rule. +* `replace` (optional): can be used to modify the value stores in the UIMA feature. If the + `match` field includes capturing groups in its regular expression, these can be accessed + here e.g. using `$1`. This can be used to normalize values. + +Mind that the same type can appear multiple times if the comment field should be mapped +to different features depending on the comment value. The example below maps the comment +value to the `value` feature if the comment is `PER`, `LOC`, `ORG` or `MISC`. However, if +the value field is a URL, then the comment is mapped into the `identifier` feature. + +.Example: Mapping brat relation annotations to UIMA types +[source,json] +---- +{ + 'comments': [ + { + 'type': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity', + 'feature': 'value', + 'match': '^(PER|LOC|ORG|MISC)$', + }, + { + 'type': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity', + 'feature': 'identifier', + 'match': '^http://.*$' + }, + ... + ], + ... +} +---- + + +.Default feature values (text-type and relation annotations) +It may be desirable to set certain UIMA features as part of the conversion. E.g. when +reading dependency relation annotations, it may be useful to set the `flavour` feature +of the DKPro Core `Dependency` type to `basic`. This can be done by adding a +`defaultFeatureValues` section to the mapping. + + +.Example: Default feature values +[source,json] +---- +{ + 'relationTypeMapppings': [ + { + 'from': 'nsubj|obj|iobj', + 'to': 'de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency', + 'defaultFeatureValues': { + 'flavour': 'basic' + } + }, + ... + ], + ... +} +---- + +Another use-case of default feature values is if the brat annotation label is actually a +concatenation of multiple tags which should be split up into multiple features at the UIMA +level: + +.Example: Multiple default feature values +[source,json] +---- +{ + 'textTypeMapppings': [ + { + 'from': 'top-left', + 'to': 'custom.Direction', + 'defaultFeatureValues': { + 'horizontal': 'left', + 'vertical': 'top' + } + }, + { + 'from': 'bottom-right', + 'to': 'custom.Direction', + 'defaultFeatureValues': { + 'horizontal': 'right', + 'vertical': 'bottom' + } + }, + ... + ], + ... +} +---- + +.Segmentation +Note that the brat annotation format does not have a built-in concept of token or sentence +boundaries. So unless these are explicitly annotated in the brat file and mapped to the DKPro Core +`Token` and `Sentence` types, there will not be any such annotations available. If you apply a +segmenter component (e.g. the DKPro Core `BreakIteratorSegmenter`) to the output of the reader you +will get token and sentence boundaries, but they *might* not coincide with the annotations boundaries +read from the brat file. Your mileage may vary. diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2000.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2000.adoc index f668e23d60..76ebfd3210 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2000.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2000.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The CoNLL 2000 format represents POS and Chunk tags. Fields in a line are separated by spaces. Sentences are separated by a blank new line. @@ -9,34 +25,34 @@ Sentences are separated by a blank new line. | Token | token -| POSTAG  +| POSTAG | POS | part-of-speech tag -| CHUNK +| CHUNK | Chunk | chunk (IOB1 encoded) |==== .Example -[source,text] +[source,text,tabsize=0] ---- -He PRP B-NP -reckons VBZ B-VP -the DT B-NP -current JJ I-NP -account NN I-NP -deficit NN I-NP -will MD B-VP -narrow VB I-VP -to TO B-PP -only RB B-NP -# # I-NP -1.8 CD I-NP -billion CD I-NP -in IN B-PP -September NNP B-NP -. . O +He PRP B-NP +reckons VBZ B-VP +the DT B-NP +current JJ I-NP +account NN I-NP +deficit NN I-NP +will MD B-VP +narrow VB I-VP +to TO B-PP +only RB B-NP +# # I-NP +1.8 CD I-NP +billion CD I-NP +in IN B-PP +September NNP B-NP +. . O ---- .Known corpora in this format diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2002.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2002.adoc index 1f34dc06f7..2fffe77920 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2002.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2002.adoc @@ -1,10 +1,26 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The CoNLL 2002 format encodes named entity spans. Fields are separated by a single space. Sentences are separated by a blank new line. .Columns [cols="1,2,3", options="header"] |==== -| Column | Type/Feature | Description +| Column | Type/Feature | Description | FORM | Token | Word form or punctuation symbol. @@ -15,36 +31,33 @@ Sentences are separated by a blank new line. |==== .Example -[source,text] +[source,text,tabsize=0] ---- -Wolff B-PER -, O -currently O -a O +Wolff B-PER +, O +currently O +a O journalist O -in O -Argentina B-LOC -, O -played O -with O -Del B-PER -Bosque I-PER -in O -the O -final O -years O -of O -the O -seventies O -in O -Real B-ORG -Madrid I-ORG -. O +in O +Argentina B-LOC +, O +played O +with O +Del B-PER +Bosque I-PER +in O +the O +final O +years O +of O +the O +seventies O +in O +Real B-ORG +Madrid I-ORG +. O ---- -NOTE: For readability, the columns in the example above are aligned. In actual files, there is only - a single space separating the fields in each line. - .Known corpora in this format [cols="2*", options="header"] |==== diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2003.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2003.adoc index b3ed6629a9..6916c21165 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2003.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2003.adoc @@ -1,4 +1,20 @@ -The CoNLL 2004 format encodes named entity spans and chunk spans. Fields are separated by a single +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +The CoNLL 2003 format encodes named entity spans and chunk spans. Fields are separated by a single space. Sentences are separated by a blank new line. Named entities and chunks are encoded in the IOB1 format. I.e. a `B` prefix is only used if the category of the following span differs from the category of the current span. @@ -6,7 +22,7 @@ category of the current span. .Columns [cols="1,2,3", options="header"] |==== -| Column | Type/Feature | Description +| Column | Type/Feature | Description | FORM | Token | Word form or punctuation symbol. @@ -21,20 +37,17 @@ category of the current span. |==== .Example -[source,text] +[source,text,tabsize=0] ---- -U.N. NNP I-NP I-ORG -official NN I-NP O -Ekeus NNP I-NP I-PER -heads VBZ I-VP O -for IN I-PP O -Baghdad NNP I-NP I-LOC -. . O O +U.N. NNP I-NP I-ORG +official NN I-NP O +Ekeus NNP I-NP I-PER +heads VBZ I-VP O +for IN I-PP O +Baghdad NNP I-NP I-LOC +. . O O ---- -NOTE: For readability, the columns in the example above are aligned. In actual files, there is only - a single space separating the fields in each line. - .Known corpora in this format [cols="2*", options="header"] |==== diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2006.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2006.adoc index be77859995..751dd9882b 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2006.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2006.adoc @@ -1,9 +1,25 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The CoNLL 2006 (aka CoNLL-X) format targets dependency parsing. Columns are tab-separated. Sentences are separated by a blank new line. .Columns [cols="1,2,3", options="header"] |==== -| Column | Type/Feature | Description +| Column | Type/Feature | Description | ID | ignored @@ -47,9 +63,9 @@ The CoNLL 2006 (aka CoNLL-X) format targets dependency parsing. Columns are tab- |==== .Example -[source,text] +[source,text,tabsize=0] ---- -Heutzutage heutzutage ADV _ _ ADV _ _ +Heutzutage heutzutage ADV _ _ ADV _ _ ---- .Known corpora in this format @@ -58,13 +74,10 @@ Heutzutage heutzutage ADV _ _ ADV _ _ | Corpus | Language -| link:http://ilk.uvt.nl/conll/free_data.html[CoNLL-X Shared Task free data] -| Danish, Dutch, Portuguese, and Swedish - -| link:https://code.google.com/p/copenhagen-dependency-treebank/[Copenhagen Dependency Treebanks] +| link:http://mbkromann.github.io/copenhagen-dependency-treebank/[Copenhagen Dependency Treebanks] | Danish -| link:http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/[FinnTreeBank] (in recent versions with additional pseudo-XML metadata) +| link:http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/index-print.shtml[FinnTreeBank] (in recent versions with additional pseudo-XML metadata) | Finnish | link:http://www.linguateca.pt/floresta/CoNLL-X[Floresta Sintá(c)tica (Bosque-CoNLL)] diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2008.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2008.adoc index e406094c9b..ba0dba6446 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2008.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2008.adoc @@ -1,9 +1,25 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The CoNLL 2008 format targets syntactic and semantic dependencies. Columns are tab-separated. Sentences are separated by a blank new line. .Columns [cols="1,2,3", options="header"] |==== -| Column | Type/Feature | Description +| Column | Type/Feature | Description | ID | ignored @@ -55,27 +71,27 @@ The CoNLL 2008 format targets syntactic and semantic dependencies. Columns are t |==== .Example -[source,text] +[source,text,tabsize=0] ---- -1 Some some DT _ Some some DT 10 SBJ _ _ _ _ A1 _ _ _ -2 of of IN _ of of IN 1 NMOD _ _ _ _ _ _ _ _ -3 the the DT _ the the DT 5 NMOD _ _ _ _ _ _ _ _ -4 strongest strongest JJS _ strongest strong JJS 5 NMOD _ _ _ _ _ _ _ _ -5 critics critics NNS _ critics critic NNS 2 PMOD critic.01 A0 _ _ _ _ _ _ -6 of of IN _ of of IN 5 NMOD _ A1 _ _ _ _ _ _ -7 our our PRP$ _ our our PRP$ 9 NMOD _ _ A1 A0 _ _ _ _ -8 welfare welfare NN _ welfare welfare NN 9 NMOD welfare.01 _ A2 _ _ _ _ _ -9 system system NN _ system system NN 6 PMOD system.01 _ _ _ _ _ _ _ -10 are are VBP _ are be VBP 0 ROOT be.01 _ _ _ _ _ _ _ -11 the the DT _ the the DT 12 NMOD _ _ _ _ _ _ _ _ -12 people people NNS _ people people NNS 10 PRD person.02 _ _ _ A2 A0 A0 A1 -13 who who WP _ who who WP 14 SBJ _ _ _ _ _ _ _ _ -14 have have VBP _ have have VBP 12 NMOD have.04 _ _ _ _ SU _ _ -15 become become VBN _ become become VBN 14 VC become.01 _ _ _ _ A1 A1 _ -16 dependent dependent JJ _ dependent dependent JJ 15 PRD _ _ _ _ _ _ _ _ -17 on on IN _ on on IN 16 AMOD _ _ _ _ _ _ _ _ -18 it it PRP _ it it PRP 17 PMOD _ _ _ _ _ _ _ _ -19 . . . _ . . . 10 P _ _ _ _ _ _ _ _ +1 Some some DT _ Some some DT 10 SBJ _ _ _ _ A1 _ _ _ +2 of of IN _ of of IN 1 NMOD _ _ _ _ _ _ _ _ +3 the the DT _ the the DT 5 NMOD _ _ _ _ _ _ _ _ +4 strongest strongest JJS _ strongest strong JJS 5 NMOD _ _ _ _ _ _ _ _ +5 critics critics NNS _ critics critic NNS 2 PMOD critic.01 A0 _ _ _ _ _ _ +6 of of IN _ of of IN 5 NMOD _ A1 _ _ _ _ _ _ +7 our our PRP$ _ our our PRP$ 9 NMOD _ _ A1 A0 _ _ _ _ +8 welfare welfare NN _ welfare welfare NN 9 NMOD welfare.01 _ A2 _ _ _ _ _ +9 system system NN _ system system NN 6 PMOD system.01 _ _ _ _ _ _ _ +10 are are VBP _ are be VBP 0 ROOT be.01 _ _ _ _ _ _ _ +11 the the DT _ the the DT 12 NMOD _ _ _ _ _ _ _ _ +12 people people NNS _ people people NNS 10 PRD person.02 _ _ _ A2 A0 A0 A1 +13 who who WP _ who who WP 14 SBJ _ _ _ _ _ _ _ _ +14 have have VBP _ have have VBP 12 NMOD have.04 _ _ _ _ SU _ _ +15 become become VBN _ become become VBN 14 VC become.01 _ _ _ _ A1 A1 _ +16 dependent dependent JJ _ dependent dependent JJ 15 PRD _ _ _ _ _ _ _ _ +17 on on IN _ on on IN 16 AMOD _ _ _ _ _ _ _ _ +18 it it PRP _ it it PRP 17 PMOD _ _ _ _ _ _ _ _ +19 . . . _ . . . 10 P _ _ _ _ _ _ _ _ ---- .Known corpora in this format diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2009.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2009.adoc index dae0645204..2bd67fa414 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2009.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2009.adoc @@ -1,9 +1,25 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The CoNLL 2009 format targets semantic role labeling. Columns are tab-separated. Sentences are separated by a blank new line. .Columns [cols="1,2,3", options="header"] |==== -| Column | Type/Feature | Description +| Column | Type/Feature | Description | ID | ignored @@ -70,23 +86,23 @@ The CoNLL 2009 format targets semantic role labeling. Columns are tab-separated. |==== .Example -[source,text] +[source,text,tabsize=0] ---- -1 The the the DT DT _ _ 4 4 NMOD NMOD _ _ _ _ -2 most most most RBS RBS _ _ 3 3 AMOD AMOD _ _ _ _ -3 troublesome troublesome troublesome JJ JJ _ _ 4 4 NMOD NMOD _ _ _ _ -4 report report report NN NN _ _ 5 5 SBJ SBJ _ _ _ _ -5 may may may MD MD _ _ 0 0 ROOT ROOT _ _ _ _ -6 be be be VB VB _ _ 5 5 VC VC _ _ _ _ -7 the the the DT DT _ _ 11 11 NMOD NMOD _ _ _ _ -8 August august august NNP NNP _ _ 11 11 NMOD NMOD _ _ _ AM-TMP -9 merchandise merchandise merchandise NN NN _ _ 10 10 NMOD NMOD _ _ A1 _ -10 trade trade trade NN NN _ _ 11 11 NMOD NMOD Y trade.01 _ A1 -11 deficit deficit deficit NN NN _ _ 6 6 PRD PRD Y deficit.01 _ A2 -12 due due due JJ JJ _ _ 13 11 AMOD APPO _ _ _ _ -13 out out out IN IN _ _ 11 12 APPO AMOD _ _ _ _ -14 tomorrow tomorrow tomorrow NN NN _ _ 13 12 TMP TMP _ _ _ _ -15 . . . . . _ _ 5 5 P P _ _ _ _ +1 The the the DT DT _ _ 4 4 NMOD NMOD _ _ _ _ +2 most most most RBS RBS _ _ 3 3 AMOD AMOD _ _ _ _ +3 troublesome troublesome troublesome JJ JJ _ _ 4 4 NMOD NMOD _ _ _ _ +4 report report report NN NN _ _ 5 5 SBJ SBJ _ _ _ _ +5 may may may MD MD _ _ 0 0 ROOT ROOT _ _ _ _ +6 be be be VB VB _ _ 5 5 VC VC _ _ _ _ +7 the the the DT DT _ _ 11 11 NMOD NMOD _ _ _ _ +8 August august august NNP NNP _ _ 11 11 NMOD NMOD _ _ _ AM-TMP +9 merchandise merchandise merchandise NN NN _ _ 10 10 NMOD NMOD _ _ A1 _ +10 trade trade trade NN NN _ _ 11 11 NMOD NMOD Y trade.01 _ A1 +11 deficit deficit deficit NN NN _ _ 6 6 PRD PRD Y deficit.01 _ A2 +12 due due due JJ JJ _ _ 13 11 AMOD APPO _ _ _ _ +13 out out out IN IN _ _ 11 12 APPO AMOD _ _ _ _ +14 tomorrow tomorrow tomorrow NN NN _ _ 13 12 TMP TMP _ _ _ _ +15 . . . . . _ _ 5 5 P P _ _ _ _ ---- .Known corpora in this format diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2012.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2012.adoc index 38a699a27b..3f93f9e6ad 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2012.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConll2012.adoc @@ -1,9 +1,36 @@ -The CoNLL 2012 format targets semantic role labeling and coreference. Columns are tab-separated. Sentences are separated by a blank new line. +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +The CoNLL 2012 format targets semantic role labeling and coreference. Columns are whitespace-separated (tabs or spaces). Sentences are separated by a blank new line. + +Note that this format cannot deal with the following situations: +* An annotation has no label (e.g. a `SemPred` annotation has no category) - in such a case `null` is + written into the corresponding column. However, the reader will actually read this value as the + label. +* If a `SemPred` annotation is at the same position as a `SemArg` annotation linked to it, then only + the `(V*)` representing the `SemPred` annotation will be written. +* `SemPred` annotations spanning more than one token are not supported +* If there are multiple `SemPred` annotations on the same token, then only one of them is written. + This is because the `category` of the `SemPred` annotation goes to the **Predicate Frameset ID** + and that can only hold one value which. .Columns [cols="1,2,3", options="header"] |==== -| Column | Type/Feature | Description +| Column | Type/Feature | Description | Document ID | ignored @@ -11,11 +38,11 @@ The CoNLL 2012 format targets semantic role labeling and coreference. Columns ar | Part number | ignored -| Some files are divided into multiple parts numbered as 000, 001, 002, ... etc. +| Some files are divided into multiple parts numbered as 000, 001, 002, ... etc. | Word number | ignored -| +| | Word itself | document text @@ -27,11 +54,11 @@ The CoNLL 2012 format targets semantic role labeling and coreference. Columns ar | Parse bit | Constituent -| This is the bracketed structure broken before the first open parenthesis in the parse, and the word/part-of-speech leaf replaced with a `*`. The full parse can be created by substituting the asterix with the `([pos] [word])` string (or leaf) and concatenating the items in the rows of that column. +| This is the bracketed structure broken before the first open parenthesis in the parse, and the word/part-of-speech leaf replaced with a `*`. The full parse can be created by substituting the asterisk with the `([pos] [word])` string (or leaf) and concatenating the items in the rows of that column. | Predicate lemma | Lemma -| The predicate lemma is mentioned for the rows for which we have semantic role information. All other rows are marked with a "-". +| The predicate lemma is mentioned for the rows for which we have semantic role information. All other rows are marked with a `-`. | Predicate Frameset ID | SemPred @@ -59,12 +86,12 @@ The CoNLL 2012 format targets semantic role labeling and coreference. Columns ar |==== .Example -[source,text] +[source,text,tabsize=0] ---- -en-orig.conll 0 0 John NNP (TOP(S(NP*) john - - - (PERSON) (A0) (1) -en-orig.conll 0 1 went VBD (VP* go go.02 - - * (V*) - -en-orig.conll 0 2 to TO (PP* to - - - * * - -en-orig.conll 0 3 the DT (NP* the - - - * * (2 -en-orig.conll 0 4 market NN *))) market - - - * (A1) 2) -en-orig.conll 0 5 . . *)) . - - - * * - +en-orig.conll 0 0 John NNP (TOP(S(NP*) john - - - (PERSON) (A0) (1) +en-orig.conll 0 1 went VBD (VP* go go.02 - - * (V*) - +en-orig.conll 0 2 to TO (PP* to - - - * * - +en-orig.conll 0 3 the DT (NP* the - - - * * (2 +en-orig.conll 0 4 market NN *))) market - - - * (A1) 2) +en-orig.conll 0 5 . . *)) . - - - * * - ---- diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConllCoreNlp.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConllCoreNlp.adoc new file mode 100644 index 0000000000..12aa4aba2d --- /dev/null +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConllCoreNlp.adoc @@ -0,0 +1,66 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +The CoreNLP CoNLL format is used by the Stanford CoreNLP package. Columns are tab-separated. +Sentences are separated by a blank new line. + +.Columns +[cols="1,2,3", options="header"] +|==== +| Column | Type/Feature | Description + +| ID +| ignored +| Token counter, starting at 1 for each new sentence. + +| FORM +| Token +| Word form or punctuation symbol. + +| LEMMA +| Lemma +| Lemma of the word form. + +| POSTAG +| POS PosValue +| Fine-grained part-of-speech tag, where the tagset depends on the language, or identical to the coarse-grained part-of-speech tag if not available. + +| NER +| NamedEntity +| Named Entity tag, or underscore if not available. If a named entity covers multiple tokens, all +of the tokens simply carry the same label without (no sequence encoding). + +| HEAD +| Dependency +| Head of the current token, which is either a value of ID or zero ('0'). Note that depending on the original treebank annotation, there may be multiple tokens with an ID of zero. + +| DEPREL +| Dependency +| Dependency relation to the HEAD. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningful or simply 'ROOT'. +|==== + +.Example +[source,text,tabsize=0] +---- +1 Selectum Selectum NNP O _ _ +2 , , , O _ _ +3 Société Société NNP O _ _ +4 d'Investissement d'Investissement NNP O _ _ +5 à à NNP O _ _ +6 Capital Capital NNP O _ _ +7 Variable Variable NNP O _ _ +8 . . . O _ _ +---- diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConllU.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConllU.adoc index aa70edd909..bacad80af5 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConllU.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroConllU.adoc @@ -1,13 +1,30 @@ -The CoNLL 2012 format targets semantic role labeling and coreference. Columns are tab-separated. Sentences are separated by a blank new line. +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +The CoNLL-U format format targets dependency parsing. Columns are tab-separated. Sentences are +separated by a blank new line. .Columns [cols="1,2,3", options="header"] |==== -| Column | Type/Feature | Description +| Column | Type/Feature | Description | ID | ignored -| Word index, integer starting at 1 for each new sentence; may be a range for tokens with multiple words. +| Word index, integer starting at 1 for each new sentence; may be a range for tokens with multiple words. | FORM | Token @@ -47,14 +64,14 @@ The CoNLL 2012 format targets semantic role labeling and coreference. Columns ar |==== .Example -[source,text] +[source,text,tabsize=0] ---- -1 They they PRON PRN Case=Nom|Number=Plur 2 nsubj 4:nsubj _ -2 buy buy VERB VB Number=Plur|Person=3|Tense=Pres 0 root _ _ -3 and and CONJ CC _ 2 cc _ _ -4 sell sell VERB VB Number=Plur|Person=3|Tense=Pres 2 conj 0:root _ -5 books book NOUN NNS Number=Plur 2 dobj 4:dobj SpaceAfter=No -6 . . PUNCT . _ 2 punct _ _ +1 They they PRON PRN Case=Nom|Number=Plur 2 nsubj 4:nsubj _ +2 buy buy VERB VB Number=Plur|Person=3|Tense=Pres 0 root _ _ +3 and and CONJ CC _ 2 cc _ _ +4 sell sell VERB VB Number=Plur|Person=3|Tense=Pres 2 conj 0:root _ +5 books book NOUN NNS Number=Plur 2 dobj 4:dobj SpaceAfter=No +6 . . PUNCT . _ 2 punct _ _ ---- .Known corpora in this format diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroImsCwb.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroImsCwb.adoc index a763059391..1ac17b1b15 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroImsCwb.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroImsCwb.adoc @@ -1,9 +1,65 @@ -The IMS Open Corpus Workbench is a linguistic search engine. It uses a tab-separated format +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +The "verticalized XML" format used by the link:http://cwb.sourceforge.net[IMS Open Corpus Workbench], a linguistic search engine. It uses a tab-separated format with limited markup (e.g. for sentences, documents, but not recursive structures like -parse-trees). If a local installation of the corpus workbench is available, it can be used +parse-trees). In principle, it is a generic format - i.e. there can be arbitrary columns, pseudo-XML elements and attributes. However, support is limited to a specific set of columns that must appear exactly in a specific order: *token text*, *part-of-speech tag*, *lemma*. Also only specific pseudo-XML elements and attributes are supported: `text` (including an `id` attribute), `s`. + +If a local installation of the corpus workbench is available, it can be used by this module to immediately generate the corpus workbench index format. Search is not supported by this module. +.Example +[source,text,tabsize=0] +---- + + +Nikita NE Nikita +( $( ( +La FM La +Femme NN Femme +Nikita NE Nikita +) $( ) +Dieser PDS dies +Episodenführer NN Episodenführer +wurde VAFIN werden +von APPR von +September NN September +1998 CARD 1998 +bis APPR bis +Mai NN Mai +1999 CARD 1999 +von APPR von +Konstantin NE Konstantin +C.W. NE C.W. +Volkmann NE Volkmann +geschrieben VVPP schreiben +und KON und +im APPRART im +Mai NN Mai +2000 CARD 2000 +von APPR von +Stefan NE Stefan +Börzel NN Börzel +übernommen VVPP übernehmen +. $. . + + +---- + .See also * link:http://cwb.sourceforge.net[IMS Open Corpus Workbench] diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroLif.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroLif.adoc new file mode 100644 index 0000000000..8c43d55850 --- /dev/null +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroLif.adoc @@ -0,0 +1,38 @@ +// Copyright 2019 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +The the link:https://wiki.lappsgrid.org/interchange/[LAPPS Interchange Format] (LIF) is a JSON-based format which is used by the link:http://www.lappsgrid.org[Language Applications Grid]. The the format is in principle generic, the support for it is based on the link:http://vocab.lappsgrid.org[LAPPS Web Service Exchange Vocabulary]. + +.Example +[source,text] +---- +{ + "id": "v2", + "metadata": { + "contains": { + "Token": { + "producer": "org.anc.lapps.stanford.SATokenizer:1.4.0", + "type": "tokenization:stanford" }, + "Token#pos": { + "producer": "org.anc.lapps.stanford.SATagger:1.4.0", + "posTagSet": "penn", + "type": "postagging:stanford" }}}, + "annotations": [ + { "@type": "Token", "id": "tok0", "start": 0, "end": 4, "features": { "pos": "NNP" } }, + { "@type": "Token", "id": "tok1", "start": 5, "end": 10, "features": { "pos": "VBZ" } }, + { "@type": "Token", "id": "tok2", "start": 10, "end": 11, "features": { "pos": "." } } ] +} +---- diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroNegra.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroNegra.adoc index 2b797d4be0..3a05783072 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroNegra.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroNegra.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .See also * link:http://www.coli.uni-saarland.de/%7Ethorsten/publications/Brants-CLAUS98.pdf[Thorsten Brants, 1997, NeGra Export Format for Annotated Corpora (Version 3)] diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroNif.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroNif.adoc index 014ef3851e..90a07fbabd 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroNif.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroNif.adoc @@ -1,5 +1,59 @@ -The NLP Interchange Format (NIF) provides a way of representing NLP information using semantic web -technology, specifically RDF and OWL. +// Copyright 2019 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +The link:https://persistence.uni-leipzig.org/nlp2rdf/[NLP Interchange Format] (NIF) provides a way of representing NLP information using semantic web technology, specifically RDF and OWL. A few additions of the format were defined in the apparently in-official link:https://nif.readthedocs.io/en/latest/[NIF 2.1] specification. + +.Example +[source,text] +---- +@prefix rdfs: . +@prefix nif: . +@prefix itsrdf: . +@prefix xsd: . +@prefix rdf: . + + + a nif:RFC5147String , nif:String , nif:Context ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "86"^^xsd:nonNegativeInteger ; + nif:isString "Japan (Japanese: 日本 Nippon or Nihon) is a stratovolcanic archipelago of 6,852 islands."^^xsd:string ; + nif:topic . + + + a nif:RFC5147String , nif:String ; + nif:anchorOf "Japan"^^xsd:string ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "5"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taClassRef , ; + itsrdf:taIdentRef . + + + a nif:RFC5147String , nif:String ; + nif:anchorOf "stratovolcanic archipelago"^^xsd:string ; + nif:beginIndex "42"^^xsd:nonNegativeInteger ; + nif:endIndex "68"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taClassRef , rdfs:Class ; + itsrdf:taIdentRef . + + + a nif:Annotation ; + itsrdf:taIdentRef . +---- .Known corpora in this format * link:https://datahub.io/dataset/kore-50-nif-ner-corpus[KORE 50 NIF NER Corpus] diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroPennTreebankCombined.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroPennTreebankCombined.adoc index 9fe658212a..87cd841baf 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroPennTreebankCombined.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroPennTreebankCombined.adoc @@ -1,2 +1,18 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Known corpora in this format * link:http://www.linguateca.pt/floresta/corpus.html[Floresta Sintá(c)tica (Bosque)] - Portuguese diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroPerseus.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroPerseus.adoc new file mode 100644 index 0000000000..1758544c6f --- /dev/null +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroPerseus.adoc @@ -0,0 +1,40 @@ +// Copyright 2019 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +An XML format used by the link:https://github.com/PerseusDL/treebank_data/tree/master/v2.1[Perseus Ancient Greek and Latin Dependency Treebank]. + +.Example (excerpt from link:https://github.com/PerseusDL/treebank_data/blob/master/v2.1/Greek/texts/tlg0013.tlg002.perseus-grc1.tb.xml[tlg0013.tlg002.perseus-grc1.tb.xml]) +[source,text] +---- + + + + + + + + + + + + + + + + + + +---- diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTGrep.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTGrep.adoc index 01041410fe..352bcfb4d5 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTGrep.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTGrep.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + TGrep and TGrep2 are a tools to search over syntactic parse trees represented as bracketed structures. This module supports in particular TGrep2 and allows to conveniently generate TGrep2 indexes which can then be searched. Search is not supported by this diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTcf.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTcf.adoc index e1d9721b86..232f63013d 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTcf.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTcf.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The TCF (Text Corpus Format) was created in the context of the CLARIN project. It is mainly used to exchange data between the different web-services that are part of the WebLicht platform. diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTei.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTei.adoc index 7791d0c02f..9b57c0cef7 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTei.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTei.adoc @@ -1,3 +1,21 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +The link:https://tei-c.org/guidelines/p5/[TEI P5 XML] format is a widely used standard format. It is a very complex format and furthermore is often extended for specific corpora. The reader and writer components offered by DKPro Core support various common element types, but by far not all. + .Known corpora in this format * link:http://nltk.org/nltk_data/[Brown Corpus (TEI XML Version)] * link:http://www.textgrid.de/Digitale-Bibliothek[Digitale Bibliothek bei TextGrid] diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTigerXml.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTigerXml.adoc index ebf2a9840a..e2a3a8c233 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTigerXml.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTigerXml.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The link:http://www.ims.uni-stuttgart.de/forschung/ressourcen/werkzeuge/TIGERSearch/doc/html/TigerXML.html[TIGER XML format] was created for encoding syntactic constituency structures in the German TIGER corpus. It has since been used for many other corpora as well. link:http://www.ims.uni-stuttgart.de/forschung/ressourcen/werkzeuge/tigersearch.html[TIGERSearch] is a linguistic search engine specifically targetting this diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTuebaDZ.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTuebaDZ.adoc index 2b33af961f..58f51e9e73 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTuebaDZ.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTuebaDZ.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The TüBa-D/Z treebank is a syntactically annotated German newspaper corpus based on data taken from the daily issues of 'die tageszeitung' (taz). @@ -6,7 +22,7 @@ Sentences have a header line and are followed by a blank new line. .Columns [cols="1,2,3", options="header"] |==== -| Column | Type/Feature | Description +| Column | Type/Feature | Description | FORM | Token diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTuepp.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTuepp.adoc index 401ca111c9..4f1ae239c5 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTuepp.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroTuepp.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + TüPP D/Z is a collection of articles from the German newspaper taz (die tageszeitung) annotated and encoded in a XML format. diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroWeb1T.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroWeb1T.adoc index f2f9ee6c91..0151c525c6 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroWeb1T.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroWeb1T.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + The Web1T n-gram corpus is a huge collection of n-grams collected from the internet. The jweb1t library allows to access this corpus efficiently. This module provides support for the file format used by the Web1T n-gram corpus and allows to conveniently created diff --git a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroXmi.adoc b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroXmi.adoc index 925fa20880..9d4a5b5c17 100644 --- a/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroXmi.adoc +++ b/dkpro-core-doc/src/main/asciidoc/format-reference/sectionIntroXmi.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + One of the official formats supported by UIMA is the XMI format. It is an XML-based format that does not support a few very specific characters which are invalid in XML. But it is able to capture all the information contained in the CAS. The XMI format is the de-facto standard for exchanging data @@ -6,7 +22,7 @@ in the UIMA world. Most UIMA-related tools support it. The XMI format does not include type system information. It is therefore recommended to always configure the XmiWriter component to also write out the type system to a file. -If you with to view anntated documents using the UIMA CAS Editor in Eclipse, you can e.g. set up +If you with to view annotated documents using the UIMA CAS Editor in Eclipse, you can e.g. set up your XmiWriter in the following way to write out XMIs and a type system file: [source,java] diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/images/ts_xml.png b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/images/ts_xml.png new file mode 100644 index 0000000000..4ed232ecfc Binary files /dev/null and b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/images/ts_xml.png differ diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroCoreference.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroCoreference.adoc index e7327b6391..adf8f9be57 100644 --- a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroCoreference.adoc +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroCoreference.adoc @@ -1,3 +1,19 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Coreference types image::ts_coreference.png[align="center"] diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroFrequency.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroFrequency.adoc index c6a7c21cdc..f3cb4dc9bf 100644 --- a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroFrequency.adoc +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroFrequency.adoc @@ -1,3 +1,19 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Phrase types image::ts_phrase.png[align="center"] diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroMetadata.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroMetadata.adoc index 7b3936c1d0..8b8420ef9f 100644 --- a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroMetadata.adoc +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroMetadata.adoc @@ -1,3 +1,19 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + [NOTE] ==== Recording tagset and tag descriptions in the CAS is still a feature under diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroMorphology.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroMorphology.adoc index 5c4e9be39e..afdca16aa6 100644 --- a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroMorphology.adoc +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroMorphology.adoc @@ -1,2 +1,18 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Morphology types image::ts_morphology.png[align="center"] diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroPhonetics.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroPhonetics.adoc index ba9328c770..98682f140f 100644 --- a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroPhonetics.adoc +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroPhonetics.adoc @@ -1,2 +1,18 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Phonetics types image::ts_phonetics.png[align="center"] diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSegmentation.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSegmentation.adoc index 957dcc5310..e28bc4489e 100644 --- a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSegmentation.adoc +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSegmentation.adoc @@ -1,3 +1,19 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Segmentation types image::ts_segmentation.png[align="center"] diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSemantics.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSemantics.adoc index 9744588f8c..34a848cfb6 100644 --- a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSemantics.adoc +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSemantics.adoc @@ -1,3 +1,19 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Segmentation types image::ts_semantics.png[align="center"] diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSyntax.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSyntax.adoc index da4c63cf24..fa0cb2f26c 100644 --- a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSyntax.adoc +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroSyntax.adoc @@ -1,2 +1,18 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + .Syntax types image::ts_syntax.png[align="center"] diff --git a/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroXML.adoc b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroXML.adoc new file mode 100644 index 0000000000..9fb9d899df --- /dev/null +++ b/dkpro-core-doc/src/main/asciidoc/typesystem-reference/sectionIntroXML.adoc @@ -0,0 +1,18 @@ +// Copyright 2016 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +.XML structure types +image::ts_xml.png[align="center"] diff --git a/dkpro-core-doc/src/main/asciidoc/user-guide/bibliography.adoc b/dkpro-core-doc/src/main/asciidoc/user-guide/bibliography.adoc index 73bb7e2cbb..babc4415db 100644 --- a/dkpro-core-doc/src/main/asciidoc/user-guide/bibliography.adoc +++ b/dkpro-core-doc/src/main/asciidoc/user-guide/bibliography.adoc @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + === References Here are some further references that might be helpful when deciding which tools to use: diff --git a/dkpro-core-doc/src/main/asciidoc/user-guide/datasets.adoc b/dkpro-core-doc/src/main/asciidoc/user-guide/datasets.adoc index 79627fbbf7..b646b15926 100644 --- a/dkpro-core-doc/src/main/asciidoc/user-guide/datasets.adoc +++ b/dkpro-core-doc/src/main/asciidoc/user-guide/datasets.adoc @@ -87,7 +87,7 @@ artifacts:: A list of artifacts that make of the dataset. The relevant artifacts be limited to the data files themselves, but could also include license texts or readme files if they are not part of a dataset archive. If a dataset is not distributed as an archive but rather as a set of files, each of the files should be listed here. - To describe an artifact, the **name**, **url**, and **sha1** checksum are required. + To describe an artifact, the **name**, **url**, and **sha512** checksum are required. The name of the artifact should correspond to the filename part of the URL from which the artifact is downloaded. However, sometimes it is convenient to use a simpler name, e.g. `data.zip`. However, the extension should always be preserved. This is particularly @@ -95,6 +95,14 @@ artifacts:: A list of artifacts that make of the dataset. The relevant artifacts [sect_datasets_actions] section below. + If an artifact contains multiple datasets, it can be **shared** to avoid downloading and caching it redundantly. See [sect_datasets_sharing] below. + + It is possible to set the **verificationMode** to **TEXT** in order to normalize whitespace + before calculating the checksum. This is recommended for license files or documentation but + not for actual data files (even if they are in a text format such as a CoNLL variant).+ + An artifact can be marked as optional by setting **optional** to **true**. This is useful + when referencing secondary artifacts such as licenses or documentation from sources other than + the main dataset files. E.g. if the sources for the secondary artifacts are not available, then + failing to download the optional artifacts will not fail the materialization of the entire + dataset.+ + .Example artifacts section [source,yaml,indent=0] @@ -102,7 +110,7 @@ artifacts:: A list of artifacts that make of the dataset. The relevant artifacts artifacts: gum.zip: url: "https://github.com/amir-zeldes/gum/archive/V2.2.0.zip" - sha1: b17e276998ced83153be605d8157afacf1f10fdc + sha1: c9606ba69ec1152267b8...(snip)... actions: - action: explode configuration: { includes: ["dep/*", "LICENSE.txt", "README.md"], strip: 1 } diff --git a/dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/DocumentationBuilder.groovy b/dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/DocumentationBuilder.groovy similarity index 98% rename from dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/DocumentationBuilder.groovy rename to dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/DocumentationBuilder.groovy index 96428592ab..c9910b95bb 100644 --- a/dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/DocumentationBuilder.groovy +++ b/dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/DocumentationBuilder.groovy @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.doc; +package org.dkpro.core.doc; import static groovy.io.FileType.FILES; import groovy.json.*; @@ -43,6 +43,8 @@ class DocumentationBuilder { MetadataModel model = new MetadataAggregator().build(dkproCorePath); + Yaml.any(); + def templateBinding = [ project: ContextHolder.project, log: ContextHolder.log, diff --git a/dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/PomSanityCheck.groovy b/dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/PomSanityCheck.groovy similarity index 98% rename from dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/PomSanityCheck.groovy rename to dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/PomSanityCheck.groovy index d57f39ecae..d03a35e424 100644 --- a/dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/PomSanityCheck.groovy +++ b/dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/PomSanityCheck.groovy @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.doc +package org.dkpro.core.doc import static groovy.io.FileType.FILES diff --git a/dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/Util.groovy b/dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/Util.groovy similarity index 97% rename from dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/Util.groovy rename to dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/Util.groovy index eba90b2ede..7228df2dc7 100644 --- a/dkpro-core-doc/src/main/groovy/de/tudarmstadt/ukp/dkpro/core/doc/Util.groovy +++ b/dkpro-core-doc/src/main/groovy/org/dkpro/core/doc/Util.groovy @@ -15,9 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.doc +package org.dkpro.core.doc -import java.text.BreakIterator +import java.text.BreakIterator; class Util { static def editOnGithub(url) diff --git a/dkpro-core-doc/src/main/odf/figures.odp b/dkpro-core-doc/src/main/odf/figures.odp index 07915aafb9..cbc32f929c 100644 Binary files a/dkpro-core-doc/src/main/odf/figures.odp and b/dkpro-core-doc/src/main/odf/figures.odp differ diff --git a/dkpro-core-doc/src/main/script/generateComponentManifest.groovy b/dkpro-core-doc/src/main/script/generateComponentManifest.groovy index 042a1da2e2..2e19406f59 100644 --- a/dkpro-core-doc/src/main/script/generateComponentManifest.groovy +++ b/dkpro-core-doc/src/main/script/generateComponentManifest.groovy @@ -1,5 +1,21 @@ #!/usr/bin/env groovy +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + import static groovy.io.FileType.FILES; import groovy.json.*; import groovy.transform.Field; diff --git a/dkpro-core-doc/src/main/script/mappings/typesystemmapping.yaml b/dkpro-core-doc/src/main/script/mappings/typesystemmapping.yaml index c9edb9324c..611db4e94d 100644 --- a/dkpro-core-doc/src/main/script/mappings/typesystemmapping.yaml +++ b/dkpro-core-doc/src/main/script/mappings/typesystemmapping.yaml @@ -1,3 +1,19 @@ +# Copyright 2016 +# Ubiquitous Knowledge Processing (UKP) Lab +# Technische Universität Darmstadt +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly: features: description: @@ -24,7 +40,8 @@ de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain: features: first: externalReferences: [] - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Coreference', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink: features: next: @@ -33,7 +50,8 @@ de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink: externalReferences: [] referenceRelation: externalReferences: [] - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Coreference', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.discourse.type.pdtb.DiscourseArgument: features: parentRelationId: @@ -143,7 +161,8 @@ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures: externalReferences: [] reflex: externalReferences: [] - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/MorphologicalAnnotationType', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADJ: features: {} externalReferences: @@ -182,6 +201,8 @@ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS: externalReferences: - {source: NIF, id: 'http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#posTag', rel: similar} - {source: GATE, id: 'Token:pos', rel: similar } + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/PartOfSpeech', rel: similar } + - {source: OLiA, id: 'http://purl.org/olia/olia-top.owl#MorphosyntacticCategory', rel: similar } externalReferences: - {source: 'http://universaldependencies.org/u/pos/', id: 'http://universaldependencies.org/u/pos/', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADP: @@ -245,6 +266,7 @@ de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData: externalReferences: [] externalReferences: - {source: LAPPS, id: 'http://vocab.lappsgrid.org/Document', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/DocumentAnnotationType', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField: features: key: @@ -282,6 +304,7 @@ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Date: - {source: LAPPS, id: 'http://vocab.lappsgrid.org/Date', rel: similar} - {source: ISOcat, id: 'http://www.isocat.org/datcat/DC-6123', rel: similar} - {source: 'schema.org', id: 'http://schema.org/Date', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Date', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.ner.type.Disease: features: {} externalReferences: [] @@ -317,6 +340,7 @@ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location: - {source: ISOcat, id: 'http://www.isocat.org/datcat/DC-4339', rel: similar} - {source: 'schema.org', id: 'http://schema.org/Location', rel: similar} - {source: NERD, id: 'http://nerd.eurecom.fr/ontology#Location', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Location', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.ner.type.Money: features: {} externalReferences: [] @@ -333,6 +357,7 @@ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity: - {source: ISOcat, id: 'http://www.isocat.org/datcat/DC-2275', rel: similar} - {source: OWL, id: 'http://www.w3.org/2002/07/owl#Thing', rel: similar} - {source: NIF, id: 'http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#EntityOccurrence', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/NamedEntity', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.ner.type.Nationality: features: {} externalReferences: [] @@ -352,6 +377,7 @@ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization: - {source: ISOcat, id: 'http://www.isocat.org/datcat/DC-2979', rel: similar} - {source: 'schema.org', id: 'http://schema.org/Organization', rel: similar} - {source: NERD, id: 'http://nerd.eurecom.fr/ontology#Organization', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Organization', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.ner.type.PerDesc: features: {} externalReferences: [] @@ -365,6 +391,7 @@ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person: - {source: ISOcat, id: 'http://www.isocat.org/datcat/DC-2978', rel: similar} - {source: 'schema.org', id: 'http://schema.org/Person', rel: similar} - {source: NERD, id: 'http://nerd.eurecom.fr/ontology#Location', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Person', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.ner.type.Plant: features: {} externalReferences: [] @@ -424,6 +451,7 @@ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma: externalReferences: - {source: GATE, id: 'Token:lemma', rel: similar } - {source: NIF, id: 'http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#lemma', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Lemma', rel: similar} externalReferences: [] de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme: features: {} @@ -438,6 +466,7 @@ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph: externalReferences: - {source: LAPPS, id: 'http://vocab.lappsgrid.org/Paragraph', rel: similar} - {source: NIF, id: 'http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Paragraph', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Paragraph', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence: features: id: @@ -447,6 +476,8 @@ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence: - {source: LAPPS, id: 'http://vocab.lappsgrid.org/Sentence', rel: similar} - {source: ISOcat, id: 'http://www.isocat.org/datcat/DC-1386', rel: similar} - {source: NIF, id: 'http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Sentence', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Sentence', rel: similar} + - {source: OLiA, id: 'http://purl.org/olia/olia-top.owl#Sentence', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split: features: splits: @@ -458,7 +489,9 @@ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem: externalReferences: - {source: NIF, id: 'http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#stem', rel: similar} - {source: GATE, id: 'Token:stem', rel: similar } - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Stem', rel: similar } + - {source: OLiA, id: 'http://purl.org/olia/olia-top.owl#Stem', rel: similar } de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord: features: {} externalReferences: [] @@ -481,11 +514,15 @@ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token: - {source: LAPPS, id: 'http://vocab.lappsgrid.org/Token', rel: similar} - {source: ISOcat, id: 'http://www.isocat.org/datcat/DC-1403', rel: similar} - {source: NIF, id: 'http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Sentence', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Token', rel: similar} + - {source: OLiA, id: 'http://purl.org/olia/olia-top.owl#Token', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument: features: role: externalReferences: [] externalReferences: [] +de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg: + externalReferences: [] de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField: features: value: @@ -497,12 +534,22 @@ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticPredicate: externalReferences: [] arguments: externalReferences: [] - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/SemanticFrame', rel: similar} +de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred: + features: + category: + externalReferences: [] + arguments: + externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/SemanticFrame', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense: features: value: externalReferences: [] - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/WordSense', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field: features: name: @@ -533,7 +580,8 @@ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk: features: chunkValue: externalReferences: [] - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Chunk', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.INTJ: features: {} externalReferences: [] @@ -578,6 +626,8 @@ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent: externalReferences: [] externalReferences: - {source: LAPPS, id: 'http//vocab.lappsgrid.org/Constituent', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Constituent', rel: similar} + - {source: OLiA, id: 'http://purl.org/olia/olia-top.owl#Constituent', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.FRAG: features: {} externalReferences: [] @@ -613,7 +663,9 @@ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.QP: externalReferences: [] de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT: features: {} - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/ConstituencyTree', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Constituent', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.RRC: features: {} externalReferences: [] @@ -729,6 +781,7 @@ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency: externalReferences: [] externalReferences: - {source: LAPPS, id: 'http://vocab.lappsgrid.org/Dependency', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Dependency', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.EXPL: features: {} externalReferences: [] @@ -824,7 +877,9 @@ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REL: externalReferences: [] de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT: features: {} - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/DependencyTree', rel: similar} + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Dependency', rel: similar} de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.TMOD: features: {} externalReferences: [] @@ -937,11 +992,14 @@ de.tudarmstadt.ukp.dkpro.core.sentiment.type.StanfordSentimentAnnotation: externalReferences: [] veryPositive: externalReferences: [] - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Sentiment', rel: similar} de.tudarmstadt.ukp.dkpro.core.type.ReadabilityScore: features: measureName: externalReferences: [] score: externalReferences: [] - externalReferences: [] + externalReferences: + - {source: OMTD-SHARE, id: 'http://w3id.org/meta-share/omtd-share/Readability', rel: similar} + \ No newline at end of file diff --git a/dkpro-core-doc/src/main/script/templates/componentsDetails.adoc b/dkpro-core-doc/src/main/script/templates/componentsDetails.adoc index 427eea02ba..4e44626d47 100644 --- a/dkpro-core-doc/src/main/script/templates/componentsDetails.adoc +++ b/dkpro-core-doc/src/main/script/templates/componentsDetails.adoc @@ -1,5 +1,21 @@ <% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; import static groovy.json.StringEscapeUtils.escapeJava; engines diff --git a/dkpro-core-doc/src/main/script/templates/componentsOverview.adoc b/dkpro-core-doc/src/main/script/templates/componentsOverview.adoc index 6eefbf727d..5aecb45b28 100644 --- a/dkpro-core-doc/src/main/script/templates/componentsOverview.adoc +++ b/dkpro-core-doc/src/main/script/templates/componentsOverview.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; %> .Analysis Components (${engines.size()}) diff --git a/dkpro-core-doc/src/main/script/templates/componentsProducersConsumers.adoc b/dkpro-core-doc/src/main/script/templates/componentsProducersConsumers.adoc index 62da0f7b5d..04f76a7f2d 100644 --- a/dkpro-core-doc/src/main/script/templates/componentsProducersConsumers.adoc +++ b/dkpro-core-doc/src/main/script/templates/componentsProducersConsumers.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; %> .Producers and consumers by type diff --git a/dkpro-core-doc/src/main/script/templates/datasetsDetails.adoc b/dkpro-core-doc/src/main/script/templates/datasetsDetails.adoc index ed7675d495..26fc2bdcf9 100644 --- a/dkpro-core-doc/src/main/script/templates/datasetsDetails.adoc +++ b/dkpro-core-doc/src/main/script/templates/datasetsDetails.adoc @@ -1,4 +1,20 @@ <% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + def renderLicense(license) { if (license.url) { diff --git a/dkpro-core-doc/src/main/script/templates/datasetsOverview.adoc b/dkpro-core-doc/src/main/script/templates/datasetsOverview.adoc index c7af272a0b..d3ac85e31b 100644 --- a/dkpro-core-doc/src/main/script/templates/datasetsOverview.adoc +++ b/dkpro-core-doc/src/main/script/templates/datasetsOverview.adoc @@ -1,3 +1,21 @@ +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +%> + .Datasets (${datasets.size()}) [options="header", cols="4,1,1,1,1"] |==== diff --git a/dkpro-core-doc/src/main/script/templates/formatsDetails.adoc b/dkpro-core-doc/src/main/script/templates/formatsDetails.adoc index 5ca8d2da75..b4a9bb06b8 100644 --- a/dkpro-core-doc/src/main/script/templates/formatsDetails.adoc +++ b/dkpro-core-doc/src/main/script/templates/formatsDetails.adoc @@ -1,5 +1,21 @@ <% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; import static groovy.json.StringEscapeUtils.escapeJava; import org.apache.commons.lang3.StringUtils; import groovy.transform.Field; diff --git a/dkpro-core-doc/src/main/script/templates/formatsOverview.adoc b/dkpro-core-doc/src/main/script/templates/formatsOverview.adoc index fabe4e1427..ed8c7cada2 100644 --- a/dkpro-core-doc/src/main/script/templates/formatsOverview.adoc +++ b/dkpro-core-doc/src/main/script/templates/formatsOverview.adoc @@ -1,3 +1,21 @@ +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +%> + .Formats (${formats.size()}) [options="header"] |==== diff --git a/dkpro-core-doc/src/main/script/templates/languagesDetails.adoc b/dkpro-core-doc/src/main/script/templates/languagesDetails.adoc index 36767f7fbf..f4e1a27754 100644 --- a/dkpro-core-doc/src/main/script/templates/languagesDetails.adoc +++ b/dkpro-core-doc/src/main/script/templates/languagesDetails.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; def languages = engines.collect { k,e -> e.allLanguages }.flatten().unique().sort(); def roles = engines.collect { k,e -> e.role }.flatten().unique().sort { it != 'Other' ? it : 'Z' }; diff --git a/dkpro-core-doc/src/main/script/templates/languagesOverview.adoc b/dkpro-core-doc/src/main/script/templates/languagesOverview.adoc index fea499f27e..f70d03de03 100644 --- a/dkpro-core-doc/src/main/script/templates/languagesOverview.adoc +++ b/dkpro-core-doc/src/main/script/templates/languagesOverview.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; def languages = engines.collect { k,e -> e.allLanguages }.flatten().unique().sort(); def roles = engines.collect { k,e -> e.role }.flatten().unique().sort { it != 'Other' ? it : 'Z' }; diff --git a/dkpro-core-doc/src/main/script/templates/modelsDetails.adoc b/dkpro-core-doc/src/main/script/templates/modelsDetails.adoc index 07c3e5840c..cdff438a52 100644 --- a/dkpro-core-doc/src/main/script/templates/modelsDetails.adoc +++ b/dkpro-core-doc/src/main/script/templates/modelsDetails.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; def metadataValueText(model, key, value) { diff --git a/dkpro-core-doc/src/main/script/templates/modelsOverview.adoc b/dkpro-core-doc/src/main/script/templates/modelsOverview.adoc index 26772d9900..e93648bbd1 100644 --- a/dkpro-core-doc/src/main/script/templates/modelsOverview.adoc +++ b/dkpro-core-doc/src/main/script/templates/modelsOverview.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; %> .Models (${models.size()}) diff --git a/dkpro-core-doc/src/main/script/templates/tagsetJson.groovy b/dkpro-core-doc/src/main/script/templates/tagsetJson.groovy index dbcf8e46c1..2d126adf26 100644 --- a/dkpro-core-doc/src/main/script/templates/tagsetJson.groovy +++ b/dkpro-core-doc/src/main/script/templates/tagsetJson.groovy @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + import groovy.json.*; import java.util.regex.Pattern; diff --git a/dkpro-core-doc/src/main/script/templates/tagsetsDetails.adoc b/dkpro-core-doc/src/main/script/templates/tagsetsDetails.adoc index 75af936030..ba6bd4b0b7 100644 --- a/dkpro-core-doc/src/main/script/templates/tagsetsDetails.adoc +++ b/dkpro-core-doc/src/main/script/templates/tagsetsDetails.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; import java.util.regex.Pattern; def escape(text) diff --git a/dkpro-core-doc/src/main/script/templates/tagsetsOverview.adoc b/dkpro-core-doc/src/main/script/templates/tagsetsOverview.adoc index 9258824a0b..d9c4b99dab 100644 --- a/dkpro-core-doc/src/main/script/templates/tagsetsOverview.adoc +++ b/dkpro-core-doc/src/main/script/templates/tagsetsOverview.adoc @@ -1,4 +1,20 @@ -<% +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + def redirection(tagset) { def id = tagset.mapping.getString('__META_REDIRECT__'); diff --git a/dkpro-core-doc/src/main/script/templates/typesystemDetails.adoc b/dkpro-core-doc/src/main/script/templates/typesystemDetails.adoc index 09e80d46aa..6af431f9c9 100644 --- a/dkpro-core-doc/src/main/script/templates/typesystemDetails.adoc +++ b/dkpro-core-doc/src/main/script/templates/typesystemDetails.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; import org.dkpro.meta.core.model.EngineModel; import org.dkpro.meta.core.model.FormatModel; diff --git a/dkpro-core-doc/src/main/script/templates/typesystemOverview.adoc b/dkpro-core-doc/src/main/script/templates/typesystemOverview.adoc index ea87709dcf..de68092f5f 100644 --- a/dkpro-core-doc/src/main/script/templates/typesystemOverview.adoc +++ b/dkpro-core-doc/src/main/script/templates/typesystemOverview.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; %> .Top-level Types diff --git a/dkpro-core-doc/src/main/script/templates/typesystemSubtypeOverview.adoc b/dkpro-core-doc/src/main/script/templates/typesystemSubtypeOverview.adoc index 82b2d1f329..e2942623b8 100644 --- a/dkpro-core-doc/src/main/script/templates/typesystemSubtypeOverview.adoc +++ b/dkpro-core-doc/src/main/script/templates/typesystemSubtypeOverview.adoc @@ -1,5 +1,21 @@ -<% -import de.tudarmstadt.ukp.dkpro.core.doc.Util; +<% +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import org.dkpro.core.doc.Util; %> <% diff --git a/dkpro-core-doc/src/main/script/templates/typesystemYaml.groovy b/dkpro-core-doc/src/main/script/templates/typesystemYaml.groovy index 35c9c48a6d..18c2b0f096 100644 --- a/dkpro-core-doc/src/main/script/templates/typesystemYaml.groovy +++ b/dkpro-core-doc/src/main/script/templates/typesystemYaml.groovy @@ -1,3 +1,19 @@ +// Copyright 2018 +// Ubiquitous Knowledge Processing (UKP) Lab +// Technische Universität Darmstadt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + import groovy.json.*; import org.yaml.snakeyaml.Yaml; import java.util.regex.Pattern; diff --git a/dkpro-core-eval-asl/pom.xml b/dkpro-core-eval-asl/pom.xml index f36ed5d894..9012eaf081 100644 --- a/dkpro-core-eval-asl/pom.xml +++ b/dkpro-core-eval-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.eval-asl + dkpro-core-eval-asl jar DKPro Core ASL - Evaluation + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -40,8 +41,8 @@ snakeyaml - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl diff --git a/dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/EvalUtil.java b/dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/EvalUtil.java similarity index 94% rename from dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/EvalUtil.java rename to dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/EvalUtil.java index f17e1f409f..b33ff562ea 100644 --- a/dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/EvalUtil.java +++ b/dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/EvalUtil.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.eval; +package org.dkpro.core.eval; import static org.apache.uima.fit.pipeline.SimplePipeline.iteratePipeline; import static org.apache.uima.fit.util.JCasUtil.select; @@ -36,12 +36,12 @@ import org.apache.uima.fit.pipeline.JCasIterable; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.eval.measure.FMeasure; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; import org.yaml.snakeyaml.Yaml; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.eval.measure.FMeasure; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; public class EvalUtil { diff --git a/dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/measure/FMeasure.java b/dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/measure/FMeasure.java similarity index 97% rename from dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/measure/FMeasure.java rename to dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/measure/FMeasure.java index a85bdbc2a7..5f8993f912 100644 --- a/dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/measure/FMeasure.java +++ b/dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/measure/FMeasure.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.eval.measure; +package org.dkpro.core.eval.measure; import java.util.Collection; import java.util.HashSet; diff --git a/dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/model/Span.java b/dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/model/Span.java similarity index 98% rename from dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/model/Span.java rename to dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/model/Span.java index 0c42037566..f2b9ee0a03 100644 --- a/dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/model/Span.java +++ b/dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/model/Span.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.eval.model; +package org.dkpro.core.eval.model; public class Span { diff --git a/dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/report/Result.java b/dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/report/Result.java similarity index 96% rename from dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/report/Result.java rename to dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/report/Result.java index a22606fffa..47ebcc8b6f 100644 --- a/dkpro-core-eval-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/eval/report/Result.java +++ b/dkpro-core-eval-asl/src/main/java/org/dkpro/core/eval/report/Result.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.eval.report; +package org.dkpro.core.eval.report; public class Result { diff --git a/dkpro-core-flextag-asl/pom.xml b/dkpro-core-flextag-asl/pom.xml index c3d77b8aac..0363fd92d4 100644 --- a/dkpro-core-flextag-asl/pom.xml +++ b/dkpro-core-flextag-asl/pom.xml @@ -16,16 +16,19 @@ limitations under the License. --> - + 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.flextag-asl + dkpro-core-flextag-asl DKPro Core ASL - FlexTag + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -36,20 +39,20 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl org.dkpro.tc @@ -67,14 +70,18 @@ de.unidue.ltl.flextag flextag-features + + eu.openminted.share.annotations + omtd-share-annotations-api + junit junit test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-flextag-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/flextag/FlexTagPosTagger.java b/dkpro-core-flextag-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/flextag/FlexTagPosTagger.java deleted file mode 100644 index b7a00ee238..0000000000 --- a/dkpro-core-flextag-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/flextag/FlexTagPosTagger.java +++ /dev/null @@ -1,179 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.flextag; - -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.factory.AnalysisEngineFactory; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.dkpro.tc.api.type.TextClassificationOutcome; -import org.dkpro.tc.ml.uima.TcAnnotator; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Flexible part-of-speech tagger. - */ -@ResourceMetaData(name="FlexTag POS-Tagger") -public class FlexTagPosTagger - extends JCasAnnotator_ImplBase -{ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - private String modelLocation; - - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - private String language; - - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - private String variant; - - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - private String posMappingLocation; - - - private AnalysisEngine flexTagEngine = null; - private ModelProviderBase modelProvider = null; - private MappingProvider mappingProvider=null; - - @Override - public void initialize(final UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - initModelProvider(); - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - - flexTagEngine = AnalysisEngineFactory.createEngine(TcAnnotator.class, - TcAnnotator.PARAM_TC_MODEL_LOCATION, modelProvider.getResource(), - TcAnnotator.PARAM_NAME_SEQUENCE_ANNOTATION, Sentence.class.getName(), - TcAnnotator.PARAM_NAME_UNIT_ANNOTATION, Token.class.getName()); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - mappingProvider.configure(aJCas.getCas()); - - flexTagEngine.process(aJCas); - - annotateTaggingResultsLinkToTokens(aJCas); - } - - private void annotateTaggingResultsLinkToTokens(JCas aJCas) - { - List tokens = getTokens(aJCas); - List outcomes = getPredictions(aJCas); - - for (int i = 0; i < tokens.size(); i++) { - Token token = tokens.get(i); - TextClassificationOutcome outcome = outcomes.get(i); - String posTag = outcome.getOutcome(); - - POS p = createPartOfSpeechAnnotationFromOutcome(aJCas, token.getBegin(), - token.getEnd(), posTag); - token.setPos(p); - outcome.removeFromIndexes(aJCas); - } - - } - - private POS createPartOfSpeechAnnotationFromOutcome(JCas aJCas, int begin, int end, - String aOutcome) - { - Type posTag = mappingProvider.getTagType(aOutcome); - POS posAnno = (POS) aJCas.getCas().createAnnotation(posTag, begin, end); - posAnno.setPosValue(aOutcome); - POSUtils.assignCoarseValue(posAnno); - posAnno.addToIndexes(); - - return posAnno; - - } - - private List getPredictions(JCas aJCas) - { - return new ArrayList(JCasUtil.select(aJCas, - TextClassificationOutcome.class)); - } - - private List getTokens(JCas aJCas) - { - return new ArrayList(JCasUtil.select(aJCas, Token.class)); - } - - private void initModelProvider() - throws ResourceInitializationException - { - modelProvider = new ModelProviderBase() - { - { - setContextObject(FlexTagPosTagger.this); - - setDefault(ARTIFACT_ID, "${groupId}.flextag-model-${language}-${variant}"); - setDefault(LOCATION, - "classpath:/${package}/lib/tagger-${language}-${variant}.properties"); - - setOverride(LOCATION, modelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected File produceResource(URL aUrl) - throws IOException - { - File folder = ResourceUtils.getClasspathAsFolder(aUrl.toString(), true); - return folder; - } - }; - try { - modelProvider.configure(); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - } -} diff --git a/dkpro-core-flextag-asl/src/main/java/org/dkpro/core/flextag/FlexTagPosTagger.java b/dkpro-core-flextag-asl/src/main/java/org/dkpro/core/flextag/FlexTagPosTagger.java new file mode 100644 index 0000000000..0e269a3a91 --- /dev/null +++ b/dkpro-core-flextag-asl/src/main/java/org/dkpro/core/flextag/FlexTagPosTagger.java @@ -0,0 +1,223 @@ +/** + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.flextag; + +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.tc.api.type.TextClassificationOutcome; +import org.dkpro.tc.ml.uima.TcAnnotator; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Flexible part-of-speech tagger. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "FlexTag POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class FlexTagPosTagger + extends JCasAnnotator_ImplBase +{ + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Location from which the model is read. This is either a local path or a classpath location. + * In the latter case, the model artifact (if any) is searched as well. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + private String modelLocation; + + /** + * Use this language instead of the document language to resolve the model and tag set mapping. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + /** + * Variant of a model the model. Used to address a specific model if here are multiple models + * for one language. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + private String variant; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + private String posMappingLocation; + + private AnalysisEngine flexTagEngine = null; + private ModelProviderBase modelProvider = null; + private MappingProvider mappingProvider = null; + + @Override + public void initialize(final UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + initModelProvider(); + + mappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); + + flexTagEngine = AnalysisEngineFactory.createEngine(TcAnnotator.class, + TcAnnotator.PARAM_TC_MODEL_LOCATION, modelProvider.getResource(), + TcAnnotator.PARAM_NAME_SEQUENCE_ANNOTATION, Sentence.class.getName(), + TcAnnotator.PARAM_NAME_UNIT_ANNOTATION, Token.class.getName()); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + mappingProvider.configure(aJCas.getCas()); + + flexTagEngine.process(aJCas); + + annotateTaggingResultsLinkToTokens(aJCas); + } + + private void annotateTaggingResultsLinkToTokens(JCas aJCas) + { + List tokens = getTokens(aJCas); + List outcomes = getPredictions(aJCas); + + for (int i = 0; i < tokens.size(); i++) { + Token token = tokens.get(i); + TextClassificationOutcome outcome = outcomes.get(i); + String posTag = outcome.getOutcome(); + + POS p = createPartOfSpeechAnnotationFromOutcome(aJCas, token.getBegin(), + token.getEnd(), posTag); + token.setPos(p); + outcome.removeFromIndexes(aJCas); + } + + } + + private POS createPartOfSpeechAnnotationFromOutcome(JCas aJCas, int begin, int end, + String aOutcome) + { + Type posTag = mappingProvider.getTagType(aOutcome); + POS posAnno = (POS) aJCas.getCas().createAnnotation(posTag, begin, end); + posAnno.setPosValue(aOutcome); + POSUtils.assignCoarseValue(posAnno); + posAnno.addToIndexes(); + + return posAnno; + + } + + private List getPredictions(JCas aJCas) + { + return new ArrayList(JCasUtil.select(aJCas, + TextClassificationOutcome.class)); + } + + private List getTokens(JCas aJCas) + { + return new ArrayList(JCasUtil.select(aJCas, Token.class)); + } + + private void initModelProvider() + throws ResourceInitializationException + { + modelProvider = new ModelProviderBase() + { + { + setContextObject(FlexTagPosTagger.this); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(ARTIFACT_ID, "${groupId}.flextag-model-${language}-${variant}"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/flextag/lib/tagger-${language}-${variant}.properties"); + + setOverride(LOCATION, modelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected File produceResource(URL aUrl) + throws IOException + { + File folder = ResourceUtils.getClasspathAsFolder(aUrl.toString(), true); + return folder; + } + }; + try { + modelProvider.configure(); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + } +} diff --git a/dkpro-core-flextag-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/flextag/FlexTagPosTaggerTest.java b/dkpro-core-flextag-asl/src/test/java/org/dkpro/core/flextag/FlexTagPosTaggerTest.java similarity index 89% rename from dkpro-core-flextag-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/flextag/FlexTagPosTaggerTest.java rename to dkpro-core-flextag-asl/src/test/java/org/dkpro/core/flextag/FlexTagPosTaggerTest.java index 8f85275cc0..15f9160cc7 100644 --- a/dkpro-core-flextag-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/flextag/FlexTagPosTaggerTest.java +++ b/dkpro-core-flextag-asl/src/test/java/org/dkpro/core/flextag/FlexTagPosTaggerTest.java @@ -1,5 +1,5 @@ /** - * Copyright 2007-2017 + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -15,21 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.flextag; +package org.dkpro.core.flextag; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.flextag.FlexTagPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class FlexTagPosTaggerTest { diff --git a/dkpro-core-flextag-asl/src/test/resources/log4j.properties b/dkpro-core-flextag-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-flextag-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-flextag-asl/src/test/resources/log4j2.xml b/dkpro-core-flextag-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-flextag-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-frequency-asl/pom.xml b/dkpro-core-frequency-asl/pom.xml index b9fae107d1..341daf84bb 100644 --- a/dkpro-core-frequency-asl/pom.xml +++ b/dkpro-core-frequency-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.frequency-asl + dkpro-core-frequency-asl jar DKPro Core ASL - Frequency (ASL) + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -49,57 +50,60 @@ 1.1.2 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl + org.dkpro.core + dkpro-core-api-featurepath-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.frequency-asl + org.dkpro.core + dkpro-core-api-frequency-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - junit - junit - test + eu.openminted.share.annotations + omtd-share-annotations-api - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.tokit-asl - test + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl - test + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl - test + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.apache.commons + commons-collections4 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + junit + junit test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-tokit-asl + test - org.apache.commons - commons-collections4 + org.dkpro.core + dkpro-core-io-text-asl + test + + + org.dkpro.core + dkpro-core-testing-asl + test @@ -119,5 +123,21 @@ + + + eu.openminted.share.annotations + omtd-share-annotations-maven-plugin + + + + **/*.xml + + + + \ No newline at end of file diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/FrequencyCounter.java b/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/FrequencyCounter.java deleted file mode 100644 index c3436ccd6a..0000000000 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/FrequencyCounter.java +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - *

- * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection; - -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import org.apache.commons.collections4.Bag; -import org.apache.commons.collections4.bag.HashBag; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; -import java.util.stream.Stream; - -/** - * Count unigrams and bigrams in a collection. - */ -@ResourceMetaData(name="Frequency Count Writer") -public class FrequencyCounter - extends JCasFileWriter_ImplBase -{ - /** - * When concatenating multiple tokens, this string is inserted between them. - */ - static final String BIGRAM_SEPARATOR = " "; - /** - * Columns (i.e. tokens and counts) are separated by this character. - */ - static final String COLUMN_SEPARATOR = "\t"; - /** - * When hitting a column separator within a token, it is replaced by this token. - */ - static final String COLUMN_SEP_REPLACEMENT = " "; - - /** - * This string (a line) will separate unigrams from bigrams in the output file - **/ - static final String NGRAM_SEPARATOR_LINE = "----------------------------------------------------"; - static final String NEWLINE_REGEX = "\r\n?|\n"; - - /** - * The feature path. Default: tokens. - */ - public static final String PARAM_FEATURE_PATH = "featurePath"; - @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = false) - private String featurePath; - private static final String DEFAULT_FEATURE_PATH = Token.class.getCanonicalName(); - - /** - * Set this parameter if bigrams should only be counted when occurring within a covering type, e.g. sentences. - */ - public static final String PARAM_COVERING_TYPE = "coveringType"; - @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = false) - private String coveringType; - - /** - * If true, all tokens are lowercased. - */ - public static final String PARAM_LOWERCASE = "lowercase"; - @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = true, defaultValue = "false") - private boolean lowercase; - - /** - * Tokens occurring fewer times than this value are omitted. Default: 5. - */ - public static final String PARAM_MIN_COUNT = "minCount"; - @ConfigurationParameter(name = PARAM_MIN_COUNT, mandatory = true, defaultValue = "5") - private int minCount; - - /** - * If true, sort output by count (descending order). - */ - public static final String PARAM_SORT_BY_COUNT = "sortByCount"; - @ConfigurationParameter(name = PARAM_SORT_BY_COUNT, mandatory = true, defaultValue = "false") - private boolean sortByCount; - - /** - * If true, sort output alphabetically. - */ - public static final String PARAM_SORT_BY_ALPHABET = "sortByAlphabet"; - @ConfigurationParameter(name = PARAM_SORT_BY_ALPHABET, mandatory = true, defaultValue = "false") - private boolean sortByAlphabet; - - public static final String PARAM_STOPWORDS_FILE = "stopwordsFile"; - @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = true, defaultValue = "") - private String stopwordsFile; - - public static final String PARAM_STOPWORDS_REPLACEMENT = "stopwordsReplacement"; - @ConfigurationParameter(name = PARAM_STOPWORDS_REPLACEMENT, mandatory = true, defaultValue = "") - private String stopwordsReplacement; - - public static final String PARAM_FILTER_REGEX = "filterRegex"; - @ConfigurationParameter(name = PARAM_FILTER_REGEX, mandatory = true, defaultValue = "") - private String filterRegex; - - public static final String PARAM_REGEX_REPLACEMENT = "regexReplacement"; - @ConfigurationParameter(name = PARAM_REGEX_REPLACEMENT, mandatory = true, defaultValue = "") - private String regexReplacement; - - private Bag unigrams; - private Bag bigrams; - private StringSequenceGenerator sequenceGenerator; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - if (sortByAlphabet && sortByCount) { - throw new ResourceInitializationException(new IllegalArgumentException( - "Can only sort either by count or alphabetically.")); - } - - unigrams = new HashBag<>(); - bigrams = new HashBag<>(); - - /* set feature path to default */ - if (featurePath == null) { - featurePath = DEFAULT_FEATURE_PATH; - } - - /* init sequence generator */ - try { - sequenceGenerator = new PhraseSequenceGenerator.Builder() - .featurePath(featurePath) - .coveringType(coveringType) - .lowercase(lowercase) - .stopwordsFile(stopwordsFile) - .stopwordsReplacement(stopwordsReplacement) - .filterRegex(filterRegex) - .filterRegexReplacement(regexReplacement) - .buildStringSequenceGenerator(); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - try { - /* iterate over sequences (e.g. sentences)*/ - for (String[] sequence : sequenceGenerator.tokenSequences(aJCas)) { - /* iterate over tokens in sequence */ - for (int i = 0; i < sequence.length; i++) { - /* count unigrams */ - String unigram = sequence[i] - .replaceAll(COLUMN_SEPARATOR, COLUMN_SEP_REPLACEMENT) - .replaceAll(NEWLINE_REGEX, COLUMN_SEP_REPLACEMENT); - unigrams.add(unigram); - - /* count bigrams */ - if (i + 1 < sequence.length) { - String bigram = unigram + BIGRAM_SEPARATOR + sequence[i + 1] - .replaceAll(COLUMN_SEPARATOR, COLUMN_SEP_REPLACEMENT) - .replaceAll(NEWLINE_REGEX, COLUMN_SEP_REPLACEMENT); - bigrams.add(bigram); - } - } - } - } - catch (FeaturePathException e) { - throw new AnalysisEngineProcessException(e); - } - } - - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - getLogger().info("Vocabulary size: " + unigrams.uniqueSet().size()); - try { - getLogger().info("Writing frequencies to " + getTargetLocation()); - OutputStream os = CompressionUtils.getOutputStream(new File(getTargetLocation())); - - writeNgrams(os, unigrams); - os.write((NGRAM_SEPARATOR_LINE + "\n").getBytes()); - writeNgrams(os, bigrams); - os.close(); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - - /** - * Write counter with counts from a bag to an output stream. - * - * @param os an {@link OutputStream} - * @param counter a {@link Bag} of string counter - */ - private void writeNgrams(OutputStream os, Bag counter) - { - /* create token stream */ - Stream stream = counter.uniqueSet().stream() - .filter(token -> counter.getCount(token) >= minCount); - - /* sort output */ - if (sortByAlphabet) { - stream = stream.sorted(String::compareTo); - } - else if (sortByCount) { - stream = stream.sorted((o1, o2) -> - -Integer.compare(counter.getCount(o1), counter.getCount(o2))); - } - - /* write tokens with counts */ - stream.forEach(token -> { - try { - os.write((token + COLUMN_SEPARATOR + counter.getCount(token) + "\n").getBytes()); - } - catch (IOException e) { - throw new RuntimeException(e); - } - }); - } -} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProvider.java b/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProvider.java deleted file mode 100644 index 8e3c093771..0000000000 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProvider.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.frequency.resources; - -import java.util.Map; - -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.FrequencyCountResourceBase; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; -import de.tudarmstadt.ukp.dkpro.core.frequency.BerkeleyLmProvider; - -/** - * External resource wrapper for the Berkeley LM frequency count provider. - * - * - */ -public final class BerkeleyLmFrequencyCountProvider - extends FrequencyCountResourceBase - implements FrequencyCountProvider -{ - - public static final String PARAM_BINARY = "BinaryFile"; - @ConfigurationParameter(name = PARAM_BINARY, mandatory = true) - protected String file; - - public static final String PARAM_PROVIDER_LANGUAGE = "ProviderLanguage"; - @ConfigurationParameter(name = PARAM_PROVIDER_LANGUAGE, mandatory = true) - protected String language; - - @Override - public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } - - try { - initializeProvider(); - } - catch (Exception e) { - throw new ResourceInitializationException(e); - } - - return true; - } - - protected void initializeProvider() throws Exception{ - provider = new BerkeleyLmProvider(file, language); - } - -} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfAnnotator.java b/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfAnnotator.java deleted file mode 100644 index ddb2739171..0000000000 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfAnnotator.java +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf; - -import java.util.Locale; -import java.util.Map.Entry; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Level; - -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfModel; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfStore; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.SharedDfModel; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.FreqDist; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.TermIterator; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.TfidfUtils; - -/** - * This component adds {@link Tfidf} annotations consisting of a term and a tfidf weight.
- * The annotator is type agnostic concerning the input annotation, so you have to specify the - * annotation type and string representation. It uses a pre-serialized {@link DfStore}, which can be - * created using the {@link TfidfConsumer}. - */ -@ResourceMetaData(name="TF/IDF Annotator") -@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf" }) -public class TfidfAnnotator - extends JCasAnnotator_ImplBase -{ - - /** - * This annotator is type agnostic, so it is mandatory to specify the type of the working - * annotation and how to obtain the string representation with the feature path. - */ - public static final String PARAM_FEATURE_PATH = "featurePath"; - @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true) - protected String featurePath; - - /** - * Provide the path to the Df-Model. When a shared {@link SharedDfModel} is bound to this - * annotator, this is ignored. - */ - public static final String PARAM_TFDF_PATH = "tfdfPath"; - @ConfigurationParameter(name = PARAM_TFDF_PATH, mandatory = false) - private String tfdfPath; - - /** - * If set to true, the whole text is handled in lower case. - */ - public static final String PARAM_LOWERCASE = "lowercase"; - @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = false, defaultValue = "false") - protected boolean lowercase; - - /** - * The model for term frequency weighting.
- * Invoke toString() on an enum of {@link WeightingModeTf} for setup. - *

- * Default value is "NORMAL" yielding an unweighted tf. - */ - public static final String PARAM_TF_MODE = "weightingModeTf"; - @ConfigurationParameter(name = PARAM_TF_MODE, mandatory = false, defaultValue = "NORMAL") - private WeightingModeTf weightingModeTf; - - /** - * The model for inverse document frequency weighting.
- * Invoke toString() on an enum of {@link WeightingModeIdf} for setup. - *

- * Default value is "NORMAL" yielding an unweighted idf. - */ - public static final String PARAM_IDF_MODE = "weightingModeIdf"; - @ConfigurationParameter(name = PARAM_IDF_MODE, mandatory = false, defaultValue = "NORMAL") - private WeightingModeIdf weightingModeIdf; - - /** - * Available modes for term frequency - */ - public enum WeightingModeTf - { - BINARY, NORMAL, LOG, LOG_PLUS_ONE - } - - /** - * Available modes for inverse document frequency - */ - public enum WeightingModeIdf - { - BINARY, CONSTANT_ONE, NORMAL, LOG - } - - private DfModel dfModel; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - try { - dfModel = TfidfUtils.getDfModel(tfdfPath); - } - catch (Exception e) { - throw new ResourceInitializationException(e); - } - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - - FreqDist termFrequencies = getTermFrequencies(jcas); - - try { - for (Entry entry : FeaturePathFactory.select(jcas.getCas(), - featurePath)) { - String term = entry.getValue(); - if (lowercase) { - term = term.toLowerCase(); - } - - int tf = termFrequencies.getCount(term); - int df = dfModel.getDf(term); - if (df == 0) { - getContext().getLogger().log(Level.WARNING, - "Term [" + term + "] not found in dfStore!"); - } - - double tfidf = getWeightedTf(tf) * getWeightedIdf(df, dfModel.getDocumentCount()); - - logTfidf(term, tf, df, tfidf); - - Tfidf tfidfAnnotation = new Tfidf(jcas); - tfidfAnnotation.setTerm(term); - tfidfAnnotation.setTfidfValue(tfidf); - tfidfAnnotation.setBegin(entry.getKey().getBegin()); - tfidfAnnotation.setEnd(entry.getKey().getEnd()); - tfidfAnnotation.addToIndexes(); - } - } - catch (FeaturePathException e) { - throw new AnalysisEngineProcessException(e); - } - } - - protected FreqDist getTermFrequencies(JCas jcas) - throws AnalysisEngineProcessException - { - // count all terms with the given annotation - FreqDist termFrequencies = new FreqDist(); - for (String term : TermIterator.create(jcas, featurePath, lowercase)) { - termFrequencies.count(term); - } - return termFrequencies; - } - - /** - * Calculates a weighted tf according to given settings. - */ - private double getWeightedTf(int tf) - { - switch (weightingModeTf) { - case NORMAL: - return tf; - case LOG: - return tf > 0 ? Math.log(tf) : 0D; - case LOG_PLUS_ONE: - return tf > 0 ? Math.log(tf + 1) : 0D; - case BINARY: - return tf > 0 ? 1D : 0D; - default: - throw new IllegalStateException(); - } - } - - /** - * Calculates a weighted idf according to given settings. - */ - private double getWeightedIdf(int df, int n) - { - switch (weightingModeIdf) { - case NORMAL: - return (double) n / df; - case LOG: - return df > 0 ? Math.log((double) n / df) : 0D; - case CONSTANT_ONE: - return 1D; - case BINARY: - return df > 0 ? 1D : 0D; - default: - throw new IllegalStateException(); - } - } - - private void logTfidf(String term, int tf, int df, double tfidf) - { - if (getContext().getLogger().isLoggable(Level.FINEST)) { - getContext().getLogger().log( - Level.FINEST, - String.format(Locale.US, "\"%s\" (tf: %d, df: %d, tfidf: %.2f)", term, tf, df, - tfidf)); - } - - } - -} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfConsumer.java b/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfConsumer.java deleted file mode 100644 index 2708e407e3..0000000000 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfConsumer.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfModel; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfStore; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.TermIterator; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.TfidfUtils; - -/** - * This consumer builds a {@link DfModel}. It collects the df (document frequency) counts for the - * processed collection. The counts are serialized as a {@link DfModel}-object. - */ -@ResourceMetaData(name="TF/IDF Model Writer") -public class TfidfConsumer - extends JCasAnnotator_ImplBase -{ - @Deprecated - public static final String PARAM_OUTPUT_PATH = ComponentParameters.PARAM_TARGET_LOCATION; - /** - * Specifies the path and filename where the model file is written. - */ - public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; - @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) - private String outputPath; - - /** - * If set to true, the whole text is handled in lower case. - */ - public static final String PARAM_LOWERCASE = "lowercase"; - @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = true, defaultValue = "false") - private boolean lowercase; - - /** - * This annotator is type agnostic, so it is mandatory to specify the type of the working - * annotation and how to obtain the string representation with the feature path. - */ - public static final String PARAM_FEATURE_PATH = "featurePath"; - @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true) - private String featurePath; - - private DfStore dfStore; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - dfStore = new DfStore(featurePath, lowercase); - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - dfStore.registerNewDocument(); - - for (String term : TermIterator.create(jcas, featurePath, lowercase)) { - dfStore.countTerm(term); - } - - dfStore.closeCurrentDocument(); - } - - /** - * When this method is called by the framework, the dfModel is serialized. - */ - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - try { - TfidfUtils.writeDfModel(dfStore, outputPath); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } -} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/BerkeleyLmProvider.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/BerkeleyLmProvider.java similarity index 84% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/BerkeleyLmProvider.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/BerkeleyLmProvider.java index 6858466def..880bd1f694 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/BerkeleyLmProvider.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/BerkeleyLmProvider.java @@ -15,12 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency; +package org.dkpro.core.frequency; import java.util.Arrays; import java.util.Iterator; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; + import edu.berkeley.nlp.lm.NgramLanguageModel; import edu.berkeley.nlp.lm.io.LmReaders; @@ -30,26 +31,26 @@ * nGram index files are looked up in DKPRO_HOME directory. */ public class BerkeleyLmProvider - implements FrequencyCountProvider + implements FrequencyCountProvider { private final NgramLanguageModel lm; private String language; - public BerkeleyLmProvider(String binaryFile, String language) - throws Exception - { - lm = LmReaders.readLmBinary(binaryFile); - this.language = language; - } + public BerkeleyLmProvider(String binaryFile, String language) + throws Exception + { + lm = LmReaders.readLmBinary(binaryFile); + this.language = language; + } - // FIXME how to obtain phrase count from logProb + // FIXME how to obtain phrase count from logProb @Override public long getFrequency(String phrase) - { + { throw new UnsupportedOperationException("Not implemented yet."); // return getProbnew Float(Math.exp(logProb)).longValue(); - } + } @Override public double getProbability(String phrase) @@ -98,4 +99,4 @@ public String getID() { return this.getClass().getSimpleName(); } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TFileAccessProvider.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TFileAccessProvider.java similarity index 97% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TFileAccessProvider.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TFileAccessProvider.java index 4ec8cbc84e..57e76ac63f 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TFileAccessProvider.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TFileAccessProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency; +package org.dkpro.core.frequency; import java.io.File; import java.io.IOException; @@ -54,4 +54,4 @@ public Web1TFileAccessProvider(String language, File indexPath, int minN, int ma basePath = indexPath.getAbsolutePath(); this.language = language; } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TInMemoryProvider.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TInMemoryProvider.java similarity index 95% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TInMemoryProvider.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TInMemoryProvider.java index fb7065dab7..ccf7032efa 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TInMemoryProvider.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TInMemoryProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency; +package org.dkpro.core.frequency; import java.io.IOException; @@ -32,4 +32,4 @@ public Web1TInMemoryProvider(String language, String ngramLocation, int maxNgram basePath = ngramLocation; this.language = language; } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TProviderBase.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TProviderBase.java similarity index 92% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TProviderBase.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TProviderBase.java index 6fc4cf9b11..726e80c683 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TProviderBase.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/Web1TProviderBase.java @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency; +package org.dkpro.core.frequency; import java.io.IOException; import java.util.Iterator; +import org.dkpro.core.api.frequency.provider.FrequencyCountProviderBase; + import com.googlecode.jweb1t.JWeb1TIterator; import com.googlecode.jweb1t.Searcher; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProviderBase; - public abstract class Web1TProviderBase extends FrequencyCountProviderBase { @@ -73,4 +73,4 @@ public String getLanguage() { return this.language; } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/phrasedetection/FrequencyWriter.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/phrasedetection/FrequencyWriter.java new file mode 100644 index 0000000000..8b28d5d0d7 --- /dev/null +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/phrasedetection/FrequencyWriter.java @@ -0,0 +1,253 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.frequency.phrasedetection; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.stream.Stream; + +import org.apache.commons.collections4.Bag; +import org.apache.commons.collections4.bag.HashBag; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; +import org.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; +import org.dkpro.core.api.resources.CompressionUtils; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Count uni-grams and bi-grams in a collection. + */ +@ResourceMetaData(name = "Frequency Writer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class FrequencyWriter + extends JCasFileWriter_ImplBase +{ + /** + * When concatenating multiple tokens, this string is inserted between them. + */ + static final String BIGRAM_SEPARATOR = " "; + /** + * Columns (i.e. tokens and counts) are separated by this character. + */ + static final String COLUMN_SEPARATOR = "\t"; + /** + * When hitting a column separator within a token, it is replaced by this token. + */ + static final String COLUMN_SEP_REPLACEMENT = " "; + + /** + * This string (a line) will separate unigrams from bigrams in the output file + **/ + static final String NGRAM_SEPARATOR_LINE = "----------------------------------------------------"; + static final String NEWLINE_REGEX = "\r\n?|\n"; + + /** + * The feature path. + */ + public static final String PARAM_FEATURE_PATH = "featurePath"; + @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") + private String featurePath; + + /** + * Set this parameter if bigrams should only be counted when occurring within a covering type, + * e.g. sentences. + */ + public static final String PARAM_COVERING_TYPE = "coveringType"; + @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = false) + private String coveringType; + + /** + * If true, all tokens are lowercased. + */ + public static final String PARAM_LOWERCASE = "lowercase"; + @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = true, defaultValue = "false") + private boolean lowercase; + + /** + * Tokens occurring fewer times than this value are omitted. + */ + public static final String PARAM_MIN_COUNT = "minCount"; + @ConfigurationParameter(name = PARAM_MIN_COUNT, mandatory = true, defaultValue = "5") + private int minCount; + + /** + * If true, sort output by count (descending order). + */ + public static final String PARAM_SORT_BY_COUNT = "sortByCount"; + @ConfigurationParameter(name = PARAM_SORT_BY_COUNT, mandatory = true, defaultValue = "false") + private boolean sortByCount; + + /** + * If true, sort output alphabetically. + */ + public static final String PARAM_SORT_BY_ALPHABET = "sortByAlphabet"; + @ConfigurationParameter(name = PARAM_SORT_BY_ALPHABET, mandatory = true, defaultValue = "false") + private boolean sortByAlphabet; + + /** + * Path of a file containing stopwords one work per line. + */ + public static final String PARAM_STOPWORDS_FILE = "stopwordsFile"; + @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = true, defaultValue = "") + private String stopwordsFile; + + /** + * Stopwords are replaced by this value. + */ + public static final String PARAM_STOPWORDS_REPLACEMENT = "stopwordsReplacement"; + @ConfigurationParameter(name = PARAM_STOPWORDS_REPLACEMENT, mandatory = true, defaultValue = "") + private String stopwordsReplacement; + + /** + * Regular expression of tokens to be filtered. + */ + public static final String PARAM_FILTER_REGEX = "filterRegex"; + @ConfigurationParameter(name = PARAM_FILTER_REGEX, mandatory = true, defaultValue = "") + private String filterRegex; + + /** + * Value with which tokens matching the regular expression are replaced. + */ + public static final String PARAM_REGEX_REPLACEMENT = "regexReplacement"; + @ConfigurationParameter(name = PARAM_REGEX_REPLACEMENT, mandatory = true, defaultValue = "") + private String regexReplacement; + + private Bag unigrams; + private Bag bigrams; + private StringSequenceGenerator sequenceGenerator; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + if (sortByAlphabet && sortByCount) { + throw new ResourceInitializationException(new IllegalArgumentException( + "Can only sort either by count or alphabetically.")); + } + + unigrams = new HashBag<>(); + bigrams = new HashBag<>(); + + /* init sequence generator */ + try { + sequenceGenerator = new PhraseSequenceGenerator.Builder() + .featurePath(featurePath) + .coveringType(coveringType) + .lowercase(lowercase) + .stopwordsFile(stopwordsFile) + .stopwordsReplacement(stopwordsReplacement) + .filterRegex(filterRegex) + .filterRegexReplacement(regexReplacement) + .buildStringSequenceGenerator(); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + try { + /* iterate over sequences (e.g. sentences)*/ + for (String[] sequence : sequenceGenerator.tokenSequences(aJCas)) { + /* iterate over tokens in sequence */ + for (int i = 0; i < sequence.length; i++) { + /* count unigrams */ + String unigram = sequence[i] + .replaceAll(COLUMN_SEPARATOR, COLUMN_SEP_REPLACEMENT) + .replaceAll(NEWLINE_REGEX, COLUMN_SEP_REPLACEMENT); + unigrams.add(unigram); + + /* count bigrams */ + if (i + 1 < sequence.length) { + String bigram = unigram + BIGRAM_SEPARATOR + sequence[i + 1] + .replaceAll(COLUMN_SEPARATOR, COLUMN_SEP_REPLACEMENT) + .replaceAll(NEWLINE_REGEX, COLUMN_SEP_REPLACEMENT); + bigrams.add(bigram); + } + } + } + } + catch (FeaturePathException e) { + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + getLogger().info("Vocabulary size: " + unigrams.uniqueSet().size()); + try { + getLogger().info("Writing frequencies to " + getTargetLocation()); + OutputStream os = CompressionUtils.getOutputStream(new File(getTargetLocation())); + + writeNgrams(os, unigrams); + os.write((NGRAM_SEPARATOR_LINE + "\n").getBytes()); + writeNgrams(os, bigrams); + os.close(); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + /** + * Write counter with counts from a bag to an output stream. + * + * @param os an {@link OutputStream} + * @param counter a {@link Bag} of string counter + */ + private void writeNgrams(OutputStream os, Bag counter) + { + /* create token stream */ + Stream stream = counter.uniqueSet().stream() + .filter(token -> counter.getCount(token) >= minCount); + + /* sort output */ + if (sortByAlphabet) { + stream = stream.sorted(String::compareTo); + } + else if (sortByCount) { + stream = stream.sorted((o1, o2) -> + -Integer.compare(counter.getCount(o1), counter.getCount(o2))); + } + + /* write tokens with counts */ + stream.forEach(token -> { + try { + os.write((token + COLUMN_SEPARATOR + counter.getCount(token) + "\n").getBytes()); + } + catch (IOException e) { + throw new RuntimeException(e); + } + }); + } +} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/PhraseAnnotator.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/phrasedetection/PhraseAnnotator.java similarity index 82% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/PhraseAnnotator.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/phrasedetection/PhraseAnnotator.java index 6a8b631d0a..15c9899123 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/PhraseAnnotator.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/phrasedetection/PhraseAnnotator.java @@ -15,22 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection; +package org.dkpro.core.frequency.phrasedetection; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.TOP; -import org.apache.uima.resource.ResourceInitializationException; +import static org.apache.uima.fit.util.JCasUtil.select; import java.io.BufferedReader; import java.io.FileInputStream; @@ -40,27 +27,41 @@ import java.util.List; import java.util.Map; -import static org.apache.uima.fit.util.JCasUtil.select; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.TOP; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CompressionUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase; +import eu.openminted.share.annotations.api.DocumentationResource; /** - * Annotate phrases in a sentence. Depending on the provided unigrams and the threshold, these + * Annotate phrases in a sentence. Depending on the provided n-grams and the threshold, these * comprise either one or two annotations (tokens, lemmas, ...). *

- * In order to identify longer phrases, run the {@link FrequencyCounter} and this annotator - * multiple times, each time taking the results of the previous run as input. From the second run on, set phrases - * in the feature path parameter {@link #PARAM_FEATURE_PATH}. + * In order to identify longer phrases, run the {@link FrequencyWriter} and this annotator multiple + * times, each time taking the results of the previous run as input. From the second run on, set + * phrases in the feature path parameter {@link #PARAM_FEATURE_PATH}. */ -@ResourceMetaData(name="Phrase Annotator") +@ResourceMetaData(name = "Phrase Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class PhraseAnnotator extends JCasAnnotator_ImplBase { /** - * The feature path to use for building bigrams. Default: tokens. + * The feature path to use for building bigrams. */ public static final String PARAM_FEATURE_PATH = "featurePath"; - @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = false) + @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") private String featurePath; - private static final String DEFAULT_FEATURE_PATH = Token.class.getCanonicalName(); /** * If true, lowercase everything. @@ -70,47 +71,59 @@ public class PhraseAnnotator private boolean lowercase; /** - * The file providing the unigram and bigram unigrams to use. + * The file providing the uni-grams and bi-grams to use. */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) private String modelLocation; /** - * The discount in order to prevent too many phrases consisting of very infrequent words to be formed. - * A typical value is the minimum count set during model creation ({@link FrequencyCounter#PARAM_MIN_COUNT}), - * which is by default set to 5. + * The discount in order to prevent too many phrases consisting of very infrequent words to be + * formed. A typical value is the minimum count set during model creation + * ({@link FrequencyWriter#PARAM_MIN_COUNT}), which is by default set to 5. */ public static final String PARAM_DISCOUNT = "discount"; @ConfigurationParameter(name = PARAM_DISCOUNT, mandatory = true, defaultValue = "5") private int discount; /** - * The threshold score for phrase construction. Default is 100. Lower values result in fewer phrases. - * The value strongly depends on the size of the corpus and the token unigrams. + * The threshold score for phrase construction. Default is 100. Lower values result in fewer + * phrases. The value strongly depends on the size of the corpus and the token unigrams. */ public static final String PARAM_THRESHOLD = "threshold"; @ConfigurationParameter(name = PARAM_THRESHOLD, mandatory = true, defaultValue = "100") private float threshold; + /** + * Path of a file containing stopwords one work per line. + */ public static final String PARAM_STOPWORDS_FILE = "stopwordsFile"; @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = true, defaultValue = "") private String stopwordsFile; + /** + * Stopwords are replaced by this value. + */ public static final String PARAM_STOPWORDS_REPLACEMENT = "stopwordsReplacement"; @ConfigurationParameter(name = PARAM_STOPWORDS_REPLACEMENT, mandatory = true, defaultValue = "") private String stopwordsReplacement; + /** + * Regular expression of tokens to be filtered. + */ public static final String PARAM_FILTER_REGEX = "filterRegex"; @ConfigurationParameter(name = PARAM_FILTER_REGEX, mandatory = true, defaultValue = "") private String filterRegex; + /** + * Value with which tokens matching the regular expression are replaced. + */ public static final String PARAM_REGEX_REPLACEMENT = "regexReplacement"; @ConfigurationParameter(name = PARAM_REGEX_REPLACEMENT, mandatory = true, defaultValue = "") private String regexReplacement; - /** - * Set this parameter if bigrams should only be counted when occurring within a covering type, e.g. sentences. + * Set this parameter if bigrams should only be counted when occurring within a covering type, + * e.g. sentences. */ public static final String PARAM_COVERING_TYPE = "coveringType"; @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = false) @@ -128,10 +141,6 @@ public void initialize(UimaContext context) { super.initialize(context); - /* set feature path to default */ - if (featurePath == null) { - featurePath = DEFAULT_FEATURE_PATH; - } try { sequenceGenerator = new PhraseSequenceGenerator.Builder() .featurePath(featurePath) @@ -182,7 +191,7 @@ public void process(JCas aJCas) /* do not look for bigram on last token */ LexicalPhrase phrase2 = sequence[i + 1]; String token2 = phrase2.getText(); - String bigram = token1 + FrequencyCounter.BIGRAM_SEPARATOR + token2; + String bigram = token1 + FrequencyWriter.BIGRAM_SEPARATOR + token2; if (bigrams.containsKey(bigram)) { assert unigrams.containsKey(token1); @@ -227,7 +236,7 @@ private void readCounts() String line; while ((line = reader.readLine()) != null) { - if (line.equals(FrequencyCounter.NGRAM_SEPARATOR_LINE)) { + if (line.equals(FrequencyWriter.NGRAM_SEPARATOR_LINE)) { /* this should only happen once per file */ if (!countingUnigrams) { throw new IllegalStateException( @@ -236,7 +245,7 @@ private void readCounts() countingUnigrams = false; } else { - String[] columns = line.split(FrequencyCounter.COLUMN_SEPARATOR); + String[] columns = line.split(FrequencyWriter.COLUMN_SEPARATOR); if (columns.length != 2) { throw new IllegalStateException("Invalid line in input file:\n" + line); } diff --git a/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProvider.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProvider.java new file mode 100644 index 0000000000..e7ea4d2976 --- /dev/null +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProvider.java @@ -0,0 +1,68 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.frequency.resources; + +import java.util.Map; + +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.dkpro.core.api.frequency.FrequencyCountResourceBase; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.frequency.BerkeleyLmProvider; + +/** + * External resource wrapper for the Berkeley LM frequency count provider. + */ +public final class BerkeleyLmFrequencyCountProvider + extends FrequencyCountResourceBase + implements FrequencyCountProvider +{ + + public static final String PARAM_BINARY = "BinaryFile"; + @ConfigurationParameter(name = PARAM_BINARY, mandatory = true) + protected String file; + + public static final String PARAM_PROVIDER_LANGUAGE = "ProviderLanguage"; + @ConfigurationParameter(name = PARAM_PROVIDER_LANGUAGE, mandatory = true) + protected String language; + + @Override + public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } + + try { + initializeProvider(); + } + catch (Exception e) { + throw new ResourceInitializationException(e); + } + + return true; + } + + @Override + protected void initializeProvider() throws Exception + { + provider = new BerkeleyLmProvider(file, language); + } +} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1TFrequencyCountResource.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/Web1TFrequencyCountResource.java similarity index 79% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1TFrequencyCountResource.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/Web1TFrequencyCountResource.java index d085c34256..e17a71d736 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1TFrequencyCountResource.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/Web1TFrequencyCountResource.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.resources; +package org.dkpro.core.frequency.resources; import java.io.File; import java.io.IOException; @@ -25,24 +25,20 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.FrequencyCountResourceBase; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.frequency.Web1TFileAccessProvider; +import org.dkpro.core.api.frequency.FrequencyCountResourceBase; +import org.dkpro.core.api.frequency.provider.FrequencyCountProviderBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.frequency.Web1TFileAccessProvider; /** * External resource wrapper for the Web1T frequency count provider. - * - * */ public final class Web1TFrequencyCountResource extends FrequencyCountResourceBase { - // Attention! Can only have String parameters in external resources. public static final String PARAM_MIN_NGRAM_LEVEL = "MinLevel"; @@ -74,12 +70,12 @@ public final class Web1TFrequencyCountResource private CasConfigurableProviderBase web1TFolderProvider; @Override - public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) - throws ResourceInitializationException - { - if (!super.initialize(aSpecifier, aAdditionalParams)) { - return false; - } + public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) + throws ResourceInitializationException + { + if (!super.initialize(aSpecifier, aAdditionalParams)) { + return false; + } web1TFolderProvider = new ModelProviderBase() { @@ -105,14 +101,16 @@ protected File produceResource(URL aUrl) throws IOException } }; - return true; - } + return true; + } - protected void initializeProvider() throws ResourceInitializationException{ - try{ + @Override + protected void initializeProvider() throws ResourceInitializationException + { + try { web1TFolderProvider.configure(); provider = new Web1TFileAccessProvider( - language, + language, web1TFolderProvider.getResource(), new Integer(minLevel), new Integer(maxLevel) @@ -121,7 +119,7 @@ protected void initializeProvider() throws ResourceInitializationException{ catch (IOException e) { throw new ResourceInitializationException(e); } - ((FrequencyCountProviderBase) provider).setScaleDownFactor(Integer.parseInt(this.scaleDownFactor)); + ((FrequencyCountProviderBase) provider) + .setScaleDownFactor(Integer.parseInt(this.scaleDownFactor)); } - } diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1TInMemoryFrequencyCountResource.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/Web1TInMemoryFrequencyCountResource.java similarity index 87% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1TInMemoryFrequencyCountResource.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/Web1TInMemoryFrequencyCountResource.java index 28189d40f4..652d660dff 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1TInMemoryFrequencyCountResource.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/resources/Web1TInMemoryFrequencyCountResource.java @@ -15,18 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.resources; +package org.dkpro.core.frequency.resources; import java.util.Map; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.FrequencyCountResourceBase; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.frequency.Web1TInMemoryProvider; +import org.dkpro.core.api.frequency.FrequencyCountResourceBase; +import org.dkpro.core.api.frequency.provider.FrequencyCountProviderBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.frequency.Web1TInMemoryProvider; /** * External resource wrapper for the Web1T in memory frequency count provider. diff --git a/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/TfIdfAnnotator.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/TfIdfAnnotator.java new file mode 100644 index 0000000000..2d39365784 --- /dev/null +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/TfIdfAnnotator.java @@ -0,0 +1,231 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.frequency.tfidf; + +import java.util.Locale; +import java.util.Map.Entry; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Level; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.featurepath.FeaturePathFactory; +import org.dkpro.core.frequency.tfidf.model.DfModel; +import org.dkpro.core.frequency.tfidf.model.DfStore; +import org.dkpro.core.frequency.tfidf.model.SharedDfModel; +import org.dkpro.core.frequency.tfidf.util.FreqDist; +import org.dkpro.core.frequency.tfidf.util.TermIterator; +import org.dkpro.core.frequency.tfidf.util.TfidfUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * This component adds {@link Tfidf} annotations consisting of a term and a tfidf weight.
+ * The annotator is type agnostic concerning the input annotation, so you have to specify the + * annotation type and string representation. It uses a pre-serialized {@link DfStore}, which can be + * created using the {@link TfIdfWriter}. + */ +@ResourceMetaData(name = "TF/IDF Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf" }) +public class TfIdfAnnotator + extends JCasAnnotator_ImplBase +{ + + /** + * This annotator is type agnostic, so it is mandatory to specify the type of the working + * annotation and how to obtain the string representation with the feature path. + */ + public static final String PARAM_FEATURE_PATH = "featurePath"; + @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true) + protected String featurePath; + + /** + * Provide the path to the Df-Model. When a shared {@link SharedDfModel} is bound to this + * annotator, this is ignored. + */ + public static final String PARAM_TFDF_PATH = "tfdfPath"; + @ConfigurationParameter(name = PARAM_TFDF_PATH, mandatory = false) + private String tfdfPath; + + /** + * If set to true, the whole text is handled in lower case. + */ + public static final String PARAM_LOWERCASE = "lowercase"; + @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = false, defaultValue = "false") + protected boolean lowercase; + + /** + * The model for term frequency weighting.
+ * Invoke toString() on an enum of {@link WeightingModeTf} for setup. + *

+ * Default value is "NORMAL" yielding an unweighted tf. + */ + public static final String PARAM_TF_MODE = "weightingModeTf"; + @ConfigurationParameter(name = PARAM_TF_MODE, mandatory = false, defaultValue = "NORMAL") + private WeightingModeTf weightingModeTf; + + /** + * The model for inverse document frequency weighting.
+ * Invoke toString() on an enum of {@link WeightingModeIdf} for setup. + *

+ * Default value is "NORMAL" yielding an unweighted idf. + */ + public static final String PARAM_IDF_MODE = "weightingModeIdf"; + @ConfigurationParameter(name = PARAM_IDF_MODE, mandatory = false, defaultValue = "NORMAL") + private WeightingModeIdf weightingModeIdf; + + /** + * Available modes for term frequency + */ + public enum WeightingModeTf + { + BINARY, NORMAL, LOG, LOG_PLUS_ONE + } + + /** + * Available modes for inverse document frequency + */ + public enum WeightingModeIdf + { + BINARY, CONSTANT_ONE, NORMAL, LOG + } + + private DfModel dfModel; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + try { + dfModel = TfidfUtils.getDfModel(tfdfPath); + } + catch (Exception e) { + throw new ResourceInitializationException(e); + } + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + + FreqDist termFrequencies = getTermFrequencies(jcas); + + try { + for (Entry entry : FeaturePathFactory.select(jcas.getCas(), + featurePath)) { + String term = entry.getValue(); + if (lowercase) { + term = term.toLowerCase(); + } + + int tf = termFrequencies.getCount(term); + int df = dfModel.getDf(term); + if (df == 0) { + getContext().getLogger().log(Level.WARNING, + "Term [" + term + "] not found in dfStore!"); + } + + double tfidf = getWeightedTf(tf) * getWeightedIdf(df, dfModel.getDocumentCount()); + + logTfidf(term, tf, df, tfidf); + + Tfidf tfidfAnnotation = new Tfidf(jcas); + tfidfAnnotation.setTerm(term); + tfidfAnnotation.setTfidfValue(tfidf); + tfidfAnnotation.setBegin(entry.getKey().getBegin()); + tfidfAnnotation.setEnd(entry.getKey().getEnd()); + tfidfAnnotation.addToIndexes(); + } + } + catch (FeaturePathException e) { + throw new AnalysisEngineProcessException(e); + } + } + + protected FreqDist getTermFrequencies(JCas jcas) + throws AnalysisEngineProcessException + { + // count all terms with the given annotation + FreqDist termFrequencies = new FreqDist(); + for (String term : TermIterator.create(jcas, featurePath, lowercase)) { + termFrequencies.count(term); + } + return termFrequencies; + } + + /** + * Calculates a weighted tf according to given settings. + */ + private double getWeightedTf(int tf) + { + switch (weightingModeTf) { + case NORMAL: + return tf; + case LOG: + return tf > 0 ? Math.log(tf) : 0D; + case LOG_PLUS_ONE: + return tf > 0 ? Math.log(tf + 1) : 0D; + case BINARY: + return tf > 0 ? 1D : 0D; + default: + throw new IllegalStateException(); + } + } + + /** + * Calculates a weighted idf according to given settings. + */ + private double getWeightedIdf(int df, int n) + { + switch (weightingModeIdf) { + case NORMAL: + return (double) n / df; + case LOG: + return df > 0 ? Math.log((double) n / df) : 0D; + case CONSTANT_ONE: + return 1D; + case BINARY: + return df > 0 ? 1D : 0D; + default: + throw new IllegalStateException(); + } + } + + private void logTfidf(String term, int tf, int df, double tfidf) + { + if (getContext().getLogger().isLoggable(Level.FINEST)) { + getContext().getLogger().log( + Level.FINEST, + String.format(Locale.US, "\"%s\" (tf: %d, df: %d, tfidf: %.2f)", term, tf, df, + tfidf)); + } + + } + +} diff --git a/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/TfIdfWriter.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/TfIdfWriter.java new file mode 100644 index 0000000000..2f27c669ac --- /dev/null +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/TfIdfWriter.java @@ -0,0 +1,109 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.frequency.tfidf; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.frequency.tfidf.model.DfModel; +import org.dkpro.core.frequency.tfidf.model.DfStore; +import org.dkpro.core.frequency.tfidf.util.TermIterator; +import org.dkpro.core.frequency.tfidf.util.TfidfUtils; + +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; + +/** + * This consumer builds a {@link DfModel}. It collects the df (document frequency) counts for the + * processed collection. The counts are serialized as a {@link DfModel}-object. + */ +@ResourceMetaData(name = "TF/IDF Model Writer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@Parameters( + exclude = { + TfIdfWriter.PARAM_TARGET_LOCATION }) +public class TfIdfWriter + extends JCasAnnotator_ImplBase +{ + @Deprecated + public static final String PARAM_OUTPUT_PATH = ComponentParameters.PARAM_TARGET_LOCATION; + /** + * Specifies the path and filename where the model file is written. + */ + public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; + @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) + private String outputPath; + + /** + * If set to true, the whole text is handled in lower case. + */ + public static final String PARAM_LOWERCASE = "lowercase"; + @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = true, defaultValue = "false") + private boolean lowercase; + + /** + * This annotator is type agnostic, so it is mandatory to specify the type of the working + * annotation and how to obtain the string representation with the feature path. + */ + public static final String PARAM_FEATURE_PATH = "featurePath"; + @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true) + private String featurePath; + + private DfStore dfStore; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + dfStore = new DfStore(featurePath, lowercase); + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + dfStore.registerNewDocument(); + + for (String term : TermIterator.create(jcas, featurePath, lowercase)) { + dfStore.countTerm(term); + } + + dfStore.closeCurrentDocument(); + } + + /** + * When this method is called by the framework, the dfModel is serialized. + */ + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + try { + TfidfUtils.writeDfModel(dfStore, outputPath); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/DfModel.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/DfModel.java similarity index 95% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/DfModel.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/DfModel.java index 929491278d..8af0595e5c 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/DfModel.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/DfModel.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model; +package org.dkpro.core.frequency.tfidf.model; import java.io.Serializable; diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/DfStore.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/DfStore.java similarity index 86% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/DfStore.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/DfStore.java index 297bf2a52a..23aafb75e9 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/DfStore.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/DfStore.java @@ -15,20 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model; +package org.dkpro.core.frequency.tfidf.model; import java.io.Serializable; import java.util.Set; import java.util.TreeSet; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.TfidfAnnotator; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.TfidfConsumer; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.FreqDist; +import org.dkpro.core.frequency.tfidf.TfIdfAnnotator; +import org.dkpro.core.frequency.tfidf.TfIdfWriter; +import org.dkpro.core.frequency.tfidf.util.FreqDist; /** * Container that stores the document frequency and additional data in a collection of documents. To - * be filled and serialized by {@link TfidfConsumer} and deserialized and used by - * {@link TfidfAnnotator}. + * be filled and serialized by {@link TfIdfWriter} and deserialized and used by + * {@link TfIdfAnnotator}. * */ public class DfStore diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/SharedDfModel.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/SharedDfModel.java similarity index 94% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/SharedDfModel.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/SharedDfModel.java index f29a1615ff..6c32157f07 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/model/SharedDfModel.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/model/SharedDfModel.java @@ -15,9 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model; +package org.dkpro.core.frequency.tfidf.model; -import static org.apache.uima.fit.factory.ExternalResourceFactory.bindResource; +import static org.apache.uima.fit.factory.ExternalResourceFactory.bindResourceUsingUrl; import java.io.File; import java.io.ObjectInputStream; @@ -62,7 +62,7 @@ public static void bindTo(AnalysisEngineDescription aaed, String pathToResource, Object... params) throws InvalidXMLException, MalformedURLException { - bindResource(aaed, DfModel.class.getName(), SharedDfModel.class, new File( + bindResourceUsingUrl(aaed, DfModel.class.getName(), SharedDfModel.class, new File( pathToResource).toURI().toURL().toString(), params); } diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/FreqDist.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/FreqDist.java similarity index 94% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/FreqDist.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/FreqDist.java index 64283e5bf3..b1aec457cc 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/FreqDist.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/FreqDist.java @@ -15,23 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util; +package org.dkpro.core.frequency.tfidf.util; import java.io.Serializable; -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; import java.util.Map.Entry; /** * An object that counts objects. Inspired by NLTKs FreqDist-class. * - * * @param * The type of element which is counted. */ public class FreqDist implements Serializable { - private static final long serialVersionUID = 9155968779719980277L; private Map counts; @@ -82,10 +82,12 @@ public Map getProbabilities() */ public int getCount(T element) { - if (counts.containsKey(element)) + if (counts.containsKey(element)) { return counts.get(element); - else + } + else { return 0; + } } /** diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/TermIterator.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/TermIterator.java similarity index 92% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/TermIterator.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/TermIterator.java index dbb6552d34..671ee89e5a 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/TermIterator.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/TermIterator.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util; +package org.dkpro.core.frequency.tfidf.util; import java.util.Iterator; import java.util.Map.Entry; @@ -23,9 +23,8 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.featurepath.FeaturePathFactory; /** * Iterator over terms (Strings) in the JCas. diff --git a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/TfidfUtils.java b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/TfidfUtils.java similarity index 86% rename from dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/TfidfUtils.java rename to dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/TfidfUtils.java index 1ac17c7398..fc747e0be4 100644 --- a/dkpro-core-frequency-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/util/TfidfUtils.java +++ b/dkpro-core-frequency-asl/src/main/java/org/dkpro/core/frequency/tfidf/util/TfidfUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util; +package org.dkpro.core.frequency.tfidf.util; import java.io.BufferedOutputStream; import java.io.File; @@ -26,14 +26,11 @@ import java.io.ObjectOutputStream; import org.apache.commons.io.FileUtils; - -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfModel; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfStore; +import org.dkpro.core.frequency.tfidf.model.DfModel; +import org.dkpro.core.frequency.tfidf.model.DfStore; /** * Serialization and deserialization methods. - * - * */ public class TfidfUtils { @@ -72,12 +69,15 @@ public static void serialize(Object object, String fileName) throws IOException { File file = new File(fileName); - if (!file.exists()) + if (!file.exists()) { FileUtils.touch(file); + } if (file.isDirectory()) { throw new IOException("A directory with that name exists!"); } - try (ObjectOutputStream objOut = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(file)))) { + + try (ObjectOutputStream objOut = new ObjectOutputStream( + new BufferedOutputStream(new FileOutputStream(file)))) { objOut.writeObject(object); objOut.flush(); objOut.close(); @@ -88,7 +88,8 @@ public static void serialize(Object object, String fileName) public static T deserialize(String filePath) throws IOException { - try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File(filePath)))) { + try (ObjectInputStream in = new ObjectInputStream( + new FileInputStream(new File(filePath)))) { return (T) in.readObject(); } catch (ClassNotFoundException e) { diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TFrequencyProviderTest.java b/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TFrequencyProviderTest.java deleted file mode 100644 index 1ef21fae44..0000000000 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TFrequencyProviderTest.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.frequency; - -import static org.junit.Assert.assertEquals; - -import java.io.File; -import java.util.Iterator; - -import org.junit.Test; - -public class Web1TFrequencyProviderTest -{ - - @Test - public void web1tTest_indexFiles() throws Exception - { - Web1TProviderBase web1t = new Web1TFileAccessProvider( - "en", - "src/test/resources/web1t/index-1gms", - "src/test/resources/web1t/index-2gms" - ); - - test(web1t); - } - - @Test - public void web1tTest_path() throws Exception - { - Web1TProviderBase web1t = new Web1TFileAccessProvider( - "en", - new File("src/test/resources/web1t/"), - 1, - 2 - ); - - test(web1t); - } - - @Test - public void web1tNgramIteratorTest() throws Exception - { - Web1TProviderBase web1t = new Web1TFileAccessProvider( - "en", - new File("src/test/resources/web1t/"), - 1, - 2 - ); - - int i=0; - Iterator ngramIterator = web1t.getNgramIterator(1); - while (ngramIterator.hasNext()) { - ngramIterator.next(); - i++; - } - assertEquals(i, 11); - } - - private void test(Web1TProviderBase web1t) throws Exception { - assertEquals(2147436244l, web1t.getFrequency("!")); - assertEquals(528, web1t.getFrequency("Nilmeier")); - assertEquals(106, web1t.getFrequency("influx takes")); - assertEquals(69, web1t.getFrequency("frist will")); - - assertEquals(13893397919l, web1t.getNrOfNgrams(1)); - assertEquals(6042, web1t.getNrOfNgrams(2)); - assertEquals(11, web1t.getNrOfDistinctNgrams(1)); - assertEquals(21, web1t.getNrOfDistinctNgrams(2)); - - } -} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfAnnotatorTest.java b/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfAnnotatorTest.java deleted file mode 100644 index 303fda14ed..0000000000 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfAnnotatorTest.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf; - -import static de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.INCLUDE_PREFIX; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - -import java.io.File; -import java.util.HashMap; -import java.util.Map; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.TfidfAnnotator.WeightingModeIdf; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.TfidfAnnotator.WeightingModeTf; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -/** - * - * - */ -public class TfidfAnnotatorTest -{ - // assertEquals on doubles needs an epsilon - protected static final double EPSILON = 0.000001; - - private final static String CONSUMER_TEST_DATA_PATH = "src/test/resources/consumer/"; - - @Rule - public TemporaryFolder folder = new TemporaryFolder(); - - protected File model; - - @Before - public void buildModel() - throws Exception - { - model = folder.newFile(); - - // write the model - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH, - TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); - - AnalysisEngineDescription aggregate = createEngineDescription( - createEngineDescription(BreakIteratorSegmenter.class), - createEngineDescription(TfidfConsumer.class, - TfidfConsumer.PARAM_FEATURE_PATH, Token.class, - TfidfConsumer.PARAM_TARGET_LOCATION, model)); - - SimplePipeline.runPipeline(reader, aggregate); - } - - @Test - public void tfidfTest_normal_constantOne() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH, - TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); - - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription tfidfAnnotator = createEngineDescription(TfidfAnnotator.class, - TfidfAnnotator.PARAM_FEATURE_PATH, Token.class, - TfidfAnnotator.PARAM_TFDF_PATH, model, - TfidfAnnotator.PARAM_TF_MODE, WeightingModeTf.NORMAL, - TfidfAnnotator.PARAM_IDF_MODE, WeightingModeIdf.CONSTANT_ONE); - - Map expectedDoc1 = new HashMap(); - expectedDoc1.put("example", 1.0); - expectedDoc1.put("sentence", 1.0); - expectedDoc1.put("funny", 1.0); - - Map expectedDoc2 = new HashMap(); - expectedDoc2.put("example", 2.0); - expectedDoc2.put("sentence", 1.0); - - for (JCas jcas : new JCasIterable(reader, segmenter, tfidfAnnotator)) { - testIt(jcas, expectedDoc1, expectedDoc2); - } - } - - @Test - public void tfidfTest_binary_binary() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH, - TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); - - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription tfidfAnnotator = createEngineDescription(TfidfAnnotator.class, - TfidfAnnotator.PARAM_FEATURE_PATH, Token.class, - TfidfAnnotator.PARAM_TFDF_PATH, model, - TfidfAnnotator.PARAM_TF_MODE, WeightingModeTf.BINARY, - TfidfAnnotator.PARAM_IDF_MODE, WeightingModeIdf.BINARY); - - Map expectedDoc1 = new HashMap(); - expectedDoc1.put("example", 1.0); - expectedDoc1.put("sentence", 1.0); - expectedDoc1.put("funny", 1.0); - - Map expectedDoc2 = new HashMap(); - expectedDoc2.put("example", 1.0); - expectedDoc2.put("sentence", 1.0); - - for (JCas jcas : new JCasIterable(reader, segmenter, tfidfAnnotator)) { - testIt(jcas, expectedDoc1, expectedDoc2); - } - } - - @Test - public void tfidfTest_normal_log() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH, - TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); - - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription tfidfAnnotator = createEngineDescription(TfidfAnnotator.class, - TfidfAnnotator.PARAM_FEATURE_PATH, Token.class, - TfidfAnnotator.PARAM_TFDF_PATH, model, - TfidfAnnotator.PARAM_TF_MODE, WeightingModeTf.NORMAL, - TfidfAnnotator.PARAM_IDF_MODE, WeightingModeIdf.LOG); - - Map expectedDoc1 = new HashMap(); - expectedDoc1.put("example", 0.0); - expectedDoc1.put("sentence", 0.0); - expectedDoc1.put("funny", Math.log(2)); - - Map expectedDoc2 = new HashMap(); - expectedDoc2.put("example", 0.0); - expectedDoc2.put("sentence", 0.0); - - for (JCas jcas : new JCasIterable(reader, segmenter, tfidfAnnotator)) { - testIt(jcas, expectedDoc1, expectedDoc2); - } - } - - private void testIt(JCas jcas, Map expectedDoc1, - Map expectedDoc2) - { - if (DocumentMetaData.get(jcas).getDocumentTitle().equals("test1.txt")) { - int i = 0; - for (Tfidf tfidf : select(jcas, Tfidf.class)) { - assertEquals(tfidf.getTerm(), expectedDoc1.get(tfidf.getTerm()).doubleValue(), - tfidf.getTfidfValue(), EPSILON); - i++; - } - assertEquals(3, i); - } - else if (DocumentMetaData.get(jcas).getDocumentTitle().equals("test2.txt")) { - int i = 0; - for (Tfidf tfidf : select(jcas, Tfidf.class)) { - assertEquals(tfidf.getTerm(), expectedDoc2.get(tfidf.getTerm()).doubleValue(), - tfidf.getTfidfValue(), EPSILON); - i++; - } - assertEquals(3, i); - } - else { - fail("There should be no other documents in that directory."); - } - } - - @Rule - public TestName name = new TestName(); - - @Before - public void printSeparator() - { - System.out.println("\n=== " + name.getMethodName() + " ====================="); - } -} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfConsumerTest.java b/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfConsumerTest.java deleted file mode 100644 index cc0f125eca..0000000000 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/tfidf/TfidfConsumerTest.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.frequency.tfidf; - -import static de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.INCLUDE_PREFIX; -import static de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION; -import static de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.PARAM_PATTERNS; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.model.DfModel; -import de.tudarmstadt.ukp.dkpro.core.frequency.tfidf.util.TfidfUtils; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -/** - * Unit test of {@link TfidfConsumer} and {@link TfidfAnnotator}. - * - * - */ -public class TfidfConsumerTest -{ - @Rule - public TemporaryFolder folder = new TemporaryFolder(); - - @Test - public void rawScoresTest() - throws Exception - { - File target = folder.newFile(name.getMethodName()); - - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - PARAM_SOURCE_LOCATION, "src/test/resources/consumer/", - PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); - - AnalysisEngineDescription aggregate = createEngineDescription( - createEngineDescription(BreakIteratorSegmenter.class), - createEngineDescription(TfidfConsumer.class, - TfidfConsumer.PARAM_FEATURE_PATH, Token.class.getName(), - TfidfConsumer.PARAM_TARGET_LOCATION, target)); - - // now create the tf and df files - SimplePipeline.runPipeline(reader, aggregate); - - // check whether they were really created and contain the correct values - DfModel dfModel = TfidfUtils.getDfModel(target.getPath()); - - assertEquals(2, dfModel.getDf("example")); - assertEquals(2, dfModel.getDf("sentence")); - assertEquals(1, dfModel.getDf("funny")); - } - - @Rule - public TestName name = new TestName(); - - @Before - public void printSeparator() - { - System.out.println("\n=== " + name.getMethodName() + " ====================="); - } -} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/BerkeleyLmProviderTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/BerkeleyLmProviderTest.java similarity index 96% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/BerkeleyLmProviderTest.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/BerkeleyLmProviderTest.java index 464d4ccb72..dcaf9a6098 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/BerkeleyLmProviderTest.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/BerkeleyLmProviderTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency; +package org.dkpro.core.frequency; import static org.junit.Assert.assertEquals; @@ -37,4 +37,4 @@ public void berkeleyLMTest() throws Exception { assertEquals(0.011155508, lm.getProbability("is"), 0.0000001); assertEquals(-4.49582195, lm.getLogProbability("is"), 0.0000001); } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/Web1TFrequencyProviderTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/Web1TFrequencyProviderTest.java new file mode 100644 index 0000000000..ba7829443c --- /dev/null +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/Web1TFrequencyProviderTest.java @@ -0,0 +1,75 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.frequency; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.util.Iterator; + +import org.junit.Test; + +public class Web1TFrequencyProviderTest +{ + + @Test + public void web1tTest_indexFiles() throws Exception + { + Web1TProviderBase web1t = new Web1TFileAccessProvider("en", + "src/test/resources/web1t/index-1gms", "src/test/resources/web1t/index-2gms"); + + test(web1t); + } + + @Test + public void web1tTest_path() throws Exception + { + Web1TProviderBase web1t = new Web1TFileAccessProvider("en", + new File("src/test/resources/web1t/"), 1, 2); + + test(web1t); + } + + @Test + public void web1tNgramIteratorTest() throws Exception + { + Web1TProviderBase web1t = new Web1TFileAccessProvider("en", + new File("src/test/resources/web1t/"), 1, 2); + + int i = 0; + Iterator ngramIterator = web1t.getNgramIterator(1); + while (ngramIterator.hasNext()) { + ngramIterator.next(); + i++; + } + assertEquals(i, 11); + } + + private void test(Web1TProviderBase web1t) throws Exception { + assertEquals(2147436244l, web1t.getFrequency("!")); + assertEquals(528, web1t.getFrequency("Nilmeier")); + assertEquals(106, web1t.getFrequency("influx takes")); + assertEquals(69, web1t.getFrequency("frist will")); + + assertEquals(13893397919l, web1t.getNrOfNgrams(1)); + assertEquals(6042, web1t.getNrOfNgrams(2)); + assertEquals(11, web1t.getNrOfDistinctNgrams(1)); + assertEquals(21, web1t.getNrOfDistinctNgrams(2)); + + } +} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TInMemoryFrequencyProviderTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/Web1TInMemoryFrequencyProviderTest.java similarity index 87% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TInMemoryFrequencyProviderTest.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/Web1TInMemoryFrequencyProviderTest.java index 2772bd1e59..6b8cdf848d 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/Web1TInMemoryFrequencyProviderTest.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/Web1TInMemoryFrequencyProviderTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency; +package org.dkpro.core.frequency; import static org.junit.Assert.assertEquals; @@ -23,15 +23,10 @@ public class Web1TInMemoryFrequencyProviderTest { - @Test public void web1tTestInMemoryTest() throws Exception { - Web1TProviderBase web1t = new Web1TInMemoryProvider( - "en", - "src/test/resources/web1t/", - 2 - ); + Web1TProviderBase web1t = new Web1TInMemoryProvider("en", "src/test/resources/web1t/", 2); assertEquals(2147436244l, web1t.getFrequency("!")); assertEquals(528, web1t.getFrequency("Nilmeier")); diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/berkeleylm/CreateBerkelelyLmTestBinary.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/berkeleylm/CreateBerkelelyLmTestBinary.java similarity index 95% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/berkeleylm/CreateBerkelelyLmTestBinary.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/berkeleylm/CreateBerkelelyLmTestBinary.java index 7a54208b04..576111b14a 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/berkeleylm/CreateBerkelelyLmTestBinary.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/berkeleylm/CreateBerkelelyLmTestBinary.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.berkeleylm; +package org.dkpro.core.frequency.berkeleylm; import java.io.IOException; @@ -25,8 +25,6 @@ public class CreateBerkelelyLmTestBinary { - - public static void main(String[] args) throws IOException { run("src/test/resources/googledir/", "target/test.ser"); @@ -42,4 +40,4 @@ private static void run(String path, String outFile) { Logger.endTrack(); } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/berkeleylm/CreateBerkeleyLmGoogleBinary.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/berkeleylm/CreateBerkeleyLmGoogleBinary.java similarity index 92% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/berkeleylm/CreateBerkeleyLmGoogleBinary.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/berkeleylm/CreateBerkeleyLmGoogleBinary.java index a8ddfca7b9..2bf634d115 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/berkeleylm/CreateBerkeleyLmGoogleBinary.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/berkeleylm/CreateBerkeleyLmGoogleBinary.java @@ -15,19 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.berkeleylm; +package org.dkpro.core.frequency.berkeleylm; import java.io.IOException; -import de.tudarmstadt.ukp.dkpro.core.api.resources.DkproContext; +import org.dkpro.core.api.resources.DkproContext; + import edu.berkeley.nlp.lm.NgramLanguageModel; import edu.berkeley.nlp.lm.io.LmReaders; import edu.berkeley.nlp.lm.util.Logger; public class CreateBerkeleyLmGoogleBinary { - - public static void main(String[] args) throws IOException { String path = DkproContext.getContext().getWorkspace("berkeley_lm").getAbsolutePath(); @@ -45,4 +44,4 @@ private static void run(String path, String outFile) { Logger.endTrack(); } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/FrequencyCounterTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/phrasedetection/FrequencyCounterTest.java similarity index 79% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/FrequencyCounterTest.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/phrasedetection/FrequencyCounterTest.java index 7dff559c83..a73ac70d67 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/FrequencyCounterTest.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/phrasedetection/FrequencyCounterTest.java @@ -15,28 +15,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection; +package org.dkpro.core.frequency.phrasedetection; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.text.StringReader; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Rule; import org.junit.Test; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertTrue; - public class FrequencyCounterTest { @Rule @@ -58,9 +58,9 @@ public void testCount() StringReader.PARAM_DOCUMENT_TEXT, sentence, StringReader.PARAM_LANGUAGE, language); AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription writer = createEngineDescription(FrequencyCounter.class, - FrequencyCounter.PARAM_TARGET_LOCATION, targetFile, - FrequencyCounter.PARAM_MIN_COUNT, minCount); + AnalysisEngineDescription writer = createEngineDescription(FrequencyWriter.class, + FrequencyWriter.PARAM_TARGET_LOCATION, targetFile, + FrequencyWriter.PARAM_MIN_COUNT, minCount); writer.doFullValidation(); SimplePipeline.runPipeline(reader, segmenter, writer); @@ -88,10 +88,10 @@ public void testCountSortedAlphabetically() StringReader.PARAM_DOCUMENT_TEXT, sentence, StringReader.PARAM_LANGUAGE, language); AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription writer = createEngineDescription(FrequencyCounter.class, - FrequencyCounter.PARAM_TARGET_LOCATION, targetFile, - FrequencyCounter.PARAM_MIN_COUNT, minCount, - FrequencyCounter.PARAM_SORT_BY_ALPHABET, true); + AnalysisEngineDescription writer = createEngineDescription(FrequencyWriter.class, + FrequencyWriter.PARAM_TARGET_LOCATION, targetFile, + FrequencyWriter.PARAM_MIN_COUNT, minCount, + FrequencyWriter.PARAM_SORT_BY_ALPHABET, true); SimplePipeline.runPipeline(reader, segmenter, writer); @@ -115,10 +115,10 @@ public void testCountSortedByValue() StringReader.PARAM_DOCUMENT_TEXT, sentence, StringReader.PARAM_LANGUAGE, language); AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription writer = createEngineDescription(FrequencyCounter.class, - FrequencyCounter.PARAM_TARGET_LOCATION, targetFile, - FrequencyCounter.PARAM_MIN_COUNT, minCount, - FrequencyCounter.PARAM_SORT_BY_COUNT, true); + AnalysisEngineDescription writer = createEngineDescription(FrequencyWriter.class, + FrequencyWriter.PARAM_TARGET_LOCATION, targetFile, + FrequencyWriter.PARAM_MIN_COUNT, minCount, + FrequencyWriter.PARAM_SORT_BY_COUNT, true); SimplePipeline.runPipeline(reader, segmenter, writer); @@ -126,9 +126,9 @@ public void testCountSortedByValue() /* check unigram sorting */ double[] unigrams = Files.lines(targetFile.toPath()) - .filter(line -> !line.equals(FrequencyCounter.NGRAM_SEPARATOR_LINE)) - .map(line -> line.split(FrequencyCounter.COLUMN_SEPARATOR)) - .filter(fields -> !fields[0].contains(FrequencyCounter.BIGRAM_SEPARATOR)) + .filter(line -> !line.equals(FrequencyWriter.NGRAM_SEPARATOR_LINE)) + .map(line -> line.split(FrequencyWriter.COLUMN_SEPARATOR)) + .filter(fields -> !fields[0].contains(FrequencyWriter.BIGRAM_SEPARATOR)) .map(fields -> fields[1]) .mapToDouble(Double::parseDouble) .toArray(); @@ -138,9 +138,9 @@ public void testCountSortedByValue() /* check bigram sorting */ double[] bigrams = Files.lines(targetFile.toPath()) - .filter(line -> !line.equals(FrequencyCounter.NGRAM_SEPARATOR_LINE)) - .map(line -> line.split(FrequencyCounter.COLUMN_SEPARATOR)) - .filter(fields -> fields[0].contains(FrequencyCounter.BIGRAM_SEPARATOR)) + .filter(line -> !line.equals(FrequencyWriter.NGRAM_SEPARATOR_LINE)) + .map(line -> line.split(FrequencyWriter.COLUMN_SEPARATOR)) + .filter(fields -> fields[0].contains(FrequencyWriter.BIGRAM_SEPARATOR)) .map(fields -> fields[1]) .mapToDouble(Double::parseDouble) .toArray(); @@ -159,10 +159,10 @@ public void testSortBoth() CollectionReaderDescription reader = createReaderDescription(StringReader.class, StringReader.PARAM_DOCUMENT_TEXT, sentence, StringReader.PARAM_LANGUAGE, language); - AnalysisEngineDescription writer = createEngineDescription(FrequencyCounter.class, - FrequencyCounter.PARAM_SORT_BY_COUNT, true, - FrequencyCounter.PARAM_SORT_BY_ALPHABET, true); + AnalysisEngineDescription writer = createEngineDescription(FrequencyWriter.class, + FrequencyWriter.PARAM_SORT_BY_COUNT, true, + FrequencyWriter.PARAM_SORT_BY_ALPHABET, true); SimplePipeline.runPipeline(reader, writer); } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/PhraseAnnotatorTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/phrasedetection/PhraseAnnotatorTest.java similarity index 92% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/PhraseAnnotatorTest.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/phrasedetection/PhraseAnnotatorTest.java index af1f8b2198..4e68a73135 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/phrasedetection/PhraseAnnotatorTest.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/phrasedetection/PhraseAnnotatorTest.java @@ -15,29 +15,30 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection; +package org.dkpro.core.frequency.phrasedetection; + +import static junit.framework.TestCase.assertTrue; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.StringReader; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Rule; import org.junit.Test; -import java.io.File; -import java.io.IOException; -import java.util.Collection; - -import static junit.framework.TestCase.assertTrue; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase; public class PhraseAnnotatorTest { @@ -74,4 +75,4 @@ public void test() } // TODO: implement test for other covering type parameter values -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProviderTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProviderTest.java similarity index 85% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProviderTest.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProviderTest.java index 820b21a727..59e693ae15 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProviderTest.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/BerkeleyLmFrequencyCountProviderTest.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.resources; +package org.dkpro.core.frequency.resources; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; @@ -27,10 +27,9 @@ import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.frequency.FrequencyCountResourceBase; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.FrequencyCountResourceBase; - public class BerkeleyLmFrequencyCountProviderTest { public static class Annotator extends JCasAnnotator_ImplBase { @@ -49,9 +48,9 @@ public void process(JCas aJCas) @Test public void configureAggregatedExample() throws Exception { AnalysisEngineDescription desc = createEngineDescription(Annotator.class, - Annotator.MODEL_KEY, createExternalResourceDescription( - BerkeleyLmFrequencyCountProvider.class, - BerkeleyLmFrequencyCountProvider.PARAM_PROVIDER_LANGUAGE, "en", + Annotator.MODEL_KEY, createResourceDescription( + BerkeleyLmFrequencyCountProvider.class, + BerkeleyLmFrequencyCountProvider.PARAM_PROVIDER_LANGUAGE, "en", BerkeleyLmFrequencyCountProvider.PARAM_BINARY, "src/test/resources/test.ser")); // Check the external resource was injected diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1tInMemoryResourceTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/Web1tInMemoryResourceTest.java similarity index 88% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1tInMemoryResourceTest.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/Web1tInMemoryResourceTest.java index 8f249d9f0c..177583d0fd 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1tInMemoryResourceTest.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/Web1tInMemoryResourceTest.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.resources; +package org.dkpro.core.frequency.resources; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; @@ -27,10 +27,9 @@ import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; - public class Web1tInMemoryResourceTest { public static class Annotator extends JCasAnnotator_ImplBase { @@ -49,8 +48,8 @@ public void process(JCas aJCas) @Test public void configureAggregatedExample() throws Exception { AnalysisEngineDescription desc = createEngineDescription(Annotator.class, - Annotator.MODEL_KEY, createExternalResourceDescription( - Web1TInMemoryFrequencyCountResource.class, + Annotator.MODEL_KEY, createResourceDescription( + Web1TInMemoryFrequencyCountResource.class, Web1TInMemoryFrequencyCountResource.PARAM_MODEL_LOCATION, "src/test/resources/web1t/", Web1TInMemoryFrequencyCountResource.PARAM_LANGUAGE, "en", Web1TInMemoryFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "2")); @@ -59,4 +58,4 @@ Annotator.MODEL_KEY, createExternalResourceDescription( AnalysisEngine ae = createEngine(desc); ae.process(ae.newJCas()); } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1tResourceTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/Web1tResourceTest.java similarity index 88% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1tResourceTest.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/Web1tResourceTest.java index e63c8fc7d3..82ecf981a3 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/resources/Web1tResourceTest.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/resources/Web1tResourceTest.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.resources; +package org.dkpro.core.frequency.resources; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; @@ -27,10 +27,9 @@ import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; - public class Web1tResourceTest { public static class Annotator extends JCasAnnotator_ImplBase { @@ -49,8 +48,8 @@ public void process(JCas aJCas) @Test public void configureAggregatedExample() throws Exception { AnalysisEngineDescription desc = createEngineDescription(Annotator.class, - Annotator.MODEL_KEY, createExternalResourceDescription( - Web1TFrequencyCountResource.class, + Annotator.MODEL_KEY, createResourceDescription( + Web1TFrequencyCountResource.class, Web1TFrequencyCountResource.PARAM_LANGUAGE, "en", Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", Web1TFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "2")); @@ -59,4 +58,4 @@ Annotator.MODEL_KEY, createExternalResourceDescription( AnalysisEngine ae = createEngine(desc); ae.process(ae.newJCas()); } -} \ No newline at end of file +} diff --git a/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/tfidf/TfIdfAnnotatorTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/tfidf/TfIdfAnnotatorTest.java new file mode 100644 index 0000000000..4db8078cc9 --- /dev/null +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/tfidf/TfIdfAnnotatorTest.java @@ -0,0 +1,206 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.frequency.tfidf; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.api.io.ResourceCollectionReaderBase.INCLUDE_PREFIX; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.frequency.tfidf.TfIdfAnnotator.WeightingModeIdf; +import org.dkpro.core.frequency.tfidf.TfIdfAnnotator.WeightingModeTf; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class TfIdfAnnotatorTest +{ + // assertEquals on doubles needs an epsilon + protected static final double EPSILON = 0.000001; + + private final static String CONSUMER_TEST_DATA_PATH = "src/test/resources/consumer/"; + + @Rule + public TemporaryFolder folder = new TemporaryFolder(); + + protected File model; + + @Before + public void buildModel() + throws Exception + { + model = folder.newFile(); + + // write the model + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH, + TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); + + AnalysisEngineDescription aggregate = createEngineDescription( + createEngineDescription(BreakIteratorSegmenter.class), + createEngineDescription(TfIdfWriter.class, + TfIdfWriter.PARAM_FEATURE_PATH, Token.class, + TfIdfWriter.PARAM_TARGET_LOCATION, model)); + + SimplePipeline.runPipeline(reader, aggregate); + } + + @Test + public void tfidfTest_normal_constantOne() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH, + TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); + + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription tfidfAnnotator = createEngineDescription(TfIdfAnnotator.class, + TfIdfAnnotator.PARAM_FEATURE_PATH, Token.class, + TfIdfAnnotator.PARAM_TFDF_PATH, model, + TfIdfAnnotator.PARAM_TF_MODE, WeightingModeTf.NORMAL, + TfIdfAnnotator.PARAM_IDF_MODE, WeightingModeIdf.CONSTANT_ONE); + + Map expectedDoc1 = new HashMap(); + expectedDoc1.put("example", 1.0); + expectedDoc1.put("sentence", 1.0); + expectedDoc1.put("funny", 1.0); + + Map expectedDoc2 = new HashMap(); + expectedDoc2.put("example", 2.0); + expectedDoc2.put("sentence", 1.0); + + for (JCas jcas : new JCasIterable(reader, segmenter, tfidfAnnotator)) { + testIt(jcas, expectedDoc1, expectedDoc2); + } + } + + @Test + public void tfidfTest_binary_binary() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH, + TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); + + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription tfidfAnnotator = createEngineDescription(TfIdfAnnotator.class, + TfIdfAnnotator.PARAM_FEATURE_PATH, Token.class, + TfIdfAnnotator.PARAM_TFDF_PATH, model, + TfIdfAnnotator.PARAM_TF_MODE, WeightingModeTf.BINARY, + TfIdfAnnotator.PARAM_IDF_MODE, WeightingModeIdf.BINARY); + + Map expectedDoc1 = new HashMap(); + expectedDoc1.put("example", 1.0); + expectedDoc1.put("sentence", 1.0); + expectedDoc1.put("funny", 1.0); + + Map expectedDoc2 = new HashMap(); + expectedDoc2.put("example", 1.0); + expectedDoc2.put("sentence", 1.0); + + for (JCas jcas : new JCasIterable(reader, segmenter, tfidfAnnotator)) { + testIt(jcas, expectedDoc1, expectedDoc2); + } + } + + @Test + public void tfidfTest_normal_log() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CONSUMER_TEST_DATA_PATH, + TextReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); + + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription tfidfAnnotator = createEngineDescription(TfIdfAnnotator.class, + TfIdfAnnotator.PARAM_FEATURE_PATH, Token.class, + TfIdfAnnotator.PARAM_TFDF_PATH, model, + TfIdfAnnotator.PARAM_TF_MODE, WeightingModeTf.NORMAL, + TfIdfAnnotator.PARAM_IDF_MODE, WeightingModeIdf.LOG); + + Map expectedDoc1 = new HashMap(); + expectedDoc1.put("example", 0.0); + expectedDoc1.put("sentence", 0.0); + expectedDoc1.put("funny", Math.log(2)); + + Map expectedDoc2 = new HashMap(); + expectedDoc2.put("example", 0.0); + expectedDoc2.put("sentence", 0.0); + + for (JCas jcas : new JCasIterable(reader, segmenter, tfidfAnnotator)) { + testIt(jcas, expectedDoc1, expectedDoc2); + } + } + + private void testIt(JCas jcas, Map expectedDoc1, + Map expectedDoc2) + { + if (DocumentMetaData.get(jcas).getDocumentTitle().equals("test1.txt")) { + int i = 0; + for (Tfidf tfidf : select(jcas, Tfidf.class)) { + assertEquals(tfidf.getTerm(), expectedDoc1.get(tfidf.getTerm()).doubleValue(), + tfidf.getTfidfValue(), EPSILON); + i++; + } + assertEquals(3, i); + } + else if (DocumentMetaData.get(jcas).getDocumentTitle().equals("test2.txt")) { + int i = 0; + for (Tfidf tfidf : select(jcas, Tfidf.class)) { + assertEquals(tfidf.getTerm(), expectedDoc2.get(tfidf.getTerm()).doubleValue(), + tfidf.getTfidfValue(), EPSILON); + i++; + } + assertEquals(3, i); + } + else { + fail("There should be no other documents in that directory."); + } + } + + @Rule + public TestName name = new TestName(); + + @Before + public void printSeparator() + { + System.out.println("\n=== " + name.getMethodName() + " ====================="); + } +} diff --git a/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/tfidf/TfIdfWriterTest.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/tfidf/TfIdfWriterTest.java new file mode 100644 index 0000000000..316c368853 --- /dev/null +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/tfidf/TfIdfWriterTest.java @@ -0,0 +1,89 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.frequency.tfidf; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.api.io.ResourceCollectionReaderBase.INCLUDE_PREFIX; +import static org.dkpro.core.api.io.ResourceCollectionReaderBase.PARAM_PATTERNS; +import static org.dkpro.core.api.io.ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION; +import static org.junit.Assert.assertEquals; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.frequency.tfidf.model.DfModel; +import org.dkpro.core.frequency.tfidf.util.TfidfUtils; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +/** + * Unit test of {@link TfIdfWriter} and {@link TfIdfAnnotator}. + * + * + */ +public class TfIdfWriterTest +{ + @Rule + public TemporaryFolder folder = new TemporaryFolder(); + + @Test + public void rawScoresTest() + throws Exception + { + File target = folder.newFile(name.getMethodName()); + + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + PARAM_SOURCE_LOCATION, "src/test/resources/consumer/", + PARAM_PATTERNS, INCLUDE_PREFIX + "*.txt"); + + AnalysisEngineDescription aggregate = createEngineDescription( + createEngineDescription(BreakIteratorSegmenter.class), + createEngineDescription(TfIdfWriter.class, + TfIdfWriter.PARAM_FEATURE_PATH, Token.class.getName(), + TfIdfWriter.PARAM_TARGET_LOCATION, target)); + + // now create the tf and df files + SimplePipeline.runPipeline(reader, aggregate); + + // check whether they were really created and contain the correct values + DfModel dfModel = TfidfUtils.getDfModel(target.getPath()); + + assertEquals(2, dfModel.getDf("example")); + assertEquals(2, dfModel.getDf("sentence")); + assertEquals(1, dfModel.getDf("funny")); + } + + @Rule + public TestName name = new TestName(); + + @Before + public void printSeparator() + { + System.out.println("\n=== " + name.getMethodName() + " ====================="); + } +} diff --git a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/util/CreateTestIndexesWeb1T.java b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/util/CreateTestIndexesWeb1T.java similarity index 94% rename from dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/util/CreateTestIndexesWeb1T.java rename to dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/util/CreateTestIndexesWeb1T.java index b85441e320..418551fd12 100644 --- a/dkpro-core-frequency-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/frequency/util/CreateTestIndexesWeb1T.java +++ b/dkpro-core-frequency-asl/src/test/java/org/dkpro/core/frequency/util/CreateTestIndexesWeb1T.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.frequency.util; +package org.dkpro.core.frequency.util; import com.googlecode.jweb1t.JWeb1TIndexer; diff --git a/dkpro-core-fs-hdfs-asl/pom.xml b/dkpro-core-fs-hdfs-asl/pom.xml index ff5ed654b8..2c40fd29c9 100644 --- a/dkpro-core-fs-hdfs-asl/pom.xml +++ b/dkpro-core-fs-hdfs-asl/pom.xml @@ -18,16 +18,17 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.fs.hdfs-asl + dkpro-core-fs-hdfs-asl jar DKPro Core ASL - FS - HDFS (v ${hadoop.version}) + https://dkpro.github.io/dkpro-core/ - 2.7.1 + 3.3.0 @@ -36,18 +37,24 @@ org.apache.hadoop - hadoop-hdfs + hadoop-common ${hadoop.version} + + + org.slf4j + slf4j-log4j12 + + org.apache.hadoop - hadoop-common + hadoop-hdfs ${hadoop.version} org.springframework.data spring-data-hadoop-core - 2.3.0.RELEASE + 2.5.0.RELEASE org.apache.uima @@ -63,13 +70,19 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl + org.dkpro.core + dkpro-core-io-text-asl + test + + + org.apache.hadoop + hadoop-hdfs-client + ${hadoop.version} test @@ -85,10 +98,17 @@ ${hadoop.version} tests test + + + org.slf4j + slf4j-log4j12 + + log4j log4j + 1.2.17 test @@ -100,13 +120,14 @@ maven-dependency-plugin - + log4j:log4j org.apache.hadoop:hadoop-common + org.apache.hadoop:hadoop-hdfs diff --git a/dkpro-core-fs-hdfs-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/fs/hdfs/HdfsResourceLoaderLocator.java b/dkpro-core-fs-hdfs-asl/src/main/java/org/dkpro/core/fs/hdfs/HdfsResourceLoaderLocator.java similarity index 93% rename from dkpro-core-fs-hdfs-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/fs/hdfs/HdfsResourceLoaderLocator.java rename to dkpro-core-fs-hdfs-asl/src/main/java/org/dkpro/core/fs/hdfs/HdfsResourceLoaderLocator.java index eaee2bd8b4..481c9b651c 100644 --- a/dkpro-core-fs-hdfs-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/fs/hdfs/HdfsResourceLoaderLocator.java +++ b/dkpro-core-fs-hdfs-asl/src/main/java/org/dkpro/core/fs/hdfs/HdfsResourceLoaderLocator.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.fs.hdfs; +package org.dkpro.core.fs.hdfs; import java.net.URI; import java.net.URISyntaxException; @@ -51,7 +51,8 @@ public boolean initialize(ResourceSpecifier aSpecifier, Map aAdd } else { resolverInstance = new HdfsResourceLoader(new Configuration(), new URI(fileSystem)); - resolverInstance.setResourcePatternResolver(new PathMatchingResourcePatternResolver()); + resolverInstance + .setResourcePatternResolver(new PathMatchingResourcePatternResolver()); } } catch (URISyntaxException e) { diff --git a/dkpro-core-fs-hdfs-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/fs/hdfs/HdfsResourceLoaderLocatorTest.java b/dkpro-core-fs-hdfs-asl/src/test/java/org/dkpro/core/fs/hdfs/HdfsResourceLoaderLocatorTest.java similarity index 92% rename from dkpro-core-fs-hdfs-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/fs/hdfs/HdfsResourceLoaderLocatorTest.java rename to dkpro-core-fs-hdfs-asl/src/test/java/org/dkpro/core/fs/hdfs/HdfsResourceLoaderLocatorTest.java index 7c679ee6ef..675099aba6 100644 --- a/dkpro-core-fs-hdfs-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/fs/hdfs/HdfsResourceLoaderLocatorTest.java +++ b/dkpro-core-fs-hdfs-asl/src/test/java/org/dkpro/core/fs/hdfs/HdfsResourceLoaderLocatorTest.java @@ -15,10 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.fs.hdfs; +package org.dkpro.core.fs.hdfs; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; import static org.junit.Assert.assertEquals; import java.io.File; @@ -33,6 +33,9 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ExternalResourceDescription; +import org.dkpro.core.fs.hdfs.HdfsResourceLoaderLocator; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.testing.DkproTestContext; import org.junit.After; import org.junit.Assume; import org.junit.Before; @@ -40,15 +43,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class HdfsResourceLoaderLocatorTest { // Need to use this for a proper temporary folder because otherwise we get an error if // the tests runs within some folder that has percentage signs in its path... @Rule - public TemporaryFolder folder= new TemporaryFolder(); + public TemporaryFolder folder = new TemporaryFolder(); private MiniDFSCluster hdfsCluster; @@ -98,7 +98,7 @@ public void testExternalLoaderLocator() } // Set up HDFS resource locator - ExternalResourceDescription locator = createExternalResourceDescription( + ExternalResourceDescription locator = createResourceDescription( HdfsResourceLoaderLocator.class, HdfsResourceLoaderLocator.PARAM_FILESYSTEM, hdfsURI); diff --git a/dkpro-core-fs-hdfs-asl/src/test/resources/log4j.properties b/dkpro-core-fs-hdfs-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-fs-hdfs-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-fs-hdfs-asl/src/test/resources/log4j2.xml b/dkpro-core-fs-hdfs-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-fs-hdfs-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-gate-asl/pom.xml b/dkpro-core-gate-asl/pom.xml index 37bc721f28..672ed4e459 100644 --- a/dkpro-core-gate-asl/pom.xml +++ b/dkpro-core-gate-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.gate-asl + dkpro-core-gate-asl jar DKPro Core ASL - GATE (v ${gate.version}) + https://dkpro.github.io/dkpro-core/ 8.2 @@ -39,16 +40,16 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl uk.ac.gate @@ -66,8 +67,22 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + + xml-apis + xml-apis + runtime junit @@ -75,8 +90,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test @@ -103,9 +118,9 @@ 20160531.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT pom import @@ -119,6 +134,7 @@ maven-dependency-plugin + xml-apis:xml-apis de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.gate-model-lemmatizer-en-default de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.gate-model-tagger-en-annie diff --git a/dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/GateLemmatizer.java b/dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/GateLemmatizer.java deleted file mode 100644 index 6f93e5a470..0000000000 --- a/dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/GateLemmatizer.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.gate; - -import java.io.IOException; -import java.net.URL; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PRON; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_VERB; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import gate.creole.ResourceInstantiationException; -import gate.creole.morph.Interpret; - -/** - * Wrapper for the GATE rule based lemmatizer. - * - * Based on code by Asher Stern from the BIUTEE textual entailment tool. - * - * @since 1.4.0 - */ -@ResourceMetaData(name="GATE Lemmatizer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) -public class GateLemmatizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - // constants - public static final String GATE_LEMMATIZER_VERB_CATEGORY_STRING = "VB"; - public static final String GATE_LEMMATIZER_NOUN_CATEGORY_STRING = "NN"; - public static final String GATE_LEMMATIZER_ALL_CATEGORIES_STRING = "*"; - - private CasConfigurableProviderBase modelProvider; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - modelProvider = new CasConfigurableProviderBase() { - { - setContextObject(GateLemmatizer.this); - - setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/gate/lib/" + - "lemmatizer-${language}-${variant}.properties"); - setDefault(VARIANT, "default"); - - setOverride(LOCATION, modelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected Interpret produceResource(URL aUrl) throws IOException - { - try { - Interpret gateLemmatizerInterpretObject = new Interpret(); - gateLemmatizerInterpretObject.init(aUrl); - return gateLemmatizerInterpretObject; - } - catch (ResourceInstantiationException e) { - throw new IOException(e); - } - } - }; - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - modelProvider.configure(jcas.getCas()); - - String category = null; - for (Token token : JCasUtil.select(jcas, Token.class)) { - POS pos = token.getPos(); - - if (pos != null) { - if (pos.getClass().equals(POS_VERB.class)) { - category = GATE_LEMMATIZER_VERB_CATEGORY_STRING; - } - else if (pos.getClass().equals(POS_NOUN.class)) { - category = GATE_LEMMATIZER_NOUN_CATEGORY_STRING; - } - else if (pos.getClass().equals(POS_PRON.class)) { - category = GATE_LEMMATIZER_NOUN_CATEGORY_STRING; - } - else { - category = GATE_LEMMATIZER_ALL_CATEGORIES_STRING; - } - } - else { - category = GATE_LEMMATIZER_ALL_CATEGORIES_STRING; - } - - String tokenString = token.getText(); - String lemmaString = modelProvider.getResource().runMorpher(tokenString, category); - if (lemmaString == null) { - lemmaString = tokenString; - } - - Lemma lemma = new Lemma(jcas, token.getBegin(), token.getEnd()); - lemma.setValue(lemmaString); - lemma.addToIndexes(); - - // remove (a potentially existing) old lemma before adding a new one - if (token.getLemma() != null) { - Lemma oldLemma = token.getLemma(); - oldLemma.removeFromIndexes(); - } - - token.setLemma(lemma); - } - } -} diff --git a/dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/package-info.java b/dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/package-info.java deleted file mode 100644 index 7d8f5bd2f1..0000000000 --- a/dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Integration of NLP components from GATE. - * - * @since 1.4.0 - */ -package de.tudarmstadt.ukp.dkpro.core.gate; diff --git a/dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/GateLemmatizer.java b/dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/GateLemmatizer.java new file mode 100644 index 0000000000..c2301422e4 --- /dev/null +++ b/dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/GateLemmatizer.java @@ -0,0 +1,189 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.gate; + +import java.io.IOException; +import java.net.URL; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PRON; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_VERB; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import gate.creole.ResourceInstantiationException; +import gate.creole.morph.Interpret; + +/** + * Wrapper for the GATE rule based lemmatizer. + * + * Based on code by Asher Stern from the BIUTEE textual entailment tool. + * + * @since 1.4.0 + */ +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "GATE Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) +public class GateLemmatizer + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + // constants + public static final String GATE_LEMMATIZER_VERB_CATEGORY_STRING = "VB"; + public static final String GATE_LEMMATIZER_NOUN_CATEGORY_STRING = "NN"; + public static final String GATE_LEMMATIZER_ALL_CATEGORIES_STRING = "*"; + + private CasConfigurableProviderBase modelProvider; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + modelProvider = new CasConfigurableProviderBase() { + { + setContextObject(GateLemmatizer.this); + + setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/gate"); + setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/gate/lib/" + + "lemmatizer-${language}-${variant}.properties"); + setDefault(VARIANT, "default"); + + setOverride(LOCATION, modelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected Interpret produceResource(URL aUrl) throws IOException + { + try { + Interpret gateLemmatizerInterpretObject = new Interpret(); + gateLemmatizerInterpretObject.init(aUrl); + return gateLemmatizerInterpretObject; + } + catch (ResourceInstantiationException e) { + throw new IOException(e); + } + } + }; + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + modelProvider.configure(jcas.getCas()); + + String category = null; + for (Token token : JCasUtil.select(jcas, Token.class)) { + POS pos = token.getPos(); + + if (pos != null) { + if (pos.getClass().equals(POS_VERB.class)) { + category = GATE_LEMMATIZER_VERB_CATEGORY_STRING; + } + else if (pos.getClass().equals(POS_NOUN.class)) { + category = GATE_LEMMATIZER_NOUN_CATEGORY_STRING; + } + else if (pos.getClass().equals(POS_PRON.class)) { + category = GATE_LEMMATIZER_NOUN_CATEGORY_STRING; + } + else { + category = GATE_LEMMATIZER_ALL_CATEGORIES_STRING; + } + } + else { + category = GATE_LEMMATIZER_ALL_CATEGORIES_STRING; + } + + String tokenString = token.getText(); + String lemmaString = modelProvider.getResource().runMorpher(tokenString, category); + if (lemmaString == null) { + lemmaString = tokenString; + } + + Lemma lemma = new Lemma(jcas, token.getBegin(), token.getEnd()); + lemma.setValue(lemmaString); + lemma.addToIndexes(); + + // remove (a potentially existing) old lemma before adding a new one + if (token.getLemma() != null) { + Lemma oldLemma = token.getLemma(); + oldLemma.removeFromIndexes(); + } + + token.setLemma(lemma); + } + } +} diff --git a/dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/HepplePosTagger.java b/dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/HepplePosTagger.java similarity index 82% rename from dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/HepplePosTagger.java rename to dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/HepplePosTagger.java index f444359bc0..ba2ae85aa5 100644 --- a/dkpro-core-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gate/HepplePosTagger.java +++ b/dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/HepplePosTagger.java @@ -15,14 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.gate; +package org.dkpro.core.gate; import static java.util.Collections.singletonList; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.toText; -import hepple.postag.InvalidRuleException; -import hepple.postag.POSTagger; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.IOException; import java.net.URL; @@ -38,20 +37,26 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import hepple.postag.InvalidRuleException; +import hepple.postag.POSTagger; /** * GATE Hepple part-of-speech tagger. */ -@ResourceMetaData(name="GATE Hepple POS-Tagger") +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@ResourceMetaData(name = "GATE Hepple POS-Tagger") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -89,31 +94,28 @@ public class HepplePosTagger @ConfigurationParameter(name = PARAM_RULESET_LOCATION, mandatory = false) protected String rulesetLocation; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - /** * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected boolean printTagSet; private CasConfigurableProviderBase ruleProvider; @@ -130,6 +132,7 @@ public void initialize(UimaContext aContext) { setContextObject(HepplePosTagger.this); + setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/gate"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/gate/lib/" + "tagger/${language}/${variant}/ruleset"); setDefault(VARIANT, "annie"); @@ -164,7 +167,7 @@ protected URL produceResource(URL aUrl) throws IOException } }; - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, + mappingProvider = createPosMappingProvider(this, posMappingLocation, language, ruleProvider); } @@ -201,7 +204,8 @@ public void process(JCas aJCas) for (Token t : tokens) { Type posTag = mappingProvider.getTagType(tagged.get(i)[1]); POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); - posAnno.setPosValue(internTags ? tagged.get(i)[1].intern() : tagged.get(i)[1]); + String tag = tagged.get(i)[1]; + posAnno.setPosValue(tag != null ? tag.intern() : null); POSUtils.assignCoarseValue(posAnno); posAnno.addToIndexes(); t.setPos(posAnno); diff --git a/dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/package-info.java b/dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/package-info.java new file mode 100644 index 0000000000..6f82bffd94 --- /dev/null +++ b/dkpro-core-gate-asl/src/main/java/org/dkpro/core/gate/package-info.java @@ -0,0 +1,23 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Integration of NLP components from GATE. + * + * @since 1.4.0 + */ +package org.dkpro.core.gate; diff --git a/dkpro-core-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gate/GateLemmatizerTest.java b/dkpro-core-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gate/GateLemmatizerTest.java deleted file mode 100644 index 0174611ebb..0000000000 --- a/dkpro-core-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gate/GateLemmatizerTest.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.gate; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class GateLemmatizerTest -{ - @Test - public void testEnglish() - throws Exception - { - JCas jcas = runTest("en", "We need a very complicated example sentence , which " + - "contains as many constituents and dependencies as possible ."); - - String[] lemmas = { "We", "need", "a", "very", "complicate", "example", "sentence", ",", - "which", "contain", "as", "many", "constituent", "and", "dependency", "as", - "possible", "." }; - - AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); - } - - @Test - public void testEnglish2() - throws Exception - { - JCas jcas = runTest("en", "Two cars went around corners ."); - - String[] lemmas = { "Two", "car", "go", "around", "corner", "." }; - - AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); - } - - private JCas runTest(String aLanguage, String aText) - throws Exception - { - AnalysisEngineDescription tagger = createEngineDescription(HepplePosTagger.class); - AnalysisEngineDescription lemma = createEngineDescription(GateLemmatizer.class); - - AnalysisEngineDescription aggregate = createEngineDescription(tagger, lemma); - - JCas jcas = TestRunner.runTest(aggregate, aLanguage, aText); - - return jcas; - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gate/HepplePosTaggerTest.java b/dkpro-core-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gate/HepplePosTaggerTest.java deleted file mode 100644 index 125cbce1c6..0000000000 --- a/dkpro-core-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gate/HepplePosTaggerTest.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.gate; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class HepplePosTaggerTest -{ - @Test - public void testEnglish() - throws Exception - { - runTest("en", null, "This is a test . \n", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "A neural net . \n", - new String[] { "DT", "NN", "JJ", "." }, - new String[] { "POS_DET", "POS_NOUN", "POS_ADJ", "POS_PUNCT" }); - - runTest("en", null, "John is purchasing oranges . \n", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } - - private void runTest(String language, String variant, String testDocument, String[] tags, - String[] tagClasses) - throws Exception - { - AnalysisEngine engine = createEngine(HepplePosTagger.class, - HepplePosTagger.PARAM_VARIANT, variant, - HepplePosTagger.PARAM_PRINT_TAGSET, true); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-gate-asl/src/test/java/org/dkpro/core/gate/GateLemmatizerTest.java b/dkpro-core-gate-asl/src/test/java/org/dkpro/core/gate/GateLemmatizerTest.java new file mode 100644 index 0000000000..280de6459f --- /dev/null +++ b/dkpro-core-gate-asl/src/test/java/org/dkpro/core/gate/GateLemmatizerTest.java @@ -0,0 +1,75 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.gate; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; + +public class GateLemmatizerTest +{ + @Test + public void testEnglish() + throws Exception + { + JCas jcas = runTest("en", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] lemmas = { "We", "need", "a", "very", "complicate", "example", "sentence", ",", + "which", "contain", "as", "many", "constituent", "and", "dependency", "as", + "possible", "." }; + + AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); + } + + @Test + public void testEnglish2() + throws Exception + { + JCas jcas = runTest("en", "Two cars went around corners ."); + + String[] lemmas = { "Two", "car", "go", "around", "corner", "." }; + + AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); + } + + private JCas runTest(String aLanguage, String aText) + throws Exception + { + AnalysisEngineDescription tagger = createEngineDescription(HepplePosTagger.class); + AnalysisEngineDescription lemma = createEngineDescription(GateLemmatizer.class); + + AnalysisEngineDescription aggregate = createEngineDescription(tagger, lemma); + + JCas jcas = TestRunner.runTest(aggregate, aLanguage, aText); + + return jcas; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-gate-asl/src/test/java/org/dkpro/core/gate/HepplePosTaggerTest.java b/dkpro-core-gate-asl/src/test/java/org/dkpro/core/gate/HepplePosTaggerTest.java new file mode 100644 index 0000000000..b030e03893 --- /dev/null +++ b/dkpro-core-gate-asl/src/test/java/org/dkpro/core/gate/HepplePosTaggerTest.java @@ -0,0 +1,67 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.gate; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; + +public class HepplePosTaggerTest +{ + @Test + public void testEnglish() + throws Exception + { + runTest("en", null, "This is a test . \n", + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", null, "A neural net . \n", + new String[] { "DT", "NN", "JJ", "." }, + new String[] { "POS_DET", "POS_NOUN", "POS_ADJ", "POS_PUNCT" }); + + runTest("en", null, "John is purchasing oranges . \n", + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + } + + private void runTest(String language, String variant, String testDocument, String[] tags, + String[] tagClasses) + throws Exception + { + AnalysisEngine engine = createEngine(HepplePosTagger.class, + HepplePosTagger.PARAM_VARIANT, variant, + HepplePosTagger.PARAM_PRINT_TAGSET, true); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-gate-asl/src/test/resources/log4j.properties b/dkpro-core-gate-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-gate-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-gate-asl/src/test/resources/log4j2.xml b/dkpro-core-gate-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-gate-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-gosen-asl/pom.xml b/dkpro-core-gosen-asl/pom.xml index 080d8fd986..cba8536857 100644 --- a/dkpro-core-gosen-asl/pom.xml +++ b/dkpro-core-gosen-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.gosen-asl + dkpro-core-gosen-asl jar DKPro Core ASL - GoSen + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -39,14 +40,28 @@ com.google.code lucene-gosen-ipadic 1.2.1 + + + org.apache.solr + solr-core + + + org.apache.solr + solr-solrj + + - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -54,8 +69,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-gosen-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gosen/GosenSegmenter.java b/dkpro-core-gosen-asl/src/main/java/org/dkpro/core/gosen/GosenSegmenter.java similarity index 89% rename from dkpro-core-gosen-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gosen/GosenSegmenter.java rename to dkpro-core-gosen-asl/src/main/java/org/dkpro/core/gosen/GosenSegmenter.java index 4a57f5b70b..164bf9e005 100644 --- a/dkpro-core-gosen-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/gosen/GosenSegmenter.java +++ b/dkpro-core-gosen-asl/src/main/java/org/dkpro/core/gosen/GosenSegmenter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.gosen; +package org.dkpro.core.gosen; import java.io.IOException; import java.util.ArrayList; @@ -26,8 +26,10 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.Messages; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; +import org.dkpro.core.api.parameter.Messages; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import eu.openminted.share.annotations.api.DocumentationResource; import net.java.sen.SenFactory; import net.java.sen.StringTagger; import net.java.sen.dictionary.Token; @@ -35,7 +37,8 @@ /** * Segmenter for Japanese text based on GoSen. */ -@ResourceMetaData(name="Gosen Segmenter") +@ResourceMetaData(name = "Gosen Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @LanguageCapability("ja") @TypeCapability( outputs = { diff --git a/dkpro-core-gosen-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gosen/GosenSegmenterTest.java b/dkpro-core-gosen-asl/src/test/java/org/dkpro/core/gosen/GosenSegmenterTest.java similarity index 95% rename from dkpro-core-gosen-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gosen/GosenSegmenterTest.java rename to dkpro-core-gosen-asl/src/test/java/org/dkpro/core/gosen/GosenSegmenterTest.java index e6563f98f8..bb601e6900 100644 --- a/dkpro-core-gosen-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/gosen/GosenSegmenterTest.java +++ b/dkpro-core-gosen-asl/src/test/java/org/dkpro/core/gosen/GosenSegmenterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.gosen; +package org.dkpro.core.gosen; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; @@ -23,13 +23,14 @@ import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.gosen.GosenSegmenter; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class GosenSegmenterTest { diff --git a/dkpro-core-gosen-asl/src/test/resources/log4j.properties b/dkpro-core-gosen-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9f0bdd6149..0000000000 --- a/dkpro-core-gosen-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,12 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO - -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasWriter = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase = WARN \ No newline at end of file diff --git a/dkpro-core-gosen-asl/src/test/resources/log4j2.xml b/dkpro-core-gosen-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-gosen-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-gpl/.license-header.txt b/dkpro-core-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-gpl/.license-header.txt +++ b/dkpro-core-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-gpl/LICENSE.txt b/dkpro-core-gpl/LICENSE.txt index d398380c8b..4b4ae023a5 100644 --- a/dkpro-core-gpl/LICENSE.txt +++ b/dkpro-core-gpl/LICENSE.txt @@ -645,7 +645,7 @@ the "copyright" line and a pointer to where the full notice is found. GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. Also add information on how to contact you by electronic and paper mail. diff --git a/dkpro-core-gpl/pom.xml b/dkpro-core-gpl/pom.xml index 231cd62f42..866259fa09 100644 --- a/dkpro-core-gpl/pom.xml +++ b/dkpro-core-gpl/pom.xml @@ -1,6 +1,6 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - .. - - de.tudarmstadt.ukp.dkpro.core-gpl - pom - DKPro Core GPL - - - GNU General Public License Version 3.0 - http://www.gnu.org/licenses/gpl-3.0-standalone.html - repo - - - - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT - pom - import - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.arktools-gpl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.berkeleyparser-gpl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.corenlp-gpl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.lingpipe-gpl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.tgrep-gpl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.matetools-gpl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.sfst-gpl - 1.10.0-SNAPSHOT - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl - 1.10.0-SNAPSHOT - - - org.dkpro.core - dkpro-core-io-cermine-gpl - 1.10.0-SNAPSHOT - - - - - ../dkpro-core-arktools-gpl - ../dkpro-core-berkeleyparser-gpl - ../dkpro-core-corenlp-gpl - ../dkpro-core-lingpipe-gpl - ../dkpro-core-matetools-gpl - ../dkpro-core-stanfordnlp-gpl - ../dkpro-core-sfst-gpl - ../dkpro-core-io-tgrep-gpl - - - - - true - com.mycila - license-maven-plugin - 3.0 - -
${basedir}/.license-header.txt
- true - true - - ${project.inceptionYear} - ${currentYear} - - - release.properties - CHANGES - CHANGES.txt - NOTICE.txt - README.txt - LICENSE.txt - de.tudarmstadt.ukp.dkpro.core.examples-gpl/**/* - src/main/resources/**/* - src/test/resources/**/* - .license-header.txt - src/main/java/edu/stanford/nlp/parser/lexparser/**/* - src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/CasCopier.java - -
- - - check-headers - verify - - check - - - -
-
-
- - - - deps-not-on-maven-central - - - icm - ICM repository - http://maven.icm.edu.pl/artifactory/repo - - true - warn - - - - - - ../dkpro-core-io-cermine-gpl - - - - - - org.dkpro.core - dkpro-core-io-cermine-gpl - 1.10.0-SNAPSHOT - - - - - + 4.0.0 + + org.dkpro.core + dkpro-core + 2.3.0-SNAPSHOT + .. + + dkpro-core-gpl + pom + DKPro Core GPL + https://dkpro.github.io/dkpro-core/ + + + GNU General Public License Version 3.0 + http://www.gnu.org/licenses/gpl-3.0-standalone.html + repo + + + + + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + pom + import + + + org.dkpro.core + dkpro-core-arktools-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-berkeleyparser-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-corenlp-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-lingpipe-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-io-tgrep-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-matetools-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-maui-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-sfst-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-stanfordnlp-gpl + 2.3.0-SNAPSHOT + + + org.dkpro.core + dkpro-core-io-cermine-gpl + 2.3.0-SNAPSHOT + + + + + ../dkpro-core-arktools-gpl + ../dkpro-core-berkeleyparser-gpl + ../dkpro-core-corenlp-gpl + ../dkpro-core-lingpipe-gpl + ../dkpro-core-matetools-gpl + ../dkpro-core-maui-gpl + ../dkpro-core-stanfordnlp-gpl + ../dkpro-core-sfst-gpl + ../dkpro-core-io-tgrep-gpl + + + + + true + com.mycila + license-maven-plugin + 3.0 + +
${basedir}/.license-header.txt
+ true + true + + ${project.inceptionYear} + ${currentYear} + + + .checkstyle + release.properties + CHANGES + CHANGES.txt + NOTICE.txt + README.txt + LICENSE.txt + de.tudarmstadt.ukp.dkpro.core.examples-gpl/**/* + src/main/resources/**/* + src/test/resources/**/* + .license-header.txt + src/main/java/edu/stanford/nlp/parser/lexparser/**/* + + + SLASHSTAR_STYLE + +
+ + + check-headers + verify + + check + + + +
+
+
+ + + + deps-not-on-maven-central + + + icm + ICM repository + http://maven.icm.edu.pl/artifactory/repo + + true + warn + + + + + + ../dkpro-core-io-cermine-gpl + + + + + + org.dkpro.core + dkpro-core-io-cermine-gpl + 2.3.0-SNAPSHOT + + + + +
diff --git a/dkpro-core-gpl/scripts/build.xml b/dkpro-core-gpl/scripts/build.xml index 5eefd99dd0..04ba0c6c65 100644 --- a/dkpro-core-gpl/scripts/build.xml +++ b/dkpro-core-gpl/scripts/build.xml @@ -1,6 +1,6 @@ diff --git a/dkpro-core-hunpos-asl/pom.xml b/dkpro-core-hunpos-asl/pom.xml index 78aaaac1bd..b639f4e801 100644 --- a/dkpro-core-hunpos-asl/pom.xml +++ b/dkpro-core-hunpos-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.hunpos-asl + dkpro-core-hunpos-asl jar DKPro Core ASL - Hunpos + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -36,24 +37,28 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.hunpos-bin - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -66,8 +71,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-hunpos-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTagger.java b/dkpro-core-hunpos-asl/src/main/java/org/dkpro/core/hunpos/HunPosTagger.java similarity index 78% rename from dkpro-core-hunpos-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTagger.java rename to dkpro-core-hunpos-asl/src/main/java/org/dkpro/core/hunpos/HunPosTagger.java index da8dc33038..8fbd8eefaa 100644 --- a/dkpro-core-hunpos-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTagger.java +++ b/dkpro-core-hunpos-asl/src/main/java/org/dkpro/core/hunpos/HunPosTagger.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.hunpos; +package org.dkpro.core.hunpos; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -40,17 +40,20 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.api.resources.RuntimeProvider; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Part-of-Speech annotator using HunPos. Requires {@link Sentence}s to be annotated @@ -65,7 +68,9 @@ * (bibtex) * */ -@ResourceMetaData(name="HunPos POS-Tagger") +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@ResourceMetaData(name = "HunPos POS-Tagger") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -89,6 +94,20 @@ public class HunPosTagger @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + *

The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.

+ */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the model automatically. */ @@ -96,28 +115,25 @@ public class HunPosTagger @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating the * mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid spaming - * the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - /** * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") @@ -138,11 +154,12 @@ public void initialize(UimaContext aContext) { setContextObject(HunPosTagger.this); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "${groupId}.hunpos-model-tagger-${language}-${variant}"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/hunpos/lib/" + "tagger-${language}-${variant}.model"); setDefault(VARIANT, "default"); - setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/hunpos/lib/tagger-default-variants.map"); + setDefaultVariantsLocation("${package}/lib/tagger-default-variants.map"); setOverride(LOCATION, modelLocation); setOverride(LANGUAGE, language); @@ -161,8 +178,8 @@ protected File produceResource(URL aUrl) runtimeProvider = new RuntimeProvider( "classpath:/de/tudarmstadt/ukp/dkpro/core/hunpos/bin/"); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); + posMappingProvider = MappingProviderFactory.createPosMappingProvider(this, + posMappingLocation, language, modelProvider); } @Override @@ -233,7 +250,8 @@ public void process(JCas aJCas) for (Token t : tokens) { Type posTag = posMappingProvider.getTagType(tags[i]); POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); - posAnno.setPosValue(internTags ? tags[i].intern() : tags[i]); + String tag = tags[i]; + posAnno.setPosValue(tag != null ? tag.intern() : null); POSUtils.assignCoarseValue(posAnno); posAnno.addToIndexes(); t.setPos(posAnno); diff --git a/dkpro-core-hunpos-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/hunpos/lib/tagger-default-variants.map b/dkpro-core-hunpos-asl/src/main/resources/org/dkpro/core/hunpos/lib/tagger-default-variants.map similarity index 100% rename from dkpro-core-hunpos-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/hunpos/lib/tagger-default-variants.map rename to dkpro-core-hunpos-asl/src/main/resources/org/dkpro/core/hunpos/lib/tagger-default-variants.map diff --git a/dkpro-core-hunpos-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTaggerTest.java b/dkpro-core-hunpos-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTaggerTest.java deleted file mode 100644 index aa25517a2c..0000000000 --- a/dkpro-core-hunpos-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/hunpos/HunPosTaggerTest.java +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.hunpos; - -import static org.apache.commons.lang3.StringUtils.repeat; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; - -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.testing.util.HideOutput; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class HunPosTaggerTest -{ - @Before - public void prepare() - { - Assume.assumeFalse("HunPos currently hangs indefinitely on Windows: Issue #1099", - System.getProperty("os.name").toLowerCase(Locale.US).contains("win")); - } - -// @Test -// public void testCatalan() -// throws Exception -// { -// runTest("ca", null, "Aquesta és una prova .", -// new String[] { "Pd-nsn--n-a", "Vcr3s", "N-msan", "Z" }, -// new String[] { "POS", "POS", "POS", "POS" }); -// } - - @Test - public void testCroatian() - throws Exception - { - runTest("hr", null, "Ovo je test .", - new String[] { "Pd-nsn--n-a", "Vcr3s", "N-msan", "Z" }, - new String[] { "POS", "POS", "POS", "POS" }); - } - - @Test - public void testDanish() - throws Exception - { - runTest("da", null, "Dette er en test .", - new String[] { "PD", "VA", "PI", "NC", "XP" }, - new String[] { "POS_PRON", "POS_VERB", "POS_PRON", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testEnglish() - throws Exception - { - runTest("en", null, "This is a test .", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "A neural net .", - new String[] { "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "John is purchasing oranges .", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testFarsi() - throws Exception - { - runTest("fa", null, "این یک تست است . \n", - new String[] { "DET", "PRO", "N_SING", "V_COP", "DELM" }, - new String[] { "POS_DET", "POS_PRON", "POS_NOUN", "POS_VERB", "POS_PUNCT" }); - } - - @Test - public void testGerman() - throws Exception - { - runTest("de", null, "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testHungarian() - throws Exception - { - runTest("hu", null, "Ez egy teszt .", - new String[] { "NOUN", "ART", "NOUN", "PUNCT" }, - new String[] { "POS", "POS", "POS", "POS" }); - } - - @Test - public void testPortuguese() - throws Exception - { - runTest("pt", null, "Este é um teste .", - new String[] {"pron-det", "v-fin", "art", "n", "punc" }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("pt", "tbchp", "Este é um teste .", - new String[] { "D", "SR-P", "D-UM", "N", "." }, - new String[] { "POS", "POS", "POS", "POS", "POS" }); - - runTest("pt", "mm", "Este é um teste .", - new String[] { "PROSUB", "V", "ART", "N", "." }, - new String[] { "POS", "POS", "POS", "POS", "POS" }); - - runTest("pt", "bosque", "Este é um teste .", - new String[] { "pron-det", "v-fin", "art", "n", "punc" }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testRussian() - throws Exception - { - runTest("ru", null, "Это тест .", - new String[] { "A", "S", "PUNC" }, - new String[] { "POS", "POS", "POS" }); - } - - @Test - public void testSlovenian() - throws Exception - { - runTest("sl", null, "To je test .", - new String[] { "zaimek-kazalni", "glagol-pomožni", "samostalnik-občno_ime", "PUNC" }, - new String[] { "POS", "POS", "POS", "POS" }); - } - - @Test - public void testSwedish() - throws Exception - { - runTest("sv", null, "Detta är ett test .", - new String[] { "PN_NEU_SIN_DEF_SUB/OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", "NN_NEU_SIN_IND_NOM", "DL_MAD" }, - new String[] { "POS_X", "POS_X", "POS_X", "POS_X", "POS_X" }); - - runTest("sv", "paroletags", "Detta är ett test .", - new String[] { "PF@NS0@S", "V@IPAS", "DI@NS@S", "NCNSN@IS", "FE" }, - new String[] { "POS", "POS", "POS", "POS", "POS" }); - - runTest("sv", "suctags", "Detta är ett test .", - new String[] { "PN_NEU_SIN_DEF_SUB/OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", "NN_NEU_SIN_IND_NOM", "DL_MAD" }, - new String[] { "POS_X", "POS_X", "POS_X", "POS_X", "POS_X" }); - -// runTest("sv", "suc2x", "Detta är ett test .", -// new String[] { "PN_NEU_SIN_DEF_SUB@OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", "NN_NEU_SIN_IND_NOM", "MAD" }, -// new String[] { "O", "O", "O", "O", "O" }); - } - - @Test -// @Ignore("Platform specific") - public void testOddCharacters() - throws Exception - { - runTest("en", null, "² § ¶ § °", - new String[] { "NNP", "NNP", "NNP", "NNP", "NNP" }, - new String[] { "POS_PROPN", "POS_PROPN", "POS_PROPN", "POS_PROPN", "POS_PROPN"}); - } - - /** - * Generate a very large document and test it. - * @throws Exception if an error occurs. - */ - @Test - @Ignore("Takes too long") - public void hugeDocumentTest() - throws Exception - { - // Start Java with -Xmx512m - boolean run = Runtime.getRuntime().maxMemory() > (500000000); - if (!run) { - System.out.println("Test requires more heap than available, skipping"); - } - Assume.assumeTrue(run); - - String text = "This is a test .\n"; - int reps = 4000000 / text.length(); - String testString = repeat(text, " ", reps); - - AnalysisEngineDescription engine = createEngineDescription(HunPosTagger.class); - JCas jcas = TestRunner.runTest(engine, "en", testString); - List actualTags = new ArrayList(select(jcas, POS.class)); - assertEquals(reps * 5, actualTags.size()); - - // test POS annotations - String[] expectedTags = new String[] { "DT", "VBZ", "DT", "NN", "." }; - String[] expectedTagClasses = new String[] { "POS_ART", "POS_V", "POS_ART", "POS_NN", "POS_PUNC" }; - - for (int i = 0; i < actualTags.size(); i++) { - POS posAnnotation = actualTags.get(i); - assertEquals("In position "+i, expectedTagClasses[i%5], posAnnotation.getType().getShortName()); - assertEquals("In position "+i, expectedTags[i%5], posAnnotation.getPosValue()); - } - - System.out.println("Successfully tagged document with " + testString.length() + - " characters"); - } - - /** - * Test using the same AnalysisEngine multiple times. - * @throws Exception if an error occurs. - */ - @Test - @Ignore("Takes too long") - public void multiDocumentTest() - throws Exception - { - String testDocument = "This is a test ."; - String[] tags = new String[] { "DT", "VBZ", "DT", "NN", "." }; - String[] tagClasses = new String[] { "POS_ART", "POS_V", "POS_ART", "POS_NN", "POS_PUNC" }; - - AnalysisEngine engine = createEngine(HunPosTagger.class); - - HideOutput hideOut = new HideOutput(); - try { - for (int n = 0; n < 100; n++) { - JCas aJCas = TestRunner.runTest(engine, "en", testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(aJCas, POS.class)); - } - } - finally { - engine.destroy(); - hideOut.restoreOutput(); - } - } - - private JCas runTest(String language, String variant, String testDocument, String[] tags, - String[] tagClasses) - throws Exception - { - AnalysisEngine engine = createEngine(HunPosTagger.class, - HunPosTagger.PARAM_VARIANT, variant, - HunPosTagger.PARAM_PRINT_TAGSET, true); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); - - return jcas; - } - - @Rule - public TestName name = new TestName(); - - @Before - public void printSeparator() - { - System.out.println("\n=== " + name.getMethodName() + " ====================="); - } -} diff --git a/dkpro-core-hunpos-asl/src/test/java/org/dkpro/core/hunpos/HunPosTaggerTest.java b/dkpro-core-hunpos-asl/src/test/java/org/dkpro/core/hunpos/HunPosTaggerTest.java new file mode 100644 index 0000000000..57a4aa6709 --- /dev/null +++ b/dkpro-core-hunpos-asl/src/test/java/org/dkpro/core/hunpos/HunPosTaggerTest.java @@ -0,0 +1,294 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.hunpos; + +import static org.apache.commons.lang3.StringUtils.repeat; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.testing.util.HideOutput; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.TestRunner; +import org.junit.Assume; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; + +public class HunPosTaggerTest +{ + @Before + public void prepare() + { + Assume.assumeFalse("HunPos currently hangs indefinitely on Windows: Issue #1099", + System.getProperty("os.name").toLowerCase(Locale.US).contains("win")); + Assume.assumeTrue("HunPos does not run on OS X Catalina or higher", + System.getProperty("os.name").toLowerCase(Locale.US).contains("mac") && + !System.getProperty("os.version").matches("10\\.([0-9]|1[0-4]).*")); + } + +// @Test +// public void testCatalan() +// throws Exception +// { +// runTest("ca", null, "Aquesta és una prova .", +// new String[] { "Pd-nsn--n-a", "Vcr3s", "N-msan", "Z" }, +// new String[] { "POS", "POS", "POS", "POS" }); +// } + + @Test + public void testCroatian() + throws Exception + { + runTest("hr", null, "Ovo je test .", + new String[] { "Pd-nsn--n-a", "Vcr3s", "N-msan", "Z" }, + new String[] { "POS", "POS", "POS", "POS" }); + } + + @Test + public void testDanish() + throws Exception + { + runTest("da", null, "Dette er en test .", + new String[] { "PD", "VA", "PI", "NC", "XP" }, + new String[] { "POS_PRON", "POS_VERB", "POS_PRON", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testEnglish() + throws Exception + { + runTest("en", null, "This is a test .", + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", null, "A neural net .", + new String[] { "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", null, "John is purchasing oranges .", + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testFarsi() + throws Exception + { + runTest("fa", null, "این یک تست است . \n", + new String[] { "DET", "PRO", "N_SING", "V_COP", "DELM" }, + new String[] { "POS_DET", "POS_PRON", "POS_NOUN", "POS_VERB", "POS_PUNCT" }); + } + + @Test + public void testGerman() + throws Exception + { + runTest("de", null, "Das ist ein Test .", + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testHungarian() + throws Exception + { + runTest("hu", null, "Ez egy teszt .", + new String[] { "NOUN", "ART", "NOUN", "PUNCT" }, + new String[] { "POS", "POS", "POS", "POS" }); + } + + @Test + public void testPortuguese() + throws Exception + { + runTest("pt", null, "Este é um teste .", + new String[] {"pron-det", "v-fin", "art", "n", "punc" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + + runTest("pt", "tbchp", "Este é um teste .", + new String[] { "D", "SR-P", "D-UM", "N", "." }, + new String[] { "POS", "POS", "POS", "POS", "POS" }); + + runTest("pt", "mm", "Este é um teste .", + new String[] { "PROSUB", "V", "ART", "N", "." }, + new String[] { "POS", "POS", "POS", "POS", "POS" }); + + runTest("pt", "bosque", "Este é um teste .", + new String[] { "pron-det", "v-fin", "art", "n", "punc" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testRussian() + throws Exception + { + runTest("ru", null, "Это тест .", + new String[] { "A", "S", "PUNC" }, + new String[] { "POS", "POS", "POS" }); + } + + @Test + public void testSlovenian() + throws Exception + { + runTest("sl", null, "To je test .", + new String[] { "zaimek-kazalni", "glagol-pomožni", "samostalnik-občno_ime", + "PUNC" }, + new String[] { "POS", "POS", "POS", "POS" }); + } + + @Test + public void testSwedish() + throws Exception + { + runTest("sv", null, "Detta är ett test .", + new String[] { "PN_NEU_SIN_DEF_SUB/OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", + "NN_NEU_SIN_IND_NOM", "DL_MAD" }, + new String[] { "POS", "POS", "POS", "POS", "POS" }); + + runTest("sv", "paroletags", "Detta är ett test .", + new String[] { "PF@NS0@S", "V@IPAS", "DI@NS@S", "NCNSN@IS", "FE" }, + new String[] { "POS", "POS", "POS", "POS", "POS" }); + + runTest("sv", "suctags", "Detta är ett test .", + new String[] { "PN_NEU_SIN_DEF_SUB/OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", + "NN_NEU_SIN_IND_NOM", "DL_MAD" }, + new String[] { "POS", "POS", "POS", "POS", "POS" }); + + // runTest("sv", "suc2x", "Detta är ett test .", + // new String[] { "PN_NEU_SIN_DEF_SUB@OBJ", "VB_PRS_AKT", "DT_NEU_SIN_IND", + // "NN_NEU_SIN_IND_NOM", "MAD" }, + // new String[] { "O", "O", "O", "O", "O" }); + } + + @Test +// @Ignore("Platform specific") + public void testOddCharacters() + throws Exception + { + runTest("en", null, "² § ¶ § °", + new String[] { "NNP", "NNP", "NNP", "NNP", "NNP" }, + new String[] { "POS_PROPN", "POS_PROPN", "POS_PROPN", "POS_PROPN", "POS_PROPN"}); + } + + /** + * Generate a very large document and test it. + * @throws Exception if an error occurs. + */ + @Test + @Ignore("Takes too long") + public void hugeDocumentTest() + throws Exception + { + // Start Java with -Xmx512m + boolean run = Runtime.getRuntime().maxMemory() > (500000000); + if (!run) { + System.out.println("Test requires more heap than available, skipping"); + } + Assume.assumeTrue(run); + + String text = "This is a test .\n"; + int reps = 4000000 / text.length(); + String testString = repeat(text, " ", reps); + + AnalysisEngineDescription engine = createEngineDescription(HunPosTagger.class); + JCas jcas = TestRunner.runTest(engine, "en", testString); + List actualTags = new ArrayList(select(jcas, POS.class)); + assertEquals(reps * 5, actualTags.size()); + + // test POS annotations + String[] expectedTags = { "DT", "VBZ", "DT", "NN", "." }; + String[] expectedTagClasses = { "POS_ART", "POS_V", "POS_ART", "POS_NN", "POS_PUNC" }; + + for (int i = 0; i < actualTags.size(); i++) { + POS posAnnotation = actualTags.get(i); + assertEquals("In position " + i, expectedTagClasses[i % 5], + posAnnotation.getType().getShortName()); + assertEquals("In position " + i, expectedTags[i % 5], posAnnotation.getPosValue()); + } + + System.out.println("Successfully tagged document with " + testString.length() + + " characters"); + } + + /** + * Test using the same AnalysisEngine multiple times. + * @throws Exception if an error occurs. + */ + @Test + @Ignore("Takes too long") + public void multiDocumentTest() + throws Exception + { + String testDocument = "This is a test ."; + String[] tags = { "DT", "VBZ", "DT", "NN", "." }; + String[] tagClasses = { "POS_ART", "POS_V", "POS_ART", "POS_NN", "POS_PUNC" }; + + AnalysisEngine engine = createEngine(HunPosTagger.class); + + HideOutput hideOut = new HideOutput(); + try { + for (int n = 0; n < 100; n++) { + JCas aJCas = TestRunner.runTest(engine, "en", testDocument); + + AssertAnnotations.assertPOS(tagClasses, tags, select(aJCas, POS.class)); + } + } + finally { + engine.destroy(); + hideOut.restoreOutput(); + } + } + + private JCas runTest(String language, String variant, String testDocument, String[] tags, + String[] tagClasses) + throws Exception + { + AnalysisEngine engine = createEngine(HunPosTagger.class, + HunPosTagger.PARAM_VARIANT, variant, + HunPosTagger.PARAM_PRINT_TAGSET, true); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); + + return jcas; + } + + @Rule + public TestName name = new TestName(); + + @Before + public void printSeparator() + { + System.out.println("\n=== " + name.getMethodName() + " ====================="); + } +} diff --git a/dkpro-core-hunpos-asl/src/test/resources/log4j.properties b/dkpro-core-hunpos-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9f0bdd6149..0000000000 --- a/dkpro-core-hunpos-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,12 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO - -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasWriter = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase = WARN \ No newline at end of file diff --git a/dkpro-core-hunpos-asl/src/test/resources/log4j2.xml b/dkpro-core-hunpos-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-hunpos-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-icu-asl/pom.xml b/dkpro-core-icu-asl/pom.xml index e7f8250b83..9412acc80d 100644 --- a/dkpro-core-icu-asl/pom.xml +++ b/dkpro-core-icu-asl/pom.xml @@ -1,11 +1,11 @@ + + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-illinoisnlp-asl + jar + DKPro Core ASL - Illinois Cognitive Computation Group NLP (v ${illinois-cogcomp-nlp.version}) (academic use) + https://dkpro.github.io/dkpro-core/ + + 4.0.7 + 6g + + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.apache.commons + commons-lang3 + + + commons-io + commons-io + + + edu.illinois.cs.cogcomp + illinois-pos + ${illinois-cogcomp-nlp.version} + + + edu.illinois.cs.cogcomp + LBJava + 1.3.0 + + + weka-stable + nz.ac.waikato.cms.weka + + + + + edu.illinois.cs.cogcomp + illinois-core-utilities + ${illinois-cogcomp-nlp.version} + + + edu.illinois.cs.cogcomp + illinois-chunker + ${illinois-cogcomp-nlp.version} + + + edu.illinois.cs.cogcomp + illinois-ner + ${illinois-cogcomp-nlp.version} + + + edu.stanford.nlp + stanford-corenlp + + + + + edu.illinois.cs.cogcomp + illinois-tokenizer + ${illinois-cogcomp-nlp.version} + + + edu.illinois.cs.cogcomp + illinois-lemmatizer + ${illinois-cogcomp-nlp.version} + + + stanford-corenlp + edu.stanford.nlp + + + mysql + mysql-connector-java + + + + + org.dkpro.core + dkpro-core-api-metadata-asl + + + org.dkpro.core + dkpro-core-api-resources-asl + + + org.dkpro.core + dkpro-core-api-lexmorph-asl + + + org.dkpro.core + dkpro-core-api-ner-asl + + + org.dkpro.core + dkpro-core-api-syntax-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + org.dkpro.core + dkpro-core-testing-asl + test + + + junit + junit + test + + + \ No newline at end of file diff --git a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisChunker.java b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisChunker.java similarity index 79% rename from dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisChunker.java rename to dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisChunker.java index f0a36c4a0e..dbba5acb38 100644 --- a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisChunker.java +++ b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisChunker.java @@ -15,10 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; + import java.io.IOException; import java.net.URL; import java.util.List; @@ -29,30 +30,36 @@ import org.apache.uima.cas.CAS; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.illinoisnlp.internal.ConvertToIllinois; +import org.dkpro.core.illinoisnlp.internal.ConvertToUima; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; -import de.tudarmstadt.ukp.dkpro.core.lbj.internal.ConvertToIllinois; -import de.tudarmstadt.ukp.dkpro.core.lbj.internal.ConvertToUima; import edu.illinois.cs.cogcomp.annotation.Annotator; import edu.illinois.cs.cogcomp.annotation.AnnotatorException; import edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator; import edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Wrapper for the Illinois chunker from the Cognitive Computation Group (CCG). */ -@ResourceMetaData(name="Illinois CCG Chunker") +@Component(OperationType.CHUNKER) +@ResourceMetaData(name = "Illinois CCG Chunker") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", @@ -60,32 +67,24 @@ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk" }) +@LanguageCapability("en") public class IllinoisChunker extends JCasAnnotator_ImplBase { - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - /** * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected boolean printTagSet; // public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; // @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) // private String modelLocation; + /** + * Use this language instead of the document language. + */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) private String language; @@ -98,8 +97,9 @@ public class IllinoisChunker * Load the chunk tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_CHUNK_MAPPING_LOCATION = ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false, defaultValue="classpath:/de/tudarmstadt/ukp/dkpro/core/api/syntax/tagset/en-conll2000-chunk.map") + public static final String PARAM_CHUNK_MAPPING_LOCATION = + ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false, defaultValue = "classpath:/org/dkpro/core/api/syntax/tagset/en-conll2000-chunk.map") protected String chunkMappingLocation; private ModelProviderBase modelProvider; @@ -163,12 +163,11 @@ protected ChunkerAnnotator produceResource(URL aUrl) throws IOException } @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException + public void process(JCas aJCas) throws AnalysisEngineProcessException { - CAS cas = aJCas.getCas(); + CAS cas = aJCas.getCas(); - modelProvider.configure(cas); + modelProvider.configure(cas); mappingProvider.configure(cas); ConvertToIllinois converter = new ConvertToIllinois(); @@ -183,10 +182,10 @@ public void process(JCas aJCas) } for (Sentence s : select(aJCas, Sentence.class)) { - // Get tokens from CAS - List casTokens = selectCovered(aJCas, Token.class, s); - - ConvertToUima.convertChunks(aJCas, casTokens, document, mappingProvider, internTags); + // Get tokens from CAS + List casTokens = selectCovered(aJCas, Token.class, s); + + ConvertToUima.convertChunks(aJCas, casTokens, document, mappingProvider); } } } diff --git a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisLemmatizer.java b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisLemmatizer.java similarity index 84% rename from dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisLemmatizer.java rename to dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisLemmatizer.java index e239cacdfd..8172b79046 100644 --- a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisLemmatizer.java +++ b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisLemmatizer.java @@ -15,10 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; + import java.io.IOException; import java.net.URL; import java.util.List; @@ -31,20 +32,25 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.illinoisnlp.internal.ConvertToIllinois; +import org.dkpro.core.illinoisnlp.internal.ConvertToUima; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.lbj.internal.ConvertToIllinois; -import de.tudarmstadt.ukp.dkpro.core.lbj.internal.ConvertToUima; import edu.illinois.cs.cogcomp.annotation.Annotator; import edu.illinois.cs.cogcomp.annotation.AnnotatorException; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Lemmatizer from the Cognitive Computation Group at University of Illinois at Urbana-Champaign. */ -@ResourceMetaData(name="Illinois CCG Lemmatizer") +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "Illinois CCG Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -75,7 +81,8 @@ protected Annotator produceResource(URL aUrl) throws IOException throw new IllegalArgumentException("Only language [en] is supported"); } - Annotator annotator = new edu.illinois.cs.cogcomp.nlp.lemmatizer.IllinoisLemmatizer(); + Annotator annotator = + new edu.illinois.cs.cogcomp.nlp.lemmatizer.IllinoisLemmatizer(); return annotator; } diff --git a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisNamedEntityRecognizer.java b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisNamedEntityRecognizer.java similarity index 87% rename from dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisNamedEntityRecognizer.java rename to dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisNamedEntityRecognizer.java index 7a0a38a7da..7a11c839e2 100644 --- a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisNamedEntityRecognizer.java +++ b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisNamedEntityRecognizer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; import java.io.IOException; import java.net.URL; @@ -30,24 +30,29 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.illinoisnlp.internal.ConvertToIllinois; +import org.dkpro.core.illinoisnlp.internal.ConvertToUima; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.lbj.internal.ConvertToIllinois; -import de.tudarmstadt.ukp.dkpro.core.lbj.internal.ConvertToUima; import edu.illinois.cs.cogcomp.annotation.Annotator; import edu.illinois.cs.cogcomp.annotation.AnnotatorException; import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; import edu.illinois.cs.cogcomp.ner.NERAnnotator; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Wrapper for the Illinois named entity recognizer from the Cognitive Computation Group (CCG). */ -@ResourceMetaData(name="Illinois CCG Named Entity Recognizer") +@Component(OperationType.NAMED_ENTITITY_RECOGNIZER) +@ResourceMetaData(name = "Illinois CCG Named Entity Recognizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, @@ -56,16 +61,6 @@ public class IllinoisNamedEntityRecognizer extends JCasAnnotator_ImplBase { - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - /** * Log the tag set(s) when a model is loaded. */ @@ -98,7 +93,8 @@ public class IllinoisNamedEntityRecognizer // /** // * Location of the mapping file for named entity tags to UIMA types. // */ -// public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; +// public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = +// ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; // @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) protected String mappingLocation; @@ -190,6 +186,6 @@ public void process(JCas aJCas) throw new IllegalStateException(e); } - ConvertToUima.convertNamedEntity(aJCas, document, mappingProvider, internTags); + ConvertToUima.convertNamedEntity(aJCas, document, mappingProvider); } } diff --git a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisPosTagger.java b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisPosTagger.java similarity index 79% rename from dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisPosTagger.java rename to dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisPosTagger.java index 67e3607a10..dc58020b4c 100644 --- a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisPosTagger.java +++ b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisPosTagger.java @@ -1,197 +1,200 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.lbj; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.IOException; -import java.net.URL; -import java.util.List; - -import org.apache.commons.lang3.reflect.FieldUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.lbj.internal.ConvertToIllinois; -import de.tudarmstadt.ukp.dkpro.core.lbj.internal.ConvertToUima; -import edu.illinois.cs.cogcomp.annotation.Annotator; -import edu.illinois.cs.cogcomp.annotation.AnnotatorException; -import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; -import edu.illinois.cs.cogcomp.lbjava.learn.Learner; -import edu.illinois.cs.cogcomp.pos.POSAnnotator; -import edu.illinois.cs.cogcomp.pos.lbjava.POSTagger; -import edu.illinois.cs.cogcomp.pos.lbjava.POSTaggerKnown; -import edu.illinois.cs.cogcomp.pos.lbjava.POSTaggerUnknown; - -/** - * Wrapper for the Illinois POS-tagger from the Cognitive Computation Group (CCG). - */ -@ResourceMetaData(name="Illinois CCG POS-Tagger") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) - -public class IllinoisPosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - -// public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; -// @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) -// private String modelLocation; - - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - private String language; - -// public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; -// @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) -// private String variant; - - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false, defaultValue="classpath:/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/en-lbj-pos.map") - private String posMappingLocation; - - private ModelProviderBase modelProvider; - - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - modelProvider = new ModelProviderBase() { - { - setContextObject(IllinoisPosTagger.this); - setDefault(LOCATION, NOT_REQUIRED); - } - - @Override - protected POSAnnotator produceResource(URL aUrl) throws IOException - { - if (!"en".equals(getAggregatedProperties().getProperty(LANGUAGE))) { - throw new IllegalArgumentException("Only language [en] is supported"); - } - - POSAnnotator annotator = new POSAnnotator(false); - - SingletonTagset tags = new SingletonTagset(POS.class, "ptb"); - - try { - POSTagger trainedTagger = (POSTagger) FieldUtils - .readField(annotator, "tagger", true); - Learner known = (POSTaggerKnown) FieldUtils.readField(trainedTagger, "taggerKnown", - true); - for (int i = 0; i < known.getLabelLexicon().size(); i++) { - tags.add(known.getLabelLexicon().lookupKey(i).getStringValue()); - } - - Learner unknown = (POSTaggerUnknown) FieldUtils.readField(trainedTagger, - "taggerUnknown", true); - for (int i = 0; i < unknown.getLabelLexicon().size(); i++) { - tags.add(unknown.getLabelLexicon().lookupKey(i).getStringValue()); - } - } - catch (IllegalAccessException e) { - throw new IllegalStateException(e); - } - - addTagset(tags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return annotator; - } - }; - -// mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, -// language, taggerProvider); - - mappingProvider = new MappingProvider(); - mappingProvider.setDefault(MappingProvider.LOCATION, posMappingLocation); - mappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName()); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - mappingProvider.configure(cas); - - ConvertToIllinois converter = new ConvertToIllinois(); - TextAnnotation document = converter.convert(aJCas); - - // Run tagger - try { - modelProvider.getResource().getView(document); - } - catch (AnnotatorException e) { - throw new IllegalStateException(e); - } - - for (Sentence s : select(aJCas, Sentence.class)) { - // Get tokens from CAS - List casTokens = selectCovered(aJCas, Token.class, s); - - ConvertToUima.convertPOSs(aJCas, casTokens, document, mappingProvider, internTags); - } - } +/* + * Copyright 2016 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.illinoisnlp; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; + +import java.io.IOException; +import java.net.URL; +import java.util.List; + +import org.apache.commons.lang3.reflect.FieldUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.illinoisnlp.internal.ConvertToIllinois; +import org.dkpro.core.illinoisnlp.internal.ConvertToUima; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.illinois.cs.cogcomp.annotation.Annotator; +import edu.illinois.cs.cogcomp.annotation.AnnotatorException; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.lbjava.learn.Learner; +import edu.illinois.cs.cogcomp.pos.POSAnnotator; +import edu.illinois.cs.cogcomp.pos.lbjava.POSTagger; +import edu.illinois.cs.cogcomp.pos.lbjava.POSTaggerKnown; +import edu.illinois.cs.cogcomp.pos.lbjava.POSTaggerUnknown; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Wrapper for the Illinois POS-tagger from the Cognitive Computation Group (CCG). + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "Illinois CCG POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("en") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) +public class IllinoisPosTagger + extends JCasAnnotator_ImplBase +{ + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + +// public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; +// @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) +// private String modelLocation; + + /** + * Use this language instead of the document language. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + +// public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; +// @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) +// private String variant; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false, + defaultValue = "classpath:/org/dkpro/core/api/lexmorph/tagset/en-lbj-pos.map") + private String posMappingLocation; + + private ModelProviderBase modelProvider; + + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + modelProvider = new ModelProviderBase() { + { + setContextObject(IllinoisPosTagger.this); + setDefault(LOCATION, NOT_REQUIRED); + } + + @Override + protected POSAnnotator produceResource(URL aUrl) throws IOException + { + if (!"en".equals(getAggregatedProperties().getProperty(LANGUAGE))) { + throw new IllegalArgumentException("Only language [en] is supported"); + } + + POSAnnotator annotator = new POSAnnotator(false); + + SingletonTagset tags = new SingletonTagset(POS.class, "ptb"); + + try { + POSTagger trainedTagger = (POSTagger) FieldUtils + .readField(annotator, "tagger", true); + Learner known = (POSTaggerKnown) FieldUtils.readField(trainedTagger, "taggerKnown", + true); + for (int i = 0; i < known.getLabelLexicon().size(); i++) { + tags.add(known.getLabelLexicon().lookupKey(i).getStringValue()); + } + + Learner unknown = (POSTaggerUnknown) FieldUtils.readField(trainedTagger, + "taggerUnknown", true); + for (int i = 0; i < unknown.getLabelLexicon().size(); i++) { + tags.add(unknown.getLabelLexicon().lookupKey(i).getStringValue()); + } + } + catch (IllegalAccessException e) { + throw new IllegalStateException(e); + } + + addTagset(tags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return annotator; + } + }; + +// mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, +// language, taggerProvider); + + mappingProvider = new MappingProvider(); + mappingProvider.setDefault(MappingProvider.LOCATION, posMappingLocation); + mappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName()); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + modelProvider.configure(cas); + mappingProvider.configure(cas); + + ConvertToIllinois converter = new ConvertToIllinois(); + TextAnnotation document = converter.convert(aJCas); + + // Run tagger + try { + modelProvider.getResource().getView(document); + } + catch (AnnotatorException e) { + throw new IllegalStateException(e); + } + + for (Sentence s : select(aJCas, Sentence.class)) { + // Get tokens from CAS + List casTokens = selectCovered(aJCas, Token.class, s); + + ConvertToUima.convertPOSs(aJCas, casTokens, document, mappingProvider); + } + } } diff --git a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisSegmenter.java b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisSegmenter.java similarity index 89% rename from dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisSegmenter.java rename to dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisSegmenter.java index 19731802ce..b8762153ab 100644 --- a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisSegmenter.java +++ b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisSegmenter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -23,17 +23,19 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; import edu.illinois.cs.cogcomp.core.datastructures.IntPair; import edu.illinois.cs.cogcomp.nlp.tokenizer.IllinoisTokenizer; import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer; import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Illinois segmenter. */ -@ResourceMetaData(name="Illinois CCG Segmenter") +@ResourceMetaData(name = "Illinois CCG Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) @@ -64,7 +66,8 @@ protected void process(JCas aJCas, String text, int zoneBegin) int lastBegin = 0; for (int i : tokens.getSentenceEndTokenIndexes()) { - createSentence(aJCas, ts[lastBegin].getFirst() + zoneBegin, ts[i-1].getSecond() + zoneBegin); + createSentence(aJCas, ts[lastBegin].getFirst() + zoneBegin, + ts[i - 1].getSecond() + zoneBegin); lastBegin = i; } diff --git a/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisStatefulSegmenter.java b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisStatefulSegmenter.java new file mode 100644 index 0000000000..5268f71ef8 --- /dev/null +++ b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/IllinoisStatefulSegmenter.java @@ -0,0 +1,113 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.illinoisnlp; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import edu.illinois.cs.cogcomp.core.datastructures.IntPair; +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer; +import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer; +import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Illinois stateful segmenter. + */ +@ResourceMetaData(name = "Illinois CCG Stateful Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability(outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) +public class IllinoisStatefulSegmenter + extends SegmenterBase +{ + /** + * Split tokens on dashes. + */ + public static final String PARAM_SPLIT_ON_DASH = "splitOnDash"; + @ConfigurationParameter(name = PARAM_SPLIT_ON_DASH, mandatory = true, defaultValue = "true") + private boolean splitOnDash; + + /** + * Split if there are two newlines in a row (ignoring additional newlines). + */ + public static final String PARAM_SPLIT_ON_SECOND_NL = "splitOnSecondNL"; + @ConfigurationParameter(name = PARAM_SPLIT_ON_SECOND_NL, mandatory = true, defaultValue = "false") + private boolean splitOnSecondNL; + + private Tokenizer tokenizer; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + tokenizer = new StatefulTokenizer(splitOnDash, splitOnSecondNL); + } + + @Override + protected void process(JCas aJCas, String text, int zoneBegin) + throws AnalysisEngineProcessException + { + Tokenization tokens = tokenizer.tokenizeTextSpan(text); + + IntPair[] ts = tokens.getCharacterOffsets(); + for (IntPair t : ts) { + createToken(aJCas, t.getFirst() + zoneBegin, t.getSecond() + zoneBegin); + } + + int lastBegin = 0; + for (int i : tokens.getSentenceEndTokenIndexes()) { + createSentence(aJCas, ts[lastBegin].getFirst() + zoneBegin, + ts[i - 1].getSecond() + zoneBegin); + lastBegin = i; + } + + tokens.getSentenceEndTokenIndexes(); + + +// +// for (Paragraph paragraph : paragraphs) { +// if (writeParagraph) { +// Annotation p = new de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph( +// aJCas, paragraph.getStartIndex(), paragraph.getEndIndex()); +// p.addToIndexes(); +// } +// +// for (TextUnit tu : paragraph.getTextUnits()) { +// if (isWriteSentence()) { +// createSentence(aJCas, tu.getStartIndex(), tu.getEndIndex()); +// } +// +// for (Token t : tu.getTokens()) { +// if (isWriteToken()) { +// createToken(aJCas, t.getStartIndex(), t.getEndIndex()); +// } +// } +// } +// } + } +} diff --git a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/internal/ConvertToIllinois.java b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/internal/ConvertToIllinois.java similarity index 98% rename from dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/internal/ConvertToIllinois.java rename to dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/internal/ConvertToIllinois.java index db5794f180..567419a078 100644 --- a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/internal/ConvertToIllinois.java +++ b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/internal/ConvertToIllinois.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj.internal; +package org.dkpro.core.illinoisnlp.internal; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -25,6 +25,7 @@ import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; + import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; diff --git a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/internal/ConvertToUima.java b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/internal/ConvertToUima.java similarity index 86% rename from dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/internal/ConvertToUima.java rename to dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/internal/ConvertToUima.java index 61932c44dc..378ef31efe 100644 --- a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/internal/ConvertToUima.java +++ b/dkpro-core-illinoisnlp-asl/src/main/java/org/dkpro/core/illinoisnlp/internal/ConvertToUima.java @@ -15,18 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj.internal; +package org.dkpro.core.illinoisnlp.internal; import java.util.List; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; @@ -37,7 +37,7 @@ public class ConvertToUima { public static void convertPOSs(JCas aJCas, List casTokens, TextAnnotation document, - MappingProvider mappingProvider, boolean internStrings) + MappingProvider mappingProvider) { CAS cas = aJCas.getCas(); List pos = document.getView(ViewNames.POS).getConstituents(); @@ -50,7 +50,7 @@ public static void convertPOSs(JCas aJCas, List casTokens, TextAnnotation Type posTag = mappingProvider.getTagType(tag); POS posAnno = (POS) cas.createAnnotation(posTag, p.getStartCharOffset(), p.getEndCharOffset()); - posAnno.setPosValue(internStrings ? tag.intern() : tag); + posAnno.setPosValue(tag != null ? tag.intern() : null); POSUtils.assignCoarseValue(posAnno); posAnno.addToIndexes(); casTokens.get(i).setPos(posAnno); @@ -59,7 +59,7 @@ public static void convertPOSs(JCas aJCas, List casTokens, TextAnnotation } public static void convertChunks(JCas aJCas, List casTokens, TextAnnotation document, - MappingProvider mappingProvider, boolean internStrings) + MappingProvider mappingProvider) { CAS cas = aJCas.getCas(); List pos = document.getView(ViewNames.SHALLOW_PARSE).getConstituents(); @@ -71,13 +71,13 @@ public static void convertChunks(JCas aJCas, List casTokens, TextAnnotati Type chunkTag = mappingProvider.getTagType(tag); Chunk chunkAnno = (Chunk) cas.createAnnotation(chunkTag, p.getStartCharOffset(), p.getEndCharOffset()); - chunkAnno.setChunkValue(internStrings ? tag.intern() : tag); + chunkAnno.setChunkValue(tag != null ? tag.intern() : null); chunkAnno.addToIndexes(); } } public static void convertNamedEntity(JCas aJCas, TextAnnotation document, - MappingProvider mappingProvider, boolean internStrings) + MappingProvider mappingProvider) { CAS cas = aJCas.getCas(); List ne = document.getView(ViewNames.NER_CONLL).getConstituents(); @@ -89,7 +89,7 @@ public static void convertNamedEntity(JCas aJCas, TextAnnotation document, Type neTag = mappingProvider.getTagType(tag); NamedEntity neAnno = (NamedEntity) cas.createAnnotation(neTag, p.getStartCharOffset(), p.getEndCharOffset()); - neAnno.setValue(internStrings ? tag.intern() : tag); + neAnno.setValue(tag != null ? tag.intern() : null); neAnno.addToIndexes(); } } diff --git a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisChunkerTest.java b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisChunkerTest.java similarity index 91% rename from dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisChunkerTest.java rename to dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisChunkerTest.java index 84a232dfaf..23ef1dd3a7 100644 --- a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisChunkerTest.java +++ b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisChunkerTest.java @@ -15,20 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*; +import static org.dkpro.core.testing.AssertAnnotations.assertChunks; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class IllinoisChunkerTest { diff --git a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisLemmatizerTest.java b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisLemmatizerTest.java similarity index 83% rename from dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisLemmatizerTest.java rename to dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisLemmatizerTest.java index eae9d01af4..dabd02d23f 100644 --- a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisLemmatizerTest.java +++ b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisLemmatizerTest.java @@ -15,20 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class IllinoisLemmatizerTest { @@ -46,9 +46,9 @@ public void testEnglish() AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); } - private JCas runTest(String aLanguage, String aText) - throws Exception - { + private JCas runTest(String aLanguage, String aText) + throws Exception + { AnalysisEngineDescription engine; engine = createEngineDescription( @@ -56,8 +56,8 @@ private JCas runTest(String aLanguage, String aText) createEngineDescription(IllinoisLemmatizer.class)); return TestRunner.runTest(engine, aLanguage, aText); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); } diff --git a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisNamedEntityRecognizerTest.java b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisNamedEntityRecognizerTest.java similarity index 92% rename from dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisNamedEntityRecognizerTest.java rename to dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisNamedEntityRecognizerTest.java index 62b38d178f..46ef80730c 100644 --- a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisNamedEntityRecognizerTest.java +++ b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisNamedEntityRecognizerTest.java @@ -15,19 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.*; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class IllinoisNamedEntityRecognizerTest { diff --git a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisPosTaggerTest.java b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisPosTaggerTest.java similarity index 86% rename from dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisPosTaggerTest.java rename to dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisPosTaggerTest.java index 0c35f42f7d..ff3efdafb6 100644 --- a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisPosTaggerTest.java +++ b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisPosTaggerTest.java @@ -1,90 +1,91 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.lbj; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.io.File; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; -import edu.illinois.cs.cogcomp.pos.POSTagPlain; - -public class IllinoisPosTaggerTest -{ - @Test - public void testEnglishNative() - throws Exception - { - File tempFile = File.createTempFile("dkpro", ".txt"); - FileUtils.write(tempFile, "This is a test ."); - POSTagPlain.main(new String[] { tempFile.getAbsolutePath() }); - } - - @Test - public void testEnglish() - throws Exception - { - runTest("en", null, "This is a test . \n", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "A neural net . \n", - new String[] { "DT", "NN", "NN", "." }, - new String[] { "POS_DET", "POS_NOUN", "POS_NOUN", "POS_PUNCT" }); - - JCas jcas = runTest("en", null, "John is purchasing oranges . \n", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - - String[] posTags = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "CC", "CD", "DT", - "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "PDT", - "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", - "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; - - String[] unmappedPos = {}; - - AssertAnnotations.assertTagset(POS.class, "ptb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); - } - - private JCas runTest(String language, String variant, String testDocument, String[] tags, - String[] tagClasses) - throws Exception - { - AnalysisEngine engine = createEngine(IllinoisPosTagger.class); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); - - return jcas; - } - - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} +/* + * Copyright 2016 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.illinoisnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.io.File; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import edu.illinois.cs.cogcomp.pos.POSTagPlain; + +public class IllinoisPosTaggerTest +{ + @Test + public void testEnglishNative() + throws Exception + { + File tempFile = File.createTempFile("dkpro", ".txt"); + FileUtils.write(tempFile, "This is a test .", "UTF-8"); + POSTagPlain.main(new String[] { tempFile.getAbsolutePath() }); + } + + @Test + public void testEnglish() + throws Exception + { + runTest("en", null, "This is a test . \n", + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", null, "A neural net . \n", + new String[] { "DT", "NN", "NN", "." }, + new String[] { "POS_DET", "POS_NOUN", "POS_NOUN", "POS_PUNCT" }); + + JCas jcas = runTest("en", null, "John is purchasing oranges . \n", + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + + String[] posTags = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "CC", "CD", "DT", + "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "PDT", + "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", + "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; + + String[] unmappedPos = {}; + + AssertAnnotations.assertTagset(POS.class, "ptb", posTags, jcas); + AssertAnnotations.assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); + } + + private JCas runTest(String language, String variant, String testDocument, String[] tags, + String[] tagClasses) + throws Exception + { + AnalysisEngine engine = createEngine(IllinoisPosTagger.class); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); + + return jcas; + } + + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisSegmenterTest.java b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisSegmenterTest.java similarity index 90% rename from dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisSegmenterTest.java rename to dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisSegmenterTest.java index 7e1aba65e3..8e025f3ecb 100644 --- a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisSegmenterTest.java +++ b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisSegmenterTest.java @@ -15,15 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; - public class IllinoisSegmenterTest { @Test diff --git a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisStatefulSegmenterTest.java b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisStatefulSegmenterTest.java similarity index 84% rename from dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisStatefulSegmenterTest.java rename to dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisStatefulSegmenterTest.java index dca5138c9b..071b850a55 100644 --- a/dkpro-core-lbj-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisStatefulSegmenterTest.java +++ b/dkpro-core-illinoisnlp-asl/src/test/java/org/dkpro/core/illinoisnlp/IllinoisStatefulSegmenterTest.java @@ -15,15 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.lbj; +package org.dkpro.core.illinoisnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; - public class IllinoisStatefulSegmenterTest { @Test @@ -32,6 +31,6 @@ public void runHarness() { AnalysisEngineDescription aed = createEngineDescription(IllinoisStatefulSegmenter.class); - SegmenterHarness.run(aed, "de.4", "en.9", "ar.1", "zh.1", "zh.2"); + SegmenterHarness.run(aed, "de.4", "en.1", "en.9", "ar.1", "zh.1", "zh.2"); } } diff --git a/dkpro-core-illinoisnlp-asl/src/test/resources/log4j2.xml b/dkpro-core-illinoisnlp-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-illinoisnlp-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-aclanthology-asl/pom.xml b/dkpro-core-io-aclanthology-asl/pom.xml index 80684c7d8b..050990a92d 100644 --- a/dkpro-core-io-aclanthology-asl/pom.xml +++ b/dkpro-core-io-aclanthology-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.aclanthology-asl + dkpro-core-io-aclanthology-asl jar DKPro Core ASL - IO - ACL Anthology + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,12 +45,16 @@ icu4j - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -57,8 +62,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl test diff --git a/dkpro-core-io-aclanthology-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/package-info.java b/dkpro-core-io-aclanthology-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/package-info.java deleted file mode 100644 index 8e14f612ee..0000000000 --- a/dkpro-core-io-aclanthology-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Reader for the ACL Anthology Corpus. - * - * @since 1.3.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.aclanthology; \ No newline at end of file diff --git a/dkpro-core-io-aclanthology-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/AclAnthologyReader.java b/dkpro-core-io-aclanthology-asl/src/main/java/org/dkpro/core/io/aclanthology/AclAnthologyReader.java similarity index 84% rename from dkpro-core-io-aclanthology-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/AclAnthologyReader.java rename to dkpro-core-io-aclanthology-asl/src/main/java/org/dkpro/core/io/aclanthology/AclAnthologyReader.java index 8a629cc8b9..541a6e3231 100644 --- a/dkpro-core-io-aclanthology-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/AclAnthologyReader.java +++ b/dkpro-core-io-aclanthology-asl/src/main/java/org/dkpro/core/io/aclanthology/AclAnthologyReader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.aclanthology; +package org.dkpro.core.io.aclanthology; import static org.apache.commons.io.IOUtils.closeQuietly; @@ -30,12 +30,13 @@ import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; import com.ibm.icu.text.CharsetDetector; -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; /** *

Reads the ACL anthology corpus and outputs CASes with plain text documents.

@@ -43,10 +44,11 @@ *

The reader tries to strip out hyphenation and replace problematic characters to produce a * cleaned text. Otherwise, it is a plain text reader.

*/ -@ResourceMetaData(name="ACL Anthology Corpus Reader") +@ResourceMetaData(name = "ACL Anthology Corpus Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability(MimeTypes.TEXT_PLAIN) @TypeCapability( - outputs={ + outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) public class AclAnthologyReader extends ResourceCollectionReaderBase @@ -110,23 +112,21 @@ public void getNext(CAS aCAS) } } - private String replaceHyphens(String text) { - String lines[] = text.split("\\r?\\n"); + private String replaceHyphens(String text) + { + String[] lines = text.split("\\r?\\n"); StringBuilder sb = new StringBuilder(); for (int i = 0; i < lines.length - 1; i++) { // hyphen heuristic - if (lines[i].endsWith("-") && - lines[i+1].length() > 0 && - Character.isLowerCase(lines[i+1].charAt(0)) && - !(lines[i+1].split(" ")[0].contains("-")) - ) - { + if (lines[i].endsWith("-") && lines[i + 1].length() > 0 + && Character.isLowerCase(lines[i + 1].charAt(0)) + && !(lines[i + 1].split(" ")[0].contains("-"))) { // combine wordA[-\n]wordB into one word String[] lineA = lines[i].split(" "); - String[] lineB = lines[i+1].split(" "); - String wordA = lineA[lineA.length-1]; - wordA = wordA.substring(0, wordA.length()-1); // remove hyphen + String[] lineB = lines[i + 1].split(" "); + String wordA = lineA[lineA.length - 1]; + wordA = wordA.substring(0, wordA.length() - 1); // remove hyphen String wordB = lineB[0]; // take current line without hyphen, but with complete word @@ -142,7 +142,7 @@ private String replaceHyphens(String text) { sbTmp.append(" " + lineB[j]); } } - lines[i+1] = sbTmp.toString(); + lines[i + 1] = sbTmp.toString(); } else { sb.append(lines[i] + "\n"); diff --git a/dkpro-core-io-aclanthology-asl/src/main/java/org/dkpro/core/io/aclanthology/package-info.java b/dkpro-core-io-aclanthology-asl/src/main/java/org/dkpro/core/io/aclanthology/package-info.java new file mode 100644 index 0000000000..70ebe2d02a --- /dev/null +++ b/dkpro-core-io-aclanthology-asl/src/main/java/org/dkpro/core/io/aclanthology/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Reader for the ACL Anthology Corpus. + * + * @since 1.3.0 + */ +package org.dkpro.core.io.aclanthology; diff --git a/dkpro-core-io-aclanthology-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/AclArcReaderTest.java b/dkpro-core-io-aclanthology-asl/src/test/java/org/dkpro/core/io/aclanthology/AclArcReaderTest.java similarity index 91% rename from dkpro-core-io-aclanthology-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/AclArcReaderTest.java rename to dkpro-core-io-aclanthology-asl/src/test/java/org/dkpro/core/io/aclanthology/AclArcReaderTest.java index 511b00dbf2..fdb898f38e 100644 --- a/dkpro-core-io-aclanthology-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/aclanthology/AclArcReaderTest.java +++ b/dkpro-core-io-aclanthology-asl/src/test/java/org/dkpro/core/io/aclanthology/AclArcReaderTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.aclanthology; +package org.dkpro.core.io.aclanthology; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import static org.junit.Assert.assertEquals; @@ -23,9 +23,10 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.JCasIterable; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.io.aclanthology.AclAnthologyReader; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; public class AclArcReaderTest @@ -39,7 +40,7 @@ public void aclArcReaderTest() ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/acl/", ResourceCollectionReaderBase.PARAM_PATTERNS, "[+]**/*.txt" ); - int i=0; + int i = 0; for (JCas jcas : new JCasIterable(reader)) { DocumentMetaData md = DocumentMetaData.get(jcas); System.out.println(md.getDocumentUri()); @@ -54,4 +55,4 @@ public void aclArcReaderTest() } assertEquals(10, i); } -} \ No newline at end of file +} diff --git a/dkpro-core-io-ancora-asl/pom.xml b/dkpro-core-io-ancora-asl/pom.xml index 33b8987b43..a03b478133 100644 --- a/dkpro-core-io-ancora-asl/pom.xml +++ b/dkpro-core-io-ancora-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.ancora-asl + dkpro-core-io-ancora-asl jar DKPro Core ASL - IO - AnCora + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -40,24 +41,32 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.slf4j + slf4j-api - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-segmentation-asl + + + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -65,13 +74,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.conll-asl + org.dkpro.core + dkpro-core-io-conll-asl test diff --git a/dkpro-core-io-ancora-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/AncoraReader.java b/dkpro-core-io-ancora-asl/src/main/java/org/dkpro/core/io/ancora/AncoraReader.java similarity index 86% rename from dkpro-core-io-ancora-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/AncoraReader.java rename to dkpro-core-io-ancora-asl/src/main/java/org/dkpro/core/io/ancora/AncoraReader.java index 9b57a1eddc..160f5f35a7 100644 --- a/dkpro-core-io-ancora-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/AncoraReader.java +++ b/dkpro-core-io-ancora-asl/src/main/java/org/dkpro/core/io/ancora/AncoraReader.java @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.ancora; +package org.dkpro.core.io.ancora; -import static de.tudarmstadt.ukp.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_LEMMA; -import static de.tudarmstadt.ukp.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_POS; -import static de.tudarmstadt.ukp.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_WORD; -import static de.tudarmstadt.ukp.dkpro.core.io.ancora.internal.AncoraConstants.TAG_SENTENCE; import static java.util.Arrays.asList; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_LEMMA; +import static org.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_POS; +import static org.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_WORD; +import static org.dkpro.core.io.ancora.internal.AncoraConstants.TAG_SENTENCE; import java.io.IOException; import java.io.InputStream; @@ -44,30 +44,32 @@ import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.internal.ExtendedLogger; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.slf4j.Logger; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Read AnCora XML format. */ -@ResourceMetaData(name="AnCora XML Reader") +@ResourceMetaData(name = "AnCora XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.APPLICATION_X_ANCORA_XML}) @TypeCapability( outputs = { @@ -107,10 +109,19 @@ public class AncoraReader @ConfigurationParameter(name = PARAM_READ_SENTENCE, mandatory = true, defaultValue = "true") private boolean readSentence; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String mappingPosLocation; @@ -123,12 +134,21 @@ public class AncoraReader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; + /** + * Whether to split words containing underscores into multiple tokens. + */ public static final String PARAM_SPLIT_MULTI_WORD_TOKENS = "splitMultiWordTokens"; - @ConfigurationParameter(name = PARAM_SPLIT_MULTI_WORD_TOKENS, mandatory = true, defaultValue="true") + @ConfigurationParameter(name = PARAM_SPLIT_MULTI_WORD_TOKENS, mandatory = true, + defaultValue = "true") protected boolean splitMultiWordTokens; + /** + * Whether to ignore sentence in which any POS tags are missing. Normally, it is assumed that + * if any POS tags are present, then every token as a POS tag. + */ public static final String PARAM_DROP_SENTENCES_WITH_MISSING_POS = "dropSentencesMissingPosTags"; - @ConfigurationParameter(name = PARAM_DROP_SENTENCES_WITH_MISSING_POS, mandatory = true, defaultValue="false") + @ConfigurationParameter(name = PARAM_DROP_SENTENCES_WITH_MISSING_POS, mandatory = true, + defaultValue = "false") protected boolean dropSentencesMissingPosTags; private MappingProvider posMappingProvider; @@ -139,8 +159,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation, - posTagset, getLanguage()); + posMappingProvider = MappingProviderFactory.createPosMappingProvider(this, + mappingPosLocation, posTagset, getLanguage()); } @Override @@ -247,7 +267,7 @@ public class AncoraHandler private final StringBuilder buffer = new StringBuilder(); private JCas jcas; - private ExtendedLogger logger; + private Logger logger; public void setJCas(final JCas aJCas) { @@ -259,12 +279,12 @@ protected JCas getJCas() return jcas; } - public void setLogger(ExtendedLogger aLogger) + public void setLogger(Logger aLogger) { logger = aLogger; } - public ExtendedLogger getLogger() + public Logger getLogger() { return logger; } @@ -302,7 +322,7 @@ private void addToken(String aWord, String aLemma, String aPos) if (aPos != null && readPOS) { Type posTagType = posMappingProvider.getTagType(aPos); POS pos = (POS) getJCas().getCas().createAnnotation(posTagType, start, end); - pos.setPosValue(aPos.intern()); + pos.setPosValue(aPos != null ? aPos.intern() : null); POSUtils.assignCoarseValue(pos); pos.addToIndexes(); if (token != null) { @@ -335,7 +355,7 @@ public void startElement(String aUri, String aLocalName, String aName, sentenceStart = getBuffer().length(); } else if (wd != null && sentenceStart == -1) { - getLogger().info("Ignoring token outside sentence boundaries: ["+wd+"]"); + getLogger().info("Ignoring token outside sentence boundaries: [" + wd + "]"); } else if (wd != null && sentenceStart != -1) { String posTag = aAttributes.getValue(ATTR_POS); diff --git a/dkpro-core-io-ancora-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/internal/AncoraConstants.java b/dkpro-core-io-ancora-asl/src/main/java/org/dkpro/core/io/ancora/internal/AncoraConstants.java similarity index 92% rename from dkpro-core-io-ancora-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/internal/AncoraConstants.java rename to dkpro-core-io-ancora-asl/src/main/java/org/dkpro/core/io/ancora/internal/AncoraConstants.java index 1aec07ca55..091dcf44ad 100644 --- a/dkpro-core-io-ancora-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/internal/AncoraConstants.java +++ b/dkpro-core-io-ancora-asl/src/main/java/org/dkpro/core/io/ancora/internal/AncoraConstants.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.ancora.internal; +package org.dkpro.core.io.ancora.internal; public final class AncoraConstants { @@ -31,4 +31,5 @@ public final class AncoraConstants public static final String ATTR_ELLIPTIC = "elliptic"; public static final String ATTR_PUNCT = "punct"; public static final String ATTR_GENDER = "gen"; - public static final String ATTR_NUMBER = "num";} + public static final String ATTR_NUMBER = "num"; +} diff --git a/dkpro-core-io-ancora-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/AncoraReaderTest.java b/dkpro-core-io-ancora-asl/src/test/java/org/dkpro/core/io/ancora/AncoraReaderTest.java similarity index 88% rename from dkpro-core-io-ancora-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/AncoraReaderTest.java rename to dkpro-core-io-ancora-asl/src/test/java/org/dkpro/core/io/ancora/AncoraReaderTest.java index c14970c576..c08b2e35bb 100644 --- a/dkpro-core-io-ancora-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/ancora/AncoraReaderTest.java +++ b/dkpro-core-io-ancora-asl/src/test/java/org/dkpro/core/io/ancora/AncoraReaderTest.java @@ -15,16 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.ancora; +package org.dkpro.core.io.ancora; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.io.ancora.AncoraReader; +import org.dkpro.core.io.conll.Conll2006Writer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Writer; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class AncoraReaderTest { diff --git a/dkpro-core-io-ancora-asl/src/test/resources/log4j.properties b/dkpro-core-io-ancora-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-ancora-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-ancora-asl/src/test/resources/log4j2.xml b/dkpro-core-io-ancora-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-ancora-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-annis-asl/pom.xml b/dkpro-core-io-annis-asl/pom.xml index 92b52c2cce..910f462f43 100644 --- a/dkpro-core-io-annis-asl/pom.xml +++ b/dkpro-core-io-annis-asl/pom.xml @@ -15,18 +15,20 @@ See the License for the specific language governing permissions and limitations under the License. --> - 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.annis-asl + dkpro-core-io-annis-asl jar DKPro Core ASL - IO - ANNIS2 + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -45,24 +47,28 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -70,8 +76,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.negra-asl + org.dkpro.core + dkpro-core-io-negra-asl test diff --git a/dkpro-core-io-annis-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/annis/package-info.java b/dkpro-core-io-annis-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/annis/package-info.java deleted file mode 100644 index 9d0b5ce0db..0000000000 --- a/dkpro-core-io-annis-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/annis/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for ANNIS 2 format. - */ -package de.tudarmstadt.ukp.dkpro.core.io.annis; diff --git a/dkpro-core-io-annis-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/annis/RelAnnisWriter.java b/dkpro-core-io-annis-asl/src/main/java/org/dkpro/core/io/annis/RelAnnisWriter.java similarity index 96% rename from dkpro-core-io-annis-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/annis/RelAnnisWriter.java rename to dkpro-core-io-annis-asl/src/main/java/org/dkpro/core/io/annis/RelAnnisWriter.java index 55477333dd..96317004ba 100644 --- a/dkpro-core-io-annis-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/annis/RelAnnisWriter.java +++ b/dkpro-core-io-annis-asl/src/main/java/org/dkpro/core/io/annis/RelAnnisWriter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.annis; +package org.dkpro.core.io.annis; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -44,20 +44,25 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * This Consumer outputs the content of all CASes into the relAnnis file format. The produced files * can be fed into Annis2 (http://www.sfb632.uni-potsdam.de/d1/annis/) to visualize the data. e.g. * constituent and dependency structure. */ -@ResourceMetaData(name="RelANNIS Writer") +@Component(OperationType.WRITER) +@ResourceMetaData(name = "RelANNIS Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", @@ -78,8 +83,6 @@ public class RelAnnisWriter /** * Write part-of-speech information. - * - * Default: {@code true} */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") @@ -87,8 +90,6 @@ public class RelAnnisWriter /** * Write lemma information. - * - * Default: {@code true} */ public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") @@ -96,17 +97,14 @@ public class RelAnnisWriter /** * Write constituent structure information. - * - * Default: {@code true} */ - public static final String PARAM_WRITE_CONSTITUENT = ComponentParameters.PARAM_WRITE_CONSTITUENT; + public static final String PARAM_WRITE_CONSTITUENT = + ComponentParameters.PARAM_WRITE_CONSTITUENT; @ConfigurationParameter(name = PARAM_WRITE_CONSTITUENT, mandatory = true, defaultValue = "true") private boolean writeConstituents; /** * Write dependency relation information. - * - * Default: {@code true} */ public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") diff --git a/dkpro-core-io-annis-asl/src/main/java/org/dkpro/core/io/annis/package-info.java b/dkpro-core-io-annis-asl/src/main/java/org/dkpro/core/io/annis/package-info.java new file mode 100644 index 0000000000..ae3986bbad --- /dev/null +++ b/dkpro-core-io-annis-asl/src/main/java/org/dkpro/core/io/annis/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for ANNIS 2 format. + */ +package org.dkpro.core.io.annis; diff --git a/dkpro-core-io-annis-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/annis/RelAnnisWriterTest.java b/dkpro-core-io-annis-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/annis/RelAnnisWriterTest.java deleted file mode 100644 index aab8e0da76..0000000000 --- a/dkpro-core-io-annis-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/annis/RelAnnisWriterTest.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.annis; - -import static org.apache.commons.io.FileUtils.*; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.*; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -import de.tudarmstadt.ukp.dkpro.core.io.negra.NegraExportReader; - -public class RelAnnisWriterTest -{ - @Rule - public TemporaryFolder workspace = new TemporaryFolder(); - - @Test - public void tuebaTest() - throws Exception - { - // create NegraExportReader output - CollectionReaderDescription reader = createReaderDescription(NegraExportReader.class, - NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/tueba/input/tueba-sample.export", - NegraExportReader.PARAM_LANGUAGE, "de", -// NegraExportReader.PARAM_READ_PENN_TREE, false, - NegraExportReader.PARAM_SOURCE_ENCODING, "UTF-8"); - - AnalysisEngineDescription writer = createEngineDescription(RelAnnisWriter.class, - RelAnnisWriter.PARAM_PATH, workspace.getRoot().getPath()); - - SimplePipeline.runPipeline(reader, writer); - - // Check if the output matches the reference output - for (File f : workspace.getRoot().listFiles()) { - System.out.print("Checking ["+f.getName()+"]... "); - if( - readFileToString(new File("src/test/resources/tueba/reference", f.getName()), "UTF-8").equals( - readFileToString(f, "UTF-8"))) { - System.out.println("ok."); - } - else { - System.out.println("FAIL."); - } - } - - // Check if the output matches the reference output - for (File f : workspace.getRoot().listFiles()) { - assertTrue(contentEqualsIgnoreEOL( - new File("src/test/resources/tueba/reference", f.getName()), f, "UTF-8")); - } - } -} diff --git a/dkpro-core-io-annis-asl/src/test/java/org/dkpro/core/io/annis/RelAnnisWriterTest.java b/dkpro-core-io-annis-asl/src/test/java/org/dkpro/core/io/annis/RelAnnisWriterTest.java new file mode 100644 index 0000000000..c60ff12660 --- /dev/null +++ b/dkpro-core-io-annis-asl/src/test/java/org/dkpro/core/io/annis/RelAnnisWriterTest.java @@ -0,0 +1,76 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.annis; + +import static org.apache.commons.io.FileUtils.contentEqualsIgnoreEOL; +import static org.apache.commons.io.FileUtils.readFileToString; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertTrue; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.annis.RelAnnisWriter; +import org.dkpro.core.io.negra.NegraExportReader; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class RelAnnisWriterTest +{ + @Rule + public TemporaryFolder workspace = new TemporaryFolder(); + + @Test + public void tuebaTest() + throws Exception + { + // create NegraExportReader output + CollectionReaderDescription reader = createReaderDescription(NegraExportReader.class, + NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/tueba/input/tueba-sample.export", + NegraExportReader.PARAM_LANGUAGE, "de", +// NegraExportReader.PARAM_READ_PENN_TREE, false, + NegraExportReader.PARAM_SOURCE_ENCODING, "UTF-8"); + + AnalysisEngineDescription writer = createEngineDescription(RelAnnisWriter.class, + RelAnnisWriter.PARAM_PATH, workspace.getRoot().getPath()); + + SimplePipeline.runPipeline(reader, writer); + + // Check if the output matches the reference output + for (File f : workspace.getRoot().listFiles()) { + System.out.print("Checking [" + f.getName() + "]... "); + if (readFileToString(new File("src/test/resources/tueba/reference", f.getName()), + "UTF-8").equals(readFileToString(f, "UTF-8"))) { + System.out.println("ok."); + } + else { + System.out.println("FAIL."); + } + } + + // Check if the output matches the reference output + for (File f : workspace.getRoot().listFiles()) { + assertTrue(contentEqualsIgnoreEOL( + new File("src/test/resources/tueba/reference", f.getName()), f, "UTF-8")); + } + } +} diff --git a/dkpro-core-io-bincas-asl/pom.xml b/dkpro-core-io-bincas-asl/pom.xml index 2ca6b0a4b3..37e9801288 100644 --- a/dkpro-core-io-bincas-asl/pom.xml +++ b/dkpro-core-io-bincas-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.bincas-asl + dkpro-core-io-bincas-asl jar DKPro Core ASL - IO - UIMA Binary CAS + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,20 +45,24 @@ spring-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -65,13 +70,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl + org.dkpro.core + dkpro-core-io-text-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.performance-asl + org.dkpro.core + dkpro-core-performance-asl test @@ -80,8 +85,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasReader.java b/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasReader.java deleted file mode 100644 index 3fecb52ccd..0000000000 --- a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasReader.java +++ /dev/null @@ -1,347 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bincas; - -import static org.apache.commons.io.IOUtils.closeQuietly; -import java.io.BufferedInputStream; -import java.io.DataInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.net.MalformedURLException; -import java.util.Arrays; -import java.util.Collection; - -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.cas.SerialFormat; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.impl.CASImpl; -import org.apache.uima.cas.impl.CASMgrSerializer; -import org.apache.uima.cas.impl.Serialization; -import org.apache.uima.cas.impl.TypeSystemImpl; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.metadata.FsIndexDescription; -import org.apache.uima.resource.metadata.TypePriorities; -import org.apache.uima.resource.metadata.TypeSystemDescription; -import org.apache.uima.util.CasCreationUtils; -import org.apache.uima.util.CasIOUtils; -import org.apache.uima.util.CasLoadMode; -import org.apache.uima.util.TypeSystemUtil; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; - -/** - * UIMA Binary CAS formats reader. - */ -@ResourceMetaData(name="UIMA Binary CAS Reader") -@MimeTypeCapability({ MimeTypes.APPLICATION_X_UIMA_BINARY }) -public class BinaryCasReader - extends ResourceCollectionReaderBase -{ - private static final byte[] DKPRO_HEADER = new byte[] { 'D', 'K', 'P', 'r', 'o', '1' }; - - /** - * The location from which to obtain the type system when the CAS is stored in form 0. - */ - public static final String PARAM_TYPE_SYSTEM_LOCATION = "typeSystemLocation"; - @ConfigurationParameter(name=PARAM_TYPE_SYSTEM_LOCATION, mandatory=false) - private String typeSystemLocation; - - /** - * Determines whether the type system from a currently read file should be merged - * with the current type system - */ - public static final String PARAM_MERGE_TYPE_SYSTEM = "mergeTypeSystem"; - @ConfigurationParameter(name = PARAM_MERGE_TYPE_SYSTEM, mandatory = true, defaultValue = "false") - private boolean mergeTypeSystem; - - /** - * Add DKPro Core metadata if it is not already present in the document. - */ - public static final String PARAM_ADD_DOCUMENT_METADATA = "addDocumentMetadata"; - @ConfigurationParameter(name=PARAM_ADD_DOCUMENT_METADATA, mandatory=true, defaultValue="true") - private boolean addDocumentMetadata; - - /** - * Generate new DKPro Core document metadata (i.e. title, ID, URI) for the document instead - * of retaining what is already present in the XMI file. - */ - public static final String PARAM_OVERRIDE_DOCUMENT_METADATA = "overrideDocumentMetadata"; - @ConfigurationParameter(name=PARAM_OVERRIDE_DOCUMENT_METADATA, mandatory=true, defaultValue="false") - private boolean overrideDocumentMetadata; - - private CASMgrSerializer casMgrSerializer; - - private TypeSystemImpl typeSystem; - - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - Resource res = nextFile(); - TypeSystemImpl xts = null; - byte[] header = new byte[DKPRO_HEADER.length]; - - if (this.mergeTypeSystem) { - // type system from input file - TypeSystemDescription tsd; - - try (InputStream is = CompressionUtils.getInputStream(res.getLocation(), - res.getInputStream())) { - BufferedInputStream bis = new BufferedInputStream(is); - - getLogger().debug("Reading CAS from [" + res.getLocation() + "]"); - - // Prepare for format detection - bis.mark(32); - DataInputStream dis = new DataInputStream(bis); - dis.read(header); - - // If it is DKPro Core format, read the type system - if (Arrays.equals(header, DKPRO_HEADER)) { - xts = readDKProHeader(bis, header, xts); - } else { - // No embedded DKPro TS, reset - bis.reset(); - // Try reading an externalized type system instead - if (typeSystemLocation != null) { - xts = readTypeSystem(); - initCasFromEmbeddedTS(header, aCAS); - } - } - - if (xts != null) { - // use external type system if specified - tsd = TypeSystemUtil.typeSystem2TypeSystemDescription(xts); - } else { - // else load the CAS from the input file and use its type system - CasIOUtils.load(bis, null, aCAS, CasLoadMode.REINIT); - tsd = TypeSystemUtil.typeSystem2TypeSystemDescription(aCAS.getTypeSystem()); - } - } - - try { - // Merge the current type system with the one specified by the file being read - TypeSystemDescription mergedTypeSystem = CasCreationUtils.mergeTypeSystems(Arrays - .asList(TypeSystemUtil.typeSystem2TypeSystemDescription(typeSystem), tsd)); - - // Create a new CAS based on the merged type system - JCas mergedTypeSystemCas = CasCreationUtils.createCas(mergedTypeSystem, - (TypePriorities) null, (FsIndexDescription[]) null).getJCas(); - - // Create a holder for the CAS metadata - CASMgrSerializer casMgrSerializer = Serialization - .serializeCASMgr((mergedTypeSystemCas).getCasImpl()); - - // Reinitialize CAS with merged type system - ((CASImpl) aCAS).setupCasFromCasMgrSerializer(casMgrSerializer); - - } catch (CASException | ResourceInitializationException e) { - throw new CollectionException(e); - } - } - - // Read file again, this time into a CAS which has been prepared with the merged TS - try (InputStream is = CompressionUtils.getInputStream(res.getLocation(), - res.getInputStream())) { - BufferedInputStream bis = new BufferedInputStream(is); - bis.mark(32); - DataInputStream dis = new DataInputStream(bis); - dis.read(header); - - // If it is DKPro Core format, read the type system - if (Arrays.equals(header, DKPRO_HEADER)) { - xts = readDKProHeader(bis, header, xts); - } else { - // No embedded DKPro TS, reset - bis.reset(); - // Try reading an externalized type system instead - if (typeSystemLocation != null) { - xts = readTypeSystem(); - initCasFromEmbeddedTS(header, aCAS); - } - - } - - SerialFormat format; - if (xts != null) { - format = CasIOUtils.load(bis, aCAS, xts); - } else { - format = CasIOUtils.load(bis, aCAS); - } - getLogger().debug("Found format " + format); - } catch (IOException e) { - throw new CollectionException(e); - } - - // Initialize the JCas sub-system which is the most often used API in DKPro Core components - try { - aCAS.getJCas(); - } - catch (CASException e) { - throw new CollectionException(e); - } - - // Handle DKPro Core DocumentMetaData - AnnotationFS docAnno = aCAS.getDocumentAnnotation(); - if (docAnno.getType().getName().equals(DocumentMetaData.class.getName())) { - if (overrideDocumentMetadata) { - // Unless the language is explicity set on the reader, try to retain the language - // already present in the XMI file. - String language = getLanguage(); - if (language == null) { - language = aCAS.getDocumentLanguage(); - } - aCAS.removeFsFromIndexes(docAnno); - - initCas(aCAS, res); - - aCAS.setDocumentLanguage(language); - } - } - else if (addDocumentMetadata) { - initCas(aCAS, res); - } - } - - // Check whether this is original UIMA CAS format or DKPro Core Legacy format - private TypeSystemImpl readDKProHeader(BufferedInputStream bis, byte[] header, - TypeSystemImpl ts) throws CollectionException { - - getLogger().debug("Found DKPro-Core-style embedded type system"); - ObjectInputStream ois; - try { - ois = new ObjectInputStream(bis); - CASMgrSerializer casMgr = (CASMgrSerializer) ois.readObject(); - if (ts == null) { - ts = casMgr.getTypeSystem(); - ts.commit(); - } - } catch (IOException | ClassNotFoundException e) { - throw new CollectionException(e); - } - return ts; - } - - @Override - public void typeSystemInit(TypeSystem aTypeSystem) throws ResourceInitializationException { - if (typeSystemLocation == null) { - typeSystem = (TypeSystemImpl) aTypeSystem; - } - } - - /** - * It is possible that the type system overlaps with the scan pattern for files, e.g. because - * the type system ends in {@code .ser} and the resources also end in {@code .ser}. If this is - * the case, we filter the type system file from the resource files during scanning. - */ - @Override - protected Collection scan(String aBase, Collection aIncludes, - Collection aExcludes) - throws IOException - { - Collection resources = super.scan(aBase, aIncludes, aExcludes); - if (typeSystemLocation != null) { - org.springframework.core.io.Resource r = getTypeSystemResource(); - resources.remove(new Resource(null, null, r.getURI(), null, null, r)); - } - return resources; - } - - protected org.springframework.core.io.Resource getTypeSystemResource() throws MalformedURLException - { - org.springframework.core.io.Resource r; - // Is absolute? - if (typeSystemLocation.indexOf(':') != -1 || typeSystemLocation.startsWith("/") - || typeSystemLocation.startsWith(File.separator)) { - // If the type system location is absolute, resolve it absolute - r = getResolver().getResource(locationToUrl(typeSystemLocation)); - } - else { - // If the type system is not absolute, resolve it relative to the base location - r = getResolver().getResource(getBase() + typeSystemLocation); - } - return r; - } - - private TypeSystemImpl readTypeSystem() throws IOException { - if (typeSystemLocation == null) { - return null; - } - - if (typeSystem == null) { - CASMgrSerializer casMgr = readCasManager(); - typeSystem = casMgr.getTypeSystem(); - typeSystem.commit(); - } - - return typeSystem; - } - - private void initCasFromEmbeddedTS (byte[] header, CAS aCAS) throws IOException { - // If we encounter a Java-serialized file with an external - // TSI, then we reinitalize the CAS with the external TSI - // prior to loading the data - if (header[0] == (byte) 0xAC && header[1] == (byte) 0xED) { - CASMgrSerializer casMgr = readCasManager(); - ((CASImpl) aCAS).setupCasFromCasMgrSerializer(casMgr); - } - } - - private CASMgrSerializer readCasManager() throws IOException - { - if (typeSystemLocation == null) { - return null; - } - - // If we already read the type system, return it - do not read it again. - if (casMgrSerializer != null) { - return casMgrSerializer; - } - - org.springframework.core.io.Resource r = getTypeSystemResource(); - getLogger().debug("Reading type system from [" + r.getURI() + "]"); - - ObjectInputStream is = null; - try { - is = new ObjectInputStream(CompressionUtils.getInputStream(typeSystemLocation, - r.getInputStream())); - casMgrSerializer = (CASMgrSerializer) is.readObject(); - } - catch (ClassNotFoundException e) { - throw new IOException(e); - } - finally { - closeQuietly(is); - } - - - return casMgrSerializer; - } -} diff --git a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasReader.java b/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasReader.java deleted file mode 100644 index 3d3f2c4838..0000000000 --- a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasReader.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bincas; - -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.cas.impl.Serialization.deserializeCASComplete; - -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; - -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.impl.CASCompleteSerializer; -import org.apache.uima.cas.impl.CASImpl; -import org.apache.uima.cas.impl.CASMgrSerializer; -import org.apache.uima.cas.impl.CASSerializer; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; - -/** - * @deprecated use {@code BinaryCasReader} instead. - */ -@ResourceMetaData(name="UIMA Serialized CAS Reader") -@Deprecated -public class SerializedCasReader - extends ResourceCollectionReaderBase -{ - /** - * The file from which to obtain the type system if it is not embedded in the serialized CAS. - */ - public static final String PARAM_TYPE_SYSTEM_LOCATION = "typeSystemLocation"; - @ConfigurationParameter(name=PARAM_TYPE_SYSTEM_LOCATION, mandatory=false) - private String typeSystemLocation; - - private CASMgrSerializer casMgrSerializer; - - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - Resource res = nextFile(); - ObjectInputStream is = null; - try { - is = new ObjectInputStream(CompressionUtils.getInputStream(res.getLocation(), - res.getInputStream())); - - Object object = is.readObject(); - if (object instanceof CASCompleteSerializer) { - // Annotations and CAS metadata saved together - getLogger().debug("Reading CAS and type system from [" + res.getLocation() + "]"); - CASCompleteSerializer serializer = (CASCompleteSerializer) object; - deserializeCASComplete(serializer, (CASImpl) aCAS); - } - else if (object instanceof CASSerializer) { - // Annotations and CAS metadata saved separately - CASCompleteSerializer serializer = new CASCompleteSerializer(); - serializer.setCasMgrSerializer(readCasManager()); - serializer.setCasSerializer((CASSerializer) object); - getLogger().debug("Reading CAS from [" + res.getLocation() + "]"); - deserializeCASComplete(serializer, (CASImpl) aCAS); - } - else { - throw new IOException("Unknown serialized object found with type [" - + object.getClass().getName() + "]"); - } - } - catch (ClassNotFoundException e) { - throw new IOException(e); - } - finally { - closeQuietly(is); - } - } - - private CASMgrSerializer readCasManager() throws IOException - { - // If we already read the type system, return it - do not read it again. - if (casMgrSerializer != null) { - return casMgrSerializer; - } - - org.springframework.core.io.Resource r; - // Is absolute? - if (typeSystemLocation.indexOf(':') != -1 || typeSystemLocation.startsWith("/") - || typeSystemLocation.startsWith(File.separator)) { - // If the type system location is absolute, resolve it absolute - r = getResolver().getResource(locationToUrl(typeSystemLocation)); - } - else { - // If the type system is not absolute, resolve it relative to the base location - r = getResolver().getResource(getBase() + typeSystemLocation); - } - getLogger().debug("Reading type system from [" + r.getURI() + "]"); - - ObjectInputStream is = null; - try { - is = new ObjectInputStream(CompressionUtils.getInputStream(typeSystemLocation, - r.getInputStream())); - casMgrSerializer = (CASMgrSerializer) is.readObject(); - } - catch (ClassNotFoundException e) { - throw new IOException(e); - } - finally { - closeQuietly(is); - } - - return casMgrSerializer; - } -} diff --git a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasWriter.java b/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasWriter.java deleted file mode 100644 index 7bd834b102..0000000000 --- a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasWriter.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bincas; - -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.cas.impl.Serialization.serializeCASComplete; -import static org.apache.uima.cas.impl.Serialization.serializeCASMgr; - -import java.io.File; -import java.io.IOException; -import java.io.ObjectOutputStream; -import java.io.OutputStream; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.impl.CASCompleteSerializer; -import org.apache.uima.cas.impl.CASMgrSerializer; -import org.apache.uima.cas.impl.CASSerializer; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; - -/** - * @deprecated use {@code BinaryCasWriter} with format S instead. - */ -@ResourceMetaData(name="UIMA Serialized CAS Writer") -@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) -@Deprecated -public class SerializedCasWriter - extends JCasFileWriter_ImplBase -{ - /** - * Location to write the type system to. The type system is saved using Java serialization, it - * is not saved as a XML type system description. We recommend to use the name - * {@code typesystem.ser}. - *
- * The {@link #PARAM_COMPRESSION} parameter has no effect on the - * type system. Instead, if the type system file should be compressed or not is detected from - * the file name extension (e.g. ".gz"). - *
- * If this parameter is set, the type system and index repository are no longer serialized into - * the same file as the test of the CAS. The {@link SerializedCasReader} can currently not - * read such files. Use this only if you really know what you are doing. - */ - public static final String PARAM_TYPE_SYSTEM_LOCATION = "typeSystemLocation"; - @ConfigurationParameter(name=PARAM_TYPE_SYSTEM_LOCATION, mandatory=false) - private String typeSystemLocation; - - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name=PARAM_FILENAME_EXTENSION, mandatory=true, defaultValue=".ser") - private String filenameExtension; - - private boolean typeSystemWritten; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - // To support writing to ZIPs, the type system must be written before the CAS document - // output stream is obtained. - try { - if (typeSystemLocation != null && !typeSystemWritten) { - writeTypeSystem(aJCas); - typeSystemWritten = true; - } - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - - ObjectOutputStream docOS = null; - try { - NamedOutputStream os = getOutputStream(aJCas, filenameExtension); - docOS = new ObjectOutputStream(os); - - if (typeSystemLocation == null) { - getLogger().debug("Writing CAS and type system to [" + os + "]"); - CASCompleteSerializer serializer = serializeCASComplete(aJCas.getCasImpl()); - docOS.writeObject(serializer); - } - else { - getLogger().debug("Writing CAS to [" + os + "]"); - CASSerializer serializer = new CASSerializer(); - serializer.addCAS(aJCas.getCasImpl()); - docOS.writeObject(serializer); - } - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - finally { - closeQuietly(docOS); - } - } - - private void writeTypeSystem(JCas aJCas) - throws IOException - { - // If the type system location is an absolute file system location, write it there, - // otherwise use the default storage which places the file relative to the target location - if (!typeSystemLocation.startsWith(JAR_PREFIX) && new File(typeSystemLocation).isAbsolute()) { - OutputStream typeOS = null; - try { - typeOS = CompressionUtils.getOutputStream(new File(typeSystemLocation)); - getLogger().debug("Writing type system to [" + typeSystemLocation + "]"); - writeTypeSystem(aJCas, typeOS); - } - finally { - closeQuietly(typeOS); - } - } - else { - NamedOutputStream typeOS = null; - try { - typeOS = getOutputStream(typeSystemLocation, ""); - getLogger().debug("Writing type system to [" + typeOS + "]"); - writeTypeSystem(aJCas, typeOS); - } - finally { - closeQuietly(typeOS); - } - } - } - - private void writeTypeSystem(JCas aJCas, OutputStream aOS) - throws IOException - { - ObjectOutputStream typeOS = new ObjectOutputStream(aOS); - CASMgrSerializer casMgrSerializer = serializeCASMgr(aJCas.getCasImpl()); - typeOS.writeObject(casMgrSerializer); - typeOS.flush(); - } -} diff --git a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/package-info.java b/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/package-info.java deleted file mode 100644 index fa40ea96f9..0000000000 --- a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for fast binary serialization of UIMA CAS. Be advised that binary serialization may have - * drawbacks compared to XMI serialization. It was originally mainly meant for transferring CAS - * objects over the network or exchange them between the Java and C++ implementation. This module - * uses internal UIMA API. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bincas; diff --git a/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/BinaryCasReader.java b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/BinaryCasReader.java new file mode 100644 index 0000000000..1b81464a48 --- /dev/null +++ b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/BinaryCasReader.java @@ -0,0 +1,367 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.bincas; + +import static org.apache.commons.io.IOUtils.closeQuietly; + +import java.io.BufferedInputStream; +import java.io.DataInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.net.MalformedURLException; +import java.util.Arrays; +import java.util.Collection; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.SerialFormat; +import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.CASMgrSerializer; +import org.apache.uima.cas.impl.Serialization; +import org.apache.uima.cas.impl.TypeSystemImpl; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.metadata.FsIndexDescription; +import org.apache.uima.resource.metadata.TypePriorities; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.apache.uima.util.CasCreationUtils; +import org.apache.uima.util.CasIOUtils; +import org.apache.uima.util.CasLoadMode; +import org.apache.uima.util.TypeSystemUtil; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; + +/** + * UIMA Binary CAS formats reader. + */ +@ResourceMetaData(name = "UIMA Binary CAS Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, + ResourceCollectionReaderBase.PARAM_INCLUDE_HIDDEN, + ResourceCollectionReaderBase.PARAM_USE_DEFAULT_EXCLUDES, + ResourceCollectionReaderBase.PARAM_LOG_FREQ, + BinaryCasReader.PARAM_TYPE_SYSTEM_LOCATION }) +@MimeTypeCapability({ MimeTypes.APPLICATION_X_UIMA_BINARY }) +public class BinaryCasReader + extends ResourceCollectionReaderBase +{ + private static final byte[] DKPRO_HEADER = new byte[] { 'D', 'K', 'P', 'r', 'o', '1' }; + + /** + * The location from which to obtain the type system when the CAS is stored in form 0. + */ + public static final String PARAM_TYPE_SYSTEM_LOCATION = "typeSystemLocation"; + @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_LOCATION, mandatory = false) + private String typeSystemLocation; + + /** + * Determines whether the type system from a currently read file should be merged with the + * current type system + */ + public static final String PARAM_MERGE_TYPE_SYSTEM = "mergeTypeSystem"; + @ConfigurationParameter(name = PARAM_MERGE_TYPE_SYSTEM, mandatory = true, defaultValue = "false") + private boolean mergeTypeSystem; + + /** + * Add DKPro Core metadata if it is not already present in the document. + */ + public static final String PARAM_ADD_DOCUMENT_METADATA = "addDocumentMetadata"; + @ConfigurationParameter(name = PARAM_ADD_DOCUMENT_METADATA, mandatory = true, defaultValue = "true") + private boolean addDocumentMetadata; + + /** + * Generate new DKPro Core document metadata (i.e. title, ID, URI) for the document instead of + * retaining what is already present in the XMI file. + */ + public static final String PARAM_OVERRIDE_DOCUMENT_METADATA = "overrideDocumentMetadata"; + @ConfigurationParameter(name = PARAM_OVERRIDE_DOCUMENT_METADATA, mandatory = true, defaultValue = "false") + private boolean overrideDocumentMetadata; + + private CASMgrSerializer casMgrSerializer; + + private TypeSystemImpl typeSystem; + + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + Resource res = nextFile(); + TypeSystemImpl xts = null; + byte[] header = new byte[DKPRO_HEADER.length]; + + if (this.mergeTypeSystem) { + // type system from input file + TypeSystemDescription tsd; + + try (InputStream is = CompressionUtils.getInputStream(res.getLocation(), + res.getInputStream())) { + BufferedInputStream bis = new BufferedInputStream(is); + + getLogger().debug("Reading CAS from [" + res.getLocation() + "]"); + + // Prepare for format detection + bis.mark(32); + DataInputStream dis = new DataInputStream(bis); + dis.read(header); + + // If it is DKPro Core format, read the type system + if (Arrays.equals(header, DKPRO_HEADER)) { + xts = readDKProHeader(bis, header, xts); + } + else { + // No embedded DKPro TS, reset + bis.reset(); + // Try reading an externalized type system instead + if (typeSystemLocation != null) { + xts = readTypeSystem(); + initCasFromEmbeddedTS(header, aCAS); + } + } + + if (xts != null) { + // use external type system if specified + tsd = TypeSystemUtil.typeSystem2TypeSystemDescription(xts); + } + else { + // else load the CAS from the input file and use its type system + CasIOUtils.load(bis, null, aCAS, CasLoadMode.REINIT); + tsd = TypeSystemUtil.typeSystem2TypeSystemDescription(aCAS.getTypeSystem()); + } + } + + try { + // Merge the current type system with the one specified by the file being read + TypeSystemDescription mergedTypeSystem = CasCreationUtils.mergeTypeSystems(Arrays + .asList(TypeSystemUtil.typeSystem2TypeSystemDescription(typeSystem), tsd)); + + // Create a new CAS based on the merged type system + JCas mergedTypeSystemCas = CasCreationUtils.createCas(mergedTypeSystem, + (TypePriorities) null, (FsIndexDescription[]) null).getJCas(); + + // Create a holder for the CAS metadata + CASMgrSerializer casMgrSerializer = Serialization + .serializeCASMgr((mergedTypeSystemCas).getCasImpl()); + + // Reinitialize CAS with merged type system + ((CASImpl) aCAS).getBinaryCasSerDes() + .setupCasFromCasMgrSerializer(casMgrSerializer); + } catch (CASException | ResourceInitializationException e) { + throw new CollectionException(e); + } + } + + // Read file again, this time into a CAS which has been prepared with the merged TS + try (InputStream is = CompressionUtils.getInputStream(res.getLocation(), + res.getInputStream())) { + BufferedInputStream bis = new BufferedInputStream(is); + bis.mark(32); + DataInputStream dis = new DataInputStream(bis); + dis.read(header); + + // If it is DKPro Core format, read the type system + if (Arrays.equals(header, DKPRO_HEADER)) { + xts = readDKProHeader(bis, header, xts); + } + else { + // No embedded DKPro TS, reset + bis.reset(); + // Try reading an externalized type system instead + if (typeSystemLocation != null) { + xts = readTypeSystem(); + initCasFromEmbeddedTS(header, aCAS); + } + + } + + SerialFormat format; + if (xts != null) { + format = CasIOUtils.load(bis, aCAS, xts); + } + else { + format = CasIOUtils.load(bis, aCAS); + } + getLogger().debug("Found format " + format); + } catch (IOException e) { + throw new CollectionException(e); + } + + // Initialize the JCas sub-system which is the most often used API in DKPro Core components + try { + aCAS.getJCas(); + } + catch (CASException e) { + throw new CollectionException(e); + } + + // Handle DKPro Core DocumentMetaData + AnnotationFS docAnno = aCAS.getDocumentAnnotation(); + if (docAnno.getType().getName().equals(DocumentMetaData.class.getName())) { + if (overrideDocumentMetadata) { + // Unless the language is explicity set on the reader, try to retain the language + // already present in the XMI file. + String language = getLanguage(); + if (language == null) { + language = aCAS.getDocumentLanguage(); + } + aCAS.removeFsFromIndexes(docAnno); + + initCas(aCAS, res); + + aCAS.setDocumentLanguage(language); + } + } + else if (addDocumentMetadata) { + initCas(aCAS, res); + } + } + + // Check whether this is original UIMA CAS format or DKPro Core Legacy format + private TypeSystemImpl readDKProHeader(BufferedInputStream bis, byte[] header, + TypeSystemImpl ts) + throws CollectionException + { + getLogger().debug("Found DKPro-Core-style embedded type system"); + ObjectInputStream ois; + try { + ois = new ObjectInputStream(bis); + CASMgrSerializer casMgr = (CASMgrSerializer) ois.readObject(); + if (ts == null) { + ts = casMgr.getTypeSystem(); + ts = ts.commit(); + } + } + catch (IOException | ClassNotFoundException e) { + throw new CollectionException(e); + } + return ts; + } + + @Override + public void typeSystemInit(TypeSystem aTypeSystem) throws ResourceInitializationException + { + if (typeSystemLocation == null) { + typeSystem = (TypeSystemImpl) aTypeSystem; + } + } + + /** + * It is possible that the type system overlaps with the scan pattern for files, e.g. because + * the type system ends in {@code .ser} and the resources also end in {@code .ser}. If this is + * the case, we filter the type system file from the resource files during scanning. + */ + @Override + protected Collection scan(String aBase, Collection aIncludes, + Collection aExcludes) + throws IOException + { + Collection resources = super.scan(aBase, aIncludes, aExcludes); + if (typeSystemLocation != null) { + org.springframework.core.io.Resource r = getTypeSystemResource(); + resources.remove(new Resource(null, null, r.getURI(), null, null, r)); + } + return resources; + } + + protected org.springframework.core.io.Resource getTypeSystemResource() + throws MalformedURLException + { + org.springframework.core.io.Resource r; + // Is absolute? + if (typeSystemLocation.indexOf(':') != -1 || typeSystemLocation.startsWith("/") + || typeSystemLocation.startsWith(File.separator)) { + // If the type system location is absolute, resolve it absolute + r = getResolver().getResource(locationToUrl(typeSystemLocation)); + } + else { + // If the type system is not absolute, resolve it relative to the base location + r = getResolver().getResource(getBase() + typeSystemLocation); + } + return r; + } + + private TypeSystemImpl readTypeSystem() throws IOException { + if (typeSystemLocation == null) { + return null; + } + + if (typeSystem == null) { + CASMgrSerializer casMgr = readCasManager(); + typeSystem = casMgr.getTypeSystem(); + typeSystem = typeSystem.commit(); + } + + return typeSystem; + } + + private void initCasFromEmbeddedTS(byte[] header, CAS aCAS) throws IOException + { + // If we encounter a Java-serialized file with an external + // TSI, then we reinitalize the CAS with the external TSI + // prior to loading the data + if (header[0] == (byte) 0xAC && header[1] == (byte) 0xED) { + CASMgrSerializer casMgr = readCasManager(); + ((CASImpl) aCAS).getBinaryCasSerDes().setupCasFromCasMgrSerializer(casMgr); + } + } + + private CASMgrSerializer readCasManager() throws IOException + { + if (typeSystemLocation == null) { + return null; + } + + // If we already read the type system, return it - do not read it again. + if (casMgrSerializer != null) { + return casMgrSerializer; + } + + org.springframework.core.io.Resource r = getTypeSystemResource(); + getLogger().debug("Reading type system from [" + r.getURI() + "]"); + + ObjectInputStream is = null; + try { + is = new ObjectInputStream(CompressionUtils.getInputStream(typeSystemLocation, + r.getInputStream())); + casMgrSerializer = (CASMgrSerializer) is.readObject(); + } + catch (ClassNotFoundException e) { + throw new IOException(e); + } + finally { + closeQuietly(is); + } + + + return casMgrSerializer; + } +} diff --git a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriter.java b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/BinaryCasWriter.java similarity index 79% rename from dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriter.java rename to dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/BinaryCasWriter.java index 3d673b4389..8bd2581d42 100644 --- a/dkpro-core-io-bincas-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriter.java +++ b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/BinaryCasWriter.java @@ -15,9 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.bincas; +package org.dkpro.core.io.bincas; import static org.apache.uima.cas.SerialFormat.BINARY; +import static org.apache.uima.cas.SerialFormat.BINARY_TSI; import static org.apache.uima.cas.SerialFormat.COMPRESSED; import static org.apache.uima.cas.SerialFormat.COMPRESSED_FILTERED; import static org.apache.uima.cas.SerialFormat.COMPRESSED_FILTERED_TS; @@ -48,17 +49,22 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.CasIOUtils; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; +import eu.openminted.share.annotations.api.DocumentationResource; /** - *

Write CAS in one of the UIMA binary formats.

+ *

+ * Write CAS in one of the UIMA binary formats. + *

* - *

All the supported formats except 6+ can also be loaded and saved via the UIMA - * {@link CasIOUtils}.

+ *

+ * All the supported formats except 6+ can also be loaded and saved via the UIMA + * {@link CasIOUtils}. + *

* * * @@ -80,9 +86,9 @@ * * * - * + * * * * @@ -96,16 +102,15 @@ * * * - * + * * * * * * - * * @@ -113,25 +118,23 @@ * * * - * + * * * * * * - * + * * * * * * - * * @@ -139,8 +142,7 @@ * * * - * * @@ -151,7 +153,8 @@ * @see Compressed * Binary CASes */ -@ResourceMetaData(name="UIMA Binary CAS Writer") +@ResourceMetaData(name = "UIMA Binary CAS Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({ MimeTypes.APPLICATION_X_UIMA_BINARY }) @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) public class BinaryCasWriter @@ -181,6 +184,9 @@ public class BinaryCasWriter @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_LOCATION, mandatory = false) private String typeSystemLocation; + /** + * Binary format to produce. + */ public static final String PARAM_FORMAT = "format"; @ConfigurationParameter(name = PARAM_FORMAT, mandatory = true, defaultValue = "COMPRESSED_FILTERED_TSI") private String format; @@ -192,8 +198,9 @@ public class BinaryCasWriter * When using the old short names (e.g. 6), the default extension .bin is * used. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name=PARAM_FILENAME_EXTENSION, mandatory=true, defaultValue=AUTO) + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = AUTO) private String filenameExtension; private boolean typeSystemWritten; @@ -206,7 +213,7 @@ public void initialize(UimaContext aContext) if (AUTO.equals(filenameExtension)) { try { - filenameExtension = SerialFormat.valueOf(format).getDefaultFileExtension(); + filenameExtension = "." + SerialFormat.valueOf(format).getDefaultFileExtension(); } catch (IllegalArgumentException e) { filenameExtension = ".bin"; @@ -221,13 +228,13 @@ public void process(JCas aJCas) try (NamedOutputStream docOS = getOutputStream(aJCas, filenameExtension)) { if ("S".equals(format) || SERIALIZED.toString().equals(format)) { // Java-serialized CAS without type system - getLogger().debug("Writing CAS to [" + docOS + "]"); -// CASSerializer serializer = new CASSerializer(); + getLogger().debug("Writing CAS to [" + docOS + "]"); + // CASSerializer serializer = new CASSerializer(); // serializer.addCAS(aJCas.getCasImpl()); // ObjectOutputStream objOS = new ObjectOutputStream(docOS); // objOS.writeObject(serializer); // objOS.flush(); - CasIOUtils.save(aJCas.getCas(), docOS, SERIALIZED); + CasIOUtils.save(aJCas.getCas(), docOS, SERIALIZED); } else if ("S+".equals(format) || SERIALIZED_TSI.toString().equals(format)) { // Java-serialized CAS with type system @@ -243,9 +250,9 @@ else if ("0".equals(format) || BINARY.toString().equals(format)) { // serializeCAS(aJCas.getCas(), docOS); CasIOUtils.save(aJCas.getCas(), docOS, BINARY); } - else if (BINARY.toString().equals(format)) { - // Java-serialized CAS without type system - CasIOUtils.save(aJCas.getCas(), docOS, SerialFormat.BINARY_TSI); + else if (BINARY_TSI.toString().equals(format)) { + // Java-serialized CAS with type system + CasIOUtils.save(aJCas.getCas(), docOS, BINARY_TSI); } else if ("4".equals(format) || COMPRESSED.toString().equals(format)) { // Binary compressed CAS without type system (form 4) @@ -273,7 +280,9 @@ else if (format.equals("6+")) { } else { throw new IllegalArgumentException("Unknown format [" + format - + "]. Must be S, S+, 0, 4, 6, or 6+"); + + "]. Must be S, S+, 0, 4, 6, 6+, SERIALIZED, SERIALIZED_TSI, BINARY, " + + "BINARY_TSI, COMPRESSED_TSI, COMPRESSED_FILTERED, " + + "COMPRESSED_FILTERED_TS or COMPRESSED_FILTERED_TSI"); } } catch (Exception e) { @@ -297,8 +306,10 @@ private void writeTypeSystem(JCas aJCas) { // If the type system location is an absolute file system location, write it there, // otherwise use the default storage which places the file relative to the target location - if (!typeSystemLocation.startsWith(JAR_PREFIX) && new File(typeSystemLocation).isAbsolute()) { - try (OutputStream typeOS = CompressionUtils.getOutputStream(new File(typeSystemLocation))) { + if (!typeSystemLocation.startsWith(JAR_PREFIX) + && new File(typeSystemLocation).isAbsolute()) { + try (OutputStream typeOS = CompressionUtils + .getOutputStream(new File(typeSystemLocation))) { getLogger().debug("Writing type system to [" + typeSystemLocation + "]"); writeTypeSystem(aJCas, typeOS); } diff --git a/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/SerializedCasReader.java b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/SerializedCasReader.java new file mode 100644 index 0000000000..c56eedfbce --- /dev/null +++ b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/SerializedCasReader.java @@ -0,0 +1,131 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.bincas; + +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.uima.cas.impl.Serialization.deserializeCASComplete; + +import java.io.File; +import java.io.IOException; +import java.io.ObjectInputStream; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.impl.CASCompleteSerializer; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.CASMgrSerializer; +import org.apache.uima.cas.impl.CASSerializer; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.resources.CompressionUtils; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * @deprecated use {@code BinaryCasReader} instead. + */ +@ResourceMetaData(name = "UIMA Serialized CAS Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Deprecated +public class SerializedCasReader + extends ResourceCollectionReaderBase +{ + /** + * The file from which to obtain the type system if it is not embedded in the serialized CAS. + */ + public static final String PARAM_TYPE_SYSTEM_LOCATION = "typeSystemLocation"; + @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_LOCATION, mandatory = false) + private String typeSystemLocation; + + private CASMgrSerializer casMgrSerializer; + + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + Resource res = nextFile(); + ObjectInputStream is = null; + try { + is = new ObjectInputStream(CompressionUtils.getInputStream(res.getLocation(), + res.getInputStream())); + + Object object = is.readObject(); + if (object instanceof CASCompleteSerializer) { + // Annotations and CAS metadata saved together + getLogger().debug("Reading CAS and type system from [" + res.getLocation() + "]"); + CASCompleteSerializer serializer = (CASCompleteSerializer) object; + deserializeCASComplete(serializer, (CASImpl) aCAS); + } + else if (object instanceof CASSerializer) { + // Annotations and CAS metadata saved separately + CASCompleteSerializer serializer = new CASCompleteSerializer(); + serializer.setCasMgrSerializer(readCasManager()); + serializer.setCasSerializer((CASSerializer) object); + getLogger().debug("Reading CAS from [" + res.getLocation() + "]"); + deserializeCASComplete(serializer, (CASImpl) aCAS); + } + else { + throw new IOException("Unknown serialized object found with type [" + + object.getClass().getName() + "]"); + } + } + catch (ClassNotFoundException e) { + throw new IOException(e); + } + finally { + closeQuietly(is); + } + } + + private CASMgrSerializer readCasManager() throws IOException + { + // If we already read the type system, return it - do not read it again. + if (casMgrSerializer != null) { + return casMgrSerializer; + } + + org.springframework.core.io.Resource r; + // Is absolute? + if (typeSystemLocation.indexOf(':') != -1 || typeSystemLocation.startsWith("/") + || typeSystemLocation.startsWith(File.separator)) { + // If the type system location is absolute, resolve it absolute + r = getResolver().getResource(locationToUrl(typeSystemLocation)); + } + else { + // If the type system is not absolute, resolve it relative to the base location + r = getResolver().getResource(getBase() + typeSystemLocation); + } + getLogger().debug("Reading type system from [" + r.getURI() + "]"); + + ObjectInputStream is = null; + try { + is = new ObjectInputStream(CompressionUtils.getInputStream(typeSystemLocation, + r.getInputStream())); + casMgrSerializer = (CASMgrSerializer) is.readObject(); + } + catch (ClassNotFoundException e) { + throw new IOException(e); + } + finally { + closeQuietly(is); + } + + return casMgrSerializer; + } +} diff --git a/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/SerializedCasWriter.java b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/SerializedCasWriter.java new file mode 100644 index 0000000000..ee0592be1b --- /dev/null +++ b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/SerializedCasWriter.java @@ -0,0 +1,159 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.bincas; + +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.uima.cas.impl.Serialization.serializeCASComplete; +import static org.apache.uima.cas.impl.Serialization.serializeCASMgr; + +import java.io.File; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.OutputStream; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.impl.CASCompleteSerializer; +import org.apache.uima.cas.impl.CASMgrSerializer; +import org.apache.uima.cas.impl.CASSerializer; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CompressionUtils; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * @deprecated use {@code BinaryCasWriter} with format S instead. + */ +@ResourceMetaData(name = "UIMA Serialized CAS Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) +@Deprecated +public class SerializedCasWriter + extends JCasFileWriter_ImplBase +{ + /** + * Location to write the type system to. The type system is saved using Java serialization, it + * is not saved as a XML type system description. We recommend to use the name + * {@code typesystem.ser}. + *
+ * The {@link #PARAM_COMPRESSION} parameter has no effect on the + * type system. Instead, if the type system file should be compressed or not is detected from + * the file name extension (e.g. ".gz"). + *
+ * If this parameter is set, the type system and index repository are no longer serialized into + * the same file as the test of the CAS. The {@link SerializedCasReader} can currently not + * read such files. Use this only if you really know what you are doing. + */ + public static final String PARAM_TYPE_SYSTEM_LOCATION = "typeSystemLocation"; + @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_LOCATION, mandatory = false) + private String typeSystemLocation; + + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".ser") + private String filenameExtension; + + private boolean typeSystemWritten; + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + // To support writing to ZIPs, the type system must be written before the CAS document + // output stream is obtained. + try { + if (typeSystemLocation != null && !typeSystemWritten) { + writeTypeSystem(aJCas); + typeSystemWritten = true; + } + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + + ObjectOutputStream docOS = null; + try { + NamedOutputStream os = getOutputStream(aJCas, filenameExtension); + docOS = new ObjectOutputStream(os); + + if (typeSystemLocation == null) { + getLogger().debug("Writing CAS and type system to [" + os + "]"); + CASCompleteSerializer serializer = serializeCASComplete(aJCas.getCasImpl()); + docOS.writeObject(serializer); + } + else { + getLogger().debug("Writing CAS to [" + os + "]"); + CASSerializer serializer = new CASSerializer(); + serializer.addCAS(aJCas.getCasImpl()); + docOS.writeObject(serializer); + } + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + finally { + closeQuietly(docOS); + } + } + + private void writeTypeSystem(JCas aJCas) + throws IOException + { + // If the type system location is an absolute file system location, write it there, + // otherwise use the default storage which places the file relative to the target location + if (!typeSystemLocation.startsWith(JAR_PREFIX) + && new File(typeSystemLocation).isAbsolute()) { + OutputStream typeOS = null; + try { + typeOS = CompressionUtils.getOutputStream(new File(typeSystemLocation)); + getLogger().debug("Writing type system to [" + typeSystemLocation + "]"); + writeTypeSystem(aJCas, typeOS); + } + finally { + closeQuietly(typeOS); + } + } + else { + NamedOutputStream typeOS = null; + try { + typeOS = getOutputStream(typeSystemLocation, ""); + getLogger().debug("Writing type system to [" + typeOS + "]"); + writeTypeSystem(aJCas, typeOS); + } + finally { + closeQuietly(typeOS); + } + } + } + + private void writeTypeSystem(JCas aJCas, OutputStream aOS) + throws IOException + { + ObjectOutputStream typeOS = new ObjectOutputStream(aOS); + CASMgrSerializer casMgrSerializer = serializeCASMgr(aJCas.getCasImpl()); + typeOS.writeObject(casMgrSerializer); + typeOS.flush(); + } +} diff --git a/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/package-info.java b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/package-info.java new file mode 100644 index 0000000000..9a4e63a9ad --- /dev/null +++ b/dkpro-core-io-bincas-asl/src/main/java/org/dkpro/core/io/bincas/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for fast binary serialization of UIMA CAS. Be advised that binary serialization may have + * drawbacks compared to XMI serialization. It was originally mainly meant for transferring CAS + * objects over the network or exchange them between the Java and C++ implementation. This module + * uses internal UIMA API. + */ +package org.dkpro.core.io.bincas; diff --git a/dkpro-core-io-bincas-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasWriterReaderTest.java b/dkpro-core-io-bincas-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasWriterReaderTest.java deleted file mode 100644 index 278960f9b4..0000000000 --- a/dkpro-core-io-bincas-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/SerializedCasWriterReaderTest.java +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bincas; - -import static org.apache.commons.io.FileUtils.readFileToString; -import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.factory.AnalysisEngineFactory; -import org.apache.uima.fit.factory.CollectionReaderFactory; -import org.apache.uima.util.CasCreationUtils; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class SerializedCasWriterReaderTest -{ - @Rule - public TemporaryFolder testFolder = new TemporaryFolder(); - - @Test - public void testCasWithTypeSystemEmbedded() throws Exception - { - write(true); - read(); - } - - @Test - public void testCasWithTypeSystemSeparate() throws Exception - { - write(false); - read(); - } - - public void write(boolean aIncludeTypeSystem) throws Exception - { - CollectionReader reader = CollectionReaderFactory.createReader( - TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts", - TextReader.PARAM_PATTERNS, "*.txt", - TextReader.PARAM_LANGUAGE, "latin"); - - AnalysisEngine writer = AnalysisEngineFactory.createEngine( - SerializedCasWriter.class, - SerializedCasWriter.PARAM_TARGET_LOCATION, testFolder.getRoot(), - SerializedCasWriter.PARAM_TYPE_SYSTEM_LOCATION, - aIncludeTypeSystem ? null : testFolder.newFile("typesystem.ser")); - - runPipeline(reader, writer); - - assertTrue(new File(testFolder.getRoot(), "example1.txt.ser").exists()); - } - - public void read() throws Exception - { - CollectionReader reader = CollectionReaderFactory.createReader( - SerializedCasReader.class, - SerializedCasReader.PARAM_SOURCE_LOCATION, testFolder.getRoot(), - SerializedCasReader.PARAM_PATTERNS, "*.ser", - SerializedCasReader.PARAM_TYPE_SYSTEM_LOCATION, - new File(testFolder.getRoot(), "typesystem.ser")); - - CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null); - reader.getNext(cas); - - String refText = readFileToString(new File("src/test/resources/texts/example1.txt")); - assertEquals(refText, cas.getDocumentText()); - assertEquals("latin", cas.getDocumentLanguage()); - } - -// @Test -// public void lenientTest() throws Exception -// { -// TypeSystemDescription tsdMeta = TypeSystemDescriptionFactory -// .createTypeSystemDescription("desc.type.metadata"); -// -// -// // Create a CAS initialized with that type system and set the text -// CAS casOut = createCas(tsdMeta, null, null); -// casOut.setDocumentText("This is a test."); -// DocumentMetaData meta = DocumentMetaData.create(casOut); -// meta.setDocumentId("document"); -// -// // Write out -// AnalysisEngine writer = AnalysisEngineFactory.createEngine( -// SerializedCasWriter.class, tsdMeta, -// SerializedCasWriter.PARAM_TARGET_LOCATION, testFolder.getRoot().getPath()); -// writer.process(casOut); -// -// // Create a new type system from scratch -// TypeSystemDescription tsd = new TypeSystemDescription_impl(); -// TypeDescription tokenTypeDesc = tsd.addType("Token", "", CAS.TYPE_NAME_ANNOTATION); -// tokenTypeDesc.addFeature("length", "", CAS.TYPE_NAME_INTEGER); -// tsd = CasCreationUtils.mergeTypeSystems(asList(tsd, tsdMeta)); -// -// // Now read in to CAS with different type system -// CollectionReader reader = CollectionReaderFactory.createReader( -// SerializedCasReader.class, -// SerializedCasReader.PARAM_SOURCE_LOCATION, testFolder.getRoot().getPath(), -// SerializedCasReader.PARAM_PATTERNS, new String [] { -// SerializedCasReader.INCLUDE_PREFIX+"*.ser" -// }); -// -// CAS casIn = CasCreationUtils.createCas(tsd, null, null); -// reader.getNext(casIn); -// -// upgrade(casIn, tsd); -// -// // Try to create an annotation with the extra type -// AnnotationFS fs = casOut.createAnnotation(casIn.getTypeSystem().getType("Token"), 0, 1); -// casOut.addFsToIndexes(fs); -// } -// -// private void upgrade(CAS aCas, TypeSystemDescription aTsd) throws Exception -// { -// // Prepare template for new CAS -// CAS newCas = CasCreationUtils.createCas(aTsd, null, null); -// CASCompleteSerializer serializer = Serialization.serializeCASComplete((CASImpl) newCas); -// -// // Save old type system -// TypeSystem oldTypeSystem = aCas.getTypeSystem(); -// -// // Save old CAS contents -// ByteArrayOutputStream os2 = new ByteArrayOutputStream(); -// Serialization.serializeWithCompression(aCas, os2, oldTypeSystem); -// -// // Prepare CAS with new type system -// Serialization.deserializeCASComplete(serializer, (CASImpl) aCas); -// -// // Restore CAS data to new type system -// Serialization.deserializeCAS(aCas, new ByteArrayInputStream(os2.toByteArray()), oldTypeSystem, null); -// } -// -// private void upgrade(CAS aCas) throws Exception -// { -// // Prepare template for new CAS -// CAS newCas = JCasFactory.createJCas().getCas(); -// CASCompleteSerializer serializer = Serialization.serializeCASComplete((CASImpl) newCas); -// -// // Save old type system -// TypeSystem oldTypeSystem = aCas.getTypeSystem(); -// -// // Save old CAS contents -// ByteArrayOutputStream os2 = new ByteArrayOutputStream(); -// Serialization.serializeWithCompression(aCas, os2, oldTypeSystem); -// -// // Prepare CAS with new type system -// Serialization.deserializeCASComplete(serializer, (CASImpl) aCas); -// -// // Restore CAS data to new type system -// Serialization.deserializeCAS(aCas, new ByteArrayInputStream(os2.toByteArray()), oldTypeSystem, null); -// } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-bincas-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriterReaderTest.java b/dkpro-core-io-bincas-asl/src/test/java/org/dkpro/core/io/bincas/BinaryCasWriterReaderTest.java similarity index 85% rename from dkpro-core-io-bincas-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriterReaderTest.java rename to dkpro-core-io-bincas-asl/src/test/java/org/dkpro/core/io/bincas/BinaryCasWriterReaderTest.java index 1aa018af13..905ae28d2e 100644 --- a/dkpro-core-io-bincas-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bincas/BinaryCasWriterReaderTest.java +++ b/dkpro-core-io-bincas-asl/src/test/java/org/dkpro/core/io/bincas/BinaryCasWriterReaderTest.java @@ -15,15 +15,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.bincas; +package org.dkpro.core.io.bincas; -import static de.tudarmstadt.ukp.dkpro.core.performance.PerformanceTestUtil.initRandomCas; -import static de.tudarmstadt.ukp.dkpro.core.performance.PerformanceTestUtil.measureReadPerformance; -import static de.tudarmstadt.ukp.dkpro.core.performance.PerformanceTestUtil.measureWritePerformance; -import static de.tudarmstadt.ukp.dkpro.core.performance.PerformanceTestUtil.repeat; import static org.apache.commons.io.FileUtils.readFileToString; import static org.apache.commons.io.FilenameUtils.separatorsToUnix; +import static org.apache.uima.cas.SerialFormat.BINARY_TSI; import static org.apache.uima.cas.SerialFormat.COMPRESSED_FILTERED; +import static org.apache.uima.cas.SerialFormat.COMPRESSED_FILTERED_TSI; +import static org.apache.uima.cas.SerialFormat.COMPRESSED_TSI; +import static org.apache.uima.cas.SerialFormat.SERIALIZED_TSI; import static org.apache.uima.cas.impl.Serialization.deserializeCASComplete; import static org.apache.uima.cas.impl.Serialization.serializeCASComplete; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; @@ -33,6 +33,10 @@ import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; import static org.apache.uima.fit.util.FSUtil.getFeature; +import static org.dkpro.core.performance.PerformanceTestUtil.initRandomCas; +import static org.dkpro.core.performance.PerformanceTestUtil.measureReadPerformance; +import static org.dkpro.core.performance.PerformanceTestUtil.measureWritePerformance; +import static org.dkpro.core.performance.PerformanceTestUtil.repeat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -69,16 +73,16 @@ import org.apache.uima.resource.metadata.TypeSystemDescription; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.CasIOUtils; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.resources.CompressionMethod; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Before; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class BinaryCasWriterReaderTest { @@ -99,7 +103,8 @@ public void testSReinitialize() throws Exception { write(testFolder.getPath(), SerialFormat.SERIALIZED.toString(), true); - read(testFolder.getPath(), NONE, true, false); // Type system is reinitialized from the persisted type system + // Type system is reinitialized from the persisted type system + read(testFolder.getPath(), NONE, true, false); read(testFolder.getPath(), NONE, true, true); } @@ -108,7 +113,8 @@ public void testSReinitializeInZIP() throws Exception { write("jar:" + testFolder.toURI().toURL() + "/archive.zip", "S", true); - read("jar:" + testFolder.toURI().toURL() + "/archive.zip", NONE, true, false); // Type system is reinitialized from the persisted type system + // Type system is reinitialized from the persisted type system + read("jar:" + testFolder.toURI().toURL() + "/archive.zip", NONE, true, false); read("jar:" + testFolder.toURI().toURL() + "/archive.zip", NONE, true, true); } @@ -126,7 +132,8 @@ public void testSplusReinitialize() throws Exception { write(testFolder.getPath(), "S+", false); - read(testFolder.getPath(), NONE, false, false); // Type system is reinitialized from the persisted CAS + // Type system is reinitialized from the persisted CAS + read(testFolder.getPath(), NONE, false, false); read(testFolder.getPath(), NONE, false, true); } @@ -164,12 +171,12 @@ public void test6Lenient() @Test public void test6LenientPlainUima() throws Exception { -// TypeSystemDescription tsd = new TypeSystemDescription_impl(); -// TypeDescription td = tsd.addType("DocumentMetaData", "", CAS.TYPE_NAME_DOCUMENT_ANNOTATION); -// td.addFeature("feat", "", CAS.TYPE_NAME_STRING); +// TypeSystemDescription tsd = new TypeSystemDescription_impl(); +// TypeDescription td = tsd.addType("DocumentMetaData", "", CAS.TYPE_NAME_DOCUMENT_ANNOTATION); +// td.addFeature("feat", "", CAS.TYPE_NAME_STRING); // -// CAS source = CasCreationUtils.createCas(tsd, null, null, null); -// CAS target = CasCreationUtils.createCas(tsd, null, null, null); +// CAS source = CasCreationUtils.createCas(tsd, null, null, null); +// CAS target = CasCreationUtils.createCas(tsd, null, null, null); // source.getJCas(); // target.getJCas(); // @@ -217,8 +224,8 @@ public void test_COMPRESSED_FILTERED_TSI_lenient() throws Exception { write(testFolder.getPath(), SerialFormat.COMPRESSED_FILTERED_TSI.toString(), false); - read(testFolder.getPath(), METADATA, false, false); - read(testFolder.getPath(), METADATA, false, true); + read(testFolder.getPath(), METADATA, false, false); + read(testFolder.getPath(), METADATA, false, true); } @Test @@ -244,7 +251,8 @@ public void testSerializedEmbeddedTypeSystem() throws Exception { writeSerialized(testFolder.getPath(), false); - read(testFolder.getPath(), NONE, false, false); // Type system is reinitialized from the persisted CAS + // Type system is reinitialized from the persisted CAS + read(testFolder.getPath(), NONE, false, false); read(testFolder.getPath(), NONE, false, true); } @@ -253,7 +261,8 @@ public void testSerializedSeparateTypeSystem() throws Exception { writeSerialized(testFolder.getPath(), true); - read(testFolder.getPath(), NONE, true, false); // Type system is reinitialized from the persisted CAS + // Type system is reinitialized from the persisted CAS + read(testFolder.getPath(), NONE, true, false); read(testFolder.getPath(), NONE, true, true); } @@ -290,7 +299,8 @@ public void readWriteZipMinimal() assertEquals(out.getDocumentLanguage(), in.getDocumentLanguage()); assertEquals(out.getDocumentText(), in.getDocumentText()); - assertEquals(DocumentMetaData.get(out).getDocumentId(), DocumentMetaData.get(in).getDocumentId()); + assertEquals(DocumentMetaData.get(out).getDocumentId(), + DocumentMetaData.get(in).getDocumentId()); } @Test @@ -496,13 +506,13 @@ public void read(String aLocation, int aMode, boolean aLoadExternal, boolean aMe System.out.println("--- READING ---"); CollectionReader reader; if (false) { - reader = CollectionReaderFactory.createReader( - BinaryCasReader.class, - BinaryCasReader.PARAM_SOURCE_LOCATION, aLocation, - BinaryCasReader.PARAM_PATTERNS, "*.bin", - // Allow loading only if TSD is not specified - BinaryCasReader.PARAM_TYPE_SYSTEM_LOCATION, - aLoadExternal ? new File(aLocation, "typesystem.bin") : null); + reader = CollectionReaderFactory.createReader( + BinaryCasReader.class, + BinaryCasReader.PARAM_SOURCE_LOCATION, aLocation, + BinaryCasReader.PARAM_PATTERNS, "*.bin", + // Allow loading only if TSD is not specified + BinaryCasReader.PARAM_TYPE_SYSTEM_LOCATION, + aLoadExternal ? new File(aLocation, "typesystem.bin") : null); } else { reader = CollectionReaderFactory.createReader( @@ -596,7 +606,8 @@ private static SummaryStatistics measureCasCreation(int aRepeat) for (int n = 0; n < aRepeat; n++) { long begin = System.currentTimeMillis(); // JCas jcas = JCasFactory.createJCas(); - JCas jcas = CasCreationUtils.createCas((TypeSystemDescription) null, null, null).getJCas(); + JCas jcas = CasCreationUtils.createCas((TypeSystemDescription) null, null, null) + .getJCas(); stats.addValue(System.currentTimeMillis() - begin); } @@ -619,13 +630,14 @@ private static void readSerializedCas(JCas aJCas, File aFile) CASCompleteSerializer serializer = (CASCompleteSerializer) is.readObject(); deserializeCASComplete(serializer, aJCas.getCasImpl()); -// // Initialize the JCas sub-system which is the most often used API in DKPro Core components -// try { -// aJCas.getCas().getJCas(); -// } -// catch (CASException e) { -// throw new IOException(e); -// } +// // Initialize the JCas sub-system which is the most often used API in DKPro Core +// // components +// try { +// aJCas.getCas().getJCas(); +// } +// catch (CASException e) { +// throw new IOException(e); +// } } catch (ClassNotFoundException e) { throw new IOException(e); @@ -657,9 +669,9 @@ public void performanceTest() System.out.printf("Data serialized to %s %n", testFolder); // Set up configurations - Map configs = new LinkedHashMap(); + Map configs = new LinkedHashMap<>(); configs.put( - "Format S - no compression", + "Format S - no external compression", createEngineDescription( BinaryCasWriter.class, BinaryCasWriter.PARAM_OVERWRITE, true, @@ -667,7 +679,7 @@ public void performanceTest() BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); configs.put( - "Format S+ - no compression", + "Format S+ - no external compression", createEngineDescription( BinaryCasWriter.class, BinaryCasWriter.PARAM_OVERWRITE, true, @@ -675,7 +687,7 @@ public void performanceTest() BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); configs.put( - "Format 0 - no compression", + "Format 0 - no external compression", createEngineDescription( BinaryCasWriter.class, BinaryCasWriter.PARAM_OVERWRITE, true, @@ -683,7 +695,7 @@ public void performanceTest() BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); configs.put( - "Format 4 - no compression", + "Format 4 - no external compression", createEngineDescription( BinaryCasWriter.class, BinaryCasWriter.PARAM_OVERWRITE, true, @@ -691,7 +703,7 @@ public void performanceTest() BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); configs.put( - "Format 6 - no compression", + "Format 6 - no external compression", createEngineDescription( BinaryCasWriter.class, BinaryCasWriter.PARAM_OVERWRITE, true, @@ -699,13 +711,49 @@ public void performanceTest() BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); configs.put( - "Format 6+ - no compression", + "Format 6+ - no external compression", createEngineDescription( BinaryCasWriter.class, BinaryCasWriter.PARAM_OVERWRITE, true, BinaryCasWriter.PARAM_FORMAT, "6+", BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); + configs.put( + "Format BINARY_TSI - no external compression", + createEngineDescription( + BinaryCasWriter.class, + BinaryCasWriter.PARAM_OVERWRITE, true, + BinaryCasWriter.PARAM_FORMAT, BINARY_TSI, + BinaryCasWriter.PARAM_FILENAME_EXTENSION, ".bin", + BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, + BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); + configs.put( + "Format SERIALIZED_TSI - no external compression", + createEngineDescription( + BinaryCasWriter.class, + BinaryCasWriter.PARAM_OVERWRITE, true, + BinaryCasWriter.PARAM_FORMAT, SERIALIZED_TSI, + BinaryCasWriter.PARAM_FILENAME_EXTENSION, ".bin", + BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, + BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); + configs.put( + "Format COMPRESSED_TSI - no external compression", + createEngineDescription( + BinaryCasWriter.class, + BinaryCasWriter.PARAM_OVERWRITE, true, + BinaryCasWriter.PARAM_FORMAT, COMPRESSED_TSI, + BinaryCasWriter.PARAM_FILENAME_EXTENSION, ".bin", + BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, + BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); + configs.put( + "Format COMPRESSED_FILTERED_TSI - no external compression", + createEngineDescription( + BinaryCasWriter.class, + BinaryCasWriter.PARAM_OVERWRITE, true, + BinaryCasWriter.PARAM_FORMAT, COMPRESSED_FILTERED_TSI, + BinaryCasWriter.PARAM_FILENAME_EXTENSION, ".bin", + BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE, + BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder)); // configs.put( // "Format 6+ - GZip compression", // createEngineDescription( @@ -734,13 +782,15 @@ public void performanceTest() System.out.printf("%s%n", cfg.getKey()); System.out.printf(" Measuring WRITE%n"); - for (File f : FileUtils.listFiles(testFolder, new PrefixFileFilter("dummy.bin"), null)) { + for (File f : FileUtils.listFiles(testFolder, new PrefixFileFilter("dummy.bin"), + null)) { f.delete(); } SummaryStatistics writeStats = measureWritePerformance(cfg.getValue(), testdata); - Collection files = FileUtils.listFiles(testFolder, new PrefixFileFilter("dummy.bin"), null); + Collection files = FileUtils.listFiles(testFolder, + new PrefixFileFilter("dummy.bin"), null); assertEquals(1, files.size()); File f = files.iterator().next(); diff --git a/dkpro-core-io-bincas-asl/src/test/java/org/dkpro/core/io/bincas/SerializedCasWriterReaderTest.java b/dkpro-core-io-bincas-asl/src/test/java/org/dkpro/core/io/bincas/SerializedCasWriterReaderTest.java new file mode 100644 index 0000000000..822189ee5d --- /dev/null +++ b/dkpro-core-io-bincas-asl/src/test/java/org/dkpro/core/io/bincas/SerializedCasWriterReaderTest.java @@ -0,0 +1,184 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.bincas; + +import static org.apache.commons.io.FileUtils.readFileToString; +import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.util.CasCreationUtils; +import org.dkpro.core.io.bincas.SerializedCasReader; +import org.dkpro.core.io.bincas.SerializedCasWriter; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class SerializedCasWriterReaderTest +{ + @Rule + public TemporaryFolder testFolder = new TemporaryFolder(); + + @Test + public void testCasWithTypeSystemEmbedded() throws Exception + { + write(true); + read(); + } + + @Test + public void testCasWithTypeSystemSeparate() throws Exception + { + write(false); + read(); + } + + public void write(boolean aIncludeTypeSystem) throws Exception + { + CollectionReader reader = CollectionReaderFactory.createReader( + TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts", + TextReader.PARAM_PATTERNS, "*.txt", + TextReader.PARAM_LANGUAGE, "latin"); + + AnalysisEngine writer = AnalysisEngineFactory.createEngine( + SerializedCasWriter.class, + SerializedCasWriter.PARAM_TARGET_LOCATION, testFolder.getRoot(), + SerializedCasWriter.PARAM_TYPE_SYSTEM_LOCATION, + aIncludeTypeSystem ? null : testFolder.newFile("typesystem.ser")); + + runPipeline(reader, writer); + + assertTrue(new File(testFolder.getRoot(), "example1.txt.ser").exists()); + } + + public void read() throws Exception + { + CollectionReader reader = CollectionReaderFactory.createReader( + SerializedCasReader.class, + SerializedCasReader.PARAM_SOURCE_LOCATION, testFolder.getRoot(), + SerializedCasReader.PARAM_PATTERNS, "*.ser", + SerializedCasReader.PARAM_TYPE_SYSTEM_LOCATION, + new File(testFolder.getRoot(), "typesystem.ser")); + + CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null); + reader.getNext(cas); + + String refText = readFileToString(new File("src/test/resources/texts/example1.txt")); + assertEquals(refText, cas.getDocumentText()); + assertEquals("latin", cas.getDocumentLanguage()); + } + +// @Test +// public void lenientTest() throws Exception +// { +// TypeSystemDescription tsdMeta = TypeSystemDescriptionFactory +// .createTypeSystemDescription("desc.type.metadata"); +// +// +// // Create a CAS initialized with that type system and set the text +// CAS casOut = createCas(tsdMeta, null, null); +// casOut.setDocumentText("This is a test."); +// DocumentMetaData meta = DocumentMetaData.create(casOut); +// meta.setDocumentId("document"); +// +// // Write out +// AnalysisEngine writer = AnalysisEngineFactory.createEngine( +// SerializedCasWriter.class, tsdMeta, +// SerializedCasWriter.PARAM_TARGET_LOCATION, testFolder.getRoot().getPath()); +// writer.process(casOut); +// +// // Create a new type system from scratch +// TypeSystemDescription tsd = new TypeSystemDescription_impl(); +// TypeDescription tokenTypeDesc = tsd.addType("Token", "", CAS.TYPE_NAME_ANNOTATION); +// tokenTypeDesc.addFeature("length", "", CAS.TYPE_NAME_INTEGER); +// tsd = CasCreationUtils.mergeTypeSystems(asList(tsd, tsdMeta)); +// +// // Now read in to CAS with different type system +// CollectionReader reader = CollectionReaderFactory.createReader( +// SerializedCasReader.class, +// SerializedCasReader.PARAM_SOURCE_LOCATION, testFolder.getRoot().getPath(), +// SerializedCasReader.PARAM_PATTERNS, new String [] { +// SerializedCasReader.INCLUDE_PREFIX+"*.ser" +// }); +// +// CAS casIn = CasCreationUtils.createCas(tsd, null, null); +// reader.getNext(casIn); +// +// upgrade(casIn, tsd); +// +// // Try to create an annotation with the extra type +// AnnotationFS fs = casOut.createAnnotation(casIn.getTypeSystem().getType("Token"), 0, 1); +// casOut.addFsToIndexes(fs); +// } +// +// private void upgrade(CAS aCas, TypeSystemDescription aTsd) throws Exception +// { +// // Prepare template for new CAS +// CAS newCas = CasCreationUtils.createCas(aTsd, null, null); +// CASCompleteSerializer serializer = Serialization.serializeCASComplete((CASImpl) newCas); +// +// // Save old type system +// TypeSystem oldTypeSystem = aCas.getTypeSystem(); +// +// // Save old CAS contents +// ByteArrayOutputStream os2 = new ByteArrayOutputStream(); +// Serialization.serializeWithCompression(aCas, os2, oldTypeSystem); +// +// // Prepare CAS with new type system +// Serialization.deserializeCASComplete(serializer, (CASImpl) aCas); +// +// // Restore CAS data to new type system +// Serialization.deserializeCAS(aCas, new ByteArrayInputStream(os2.toByteArray()), +// oldTypeSystem, null); +// } +// +// private void upgrade(CAS aCas) throws Exception +// { +// // Prepare template for new CAS +// CAS newCas = JCasFactory.createJCas().getCas(); +// CASCompleteSerializer serializer = Serialization.serializeCASComplete((CASImpl) newCas); +// +// // Save old type system +// TypeSystem oldTypeSystem = aCas.getTypeSystem(); +// +// // Save old CAS contents +// ByteArrayOutputStream os2 = new ByteArrayOutputStream(); +// Serialization.serializeWithCompression(aCas, os2, oldTypeSystem); +// +// // Prepare CAS with new type system +// Serialization.deserializeCASComplete(serializer, (CASImpl) aCas); +// +// // Restore CAS data to new type system +// Serialization.deserializeCAS(aCas, new ByteArrayInputStream(os2.toByteArray()), +// oldTypeSystem, null); +// } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-bincas-asl/src/test/resources/log4j.properties b/dkpro-core-io-bincas-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9f0bdd6149..0000000000 --- a/dkpro-core-io-bincas-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,12 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO - -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasWriter = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase = WARN \ No newline at end of file diff --git a/dkpro-core-io-bincas-asl/src/test/resources/log4j2.xml b/dkpro-core-io-bincas-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-bincas-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-bliki-asl/pom.xml b/dkpro-core-io-bliki-asl/pom.xml index 1e7234ea3e..c21290158f 100644 --- a/dkpro-core-io-bliki-asl/pom.xml +++ b/dkpro-core-io-bliki-asl/pom.xml @@ -18,14 +18,18 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.bliki-asl + dkpro-core-io-bliki-asl jar - DKPro Core ASL - IO - Wikipedia via Bliki Engine + DKPro Core ASL - IO - Wikipedia via Bliki Engine (v ${bliki.version}) + https://dkpro.github.io/dkpro-core/ + + 3.1.0 + org.apache.uima @@ -43,7 +47,23 @@ info.bliki.wiki bliki-core - 3.0.19 + ${bliki.version} + + + javax.xml.bind + jaxb-api + + + com.sun.xml.bind + jaxb-core + + + com.sun.xml.bind + jaxb-impl + + + javax.activation + javax.activation-api + javax.xml.bind:jaxb-api + com.sun.xml.bind:jaxb-core + com.sun.xml.bind:jaxb-impl + javax.activation:javax.activation-api + + + + + \ No newline at end of file diff --git a/dkpro-core-io-bliki-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/package-info.java b/dkpro-core-io-bliki-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/package-info.java deleted file mode 100644 index d5205309ce..0000000000 --- a/dkpro-core-io-bliki-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Reading Wikipedia pages (or other pages retrievable via the mediawiki-API) directly from the Web. - * Should only used for low volume queries. - * For big queries use io.jwpl and a offline database. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bliki; diff --git a/dkpro-core-io-bliki-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/BlikiWikipediaReader.java b/dkpro-core-io-bliki-asl/src/main/java/org/dkpro/core/io/bliki/BlikiWikipediaReader.java similarity index 93% rename from dkpro-core-io-bliki-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/BlikiWikipediaReader.java rename to dkpro-core-io-bliki-asl/src/main/java/org/dkpro/core/io/bliki/BlikiWikipediaReader.java index 95d4386f97..abe1df77f6 100644 --- a/dkpro-core-io-bliki-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/BlikiWikipediaReader.java +++ b/dkpro-core-io-bliki-asl/src/main/java/org/dkpro/core/io/bliki/BlikiWikipediaReader.java @@ -1,226 +1,226 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bliki; - -import info.bliki.api.Page; -import info.bliki.api.User; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.List; - -import javax.xml.bind.JAXBException; - -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CASRuntimeException; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; -import org.sweble.wikitext.engine.CompiledPage; -import org.sweble.wikitext.engine.Compiler; -import org.sweble.wikitext.engine.PageId; -import org.sweble.wikitext.engine.PageTitle; -import org.sweble.wikitext.engine.utils.SimpleWikiConfiguration; - -import de.fau.cs.osr.ptk.common.AstVisitor; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.wikipedia.api.WikiConstants; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; -import de.tudarmstadt.ukp.wikipedia.api.sweble.PlainTextConverter; - -/** - * Bliki-based Wikipedia reader. - */ -@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) -public class BlikiWikipediaReader - extends JCasCollectionReader_ImplBase -{ - /** - * Wikiapi URL E.g. for the English Wikipedia it should be: http://en.wikipedia.org/w/api.php - */ - public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION; - @ConfigurationParameter(name = PARAM_SOURCE_LOCATION, mandatory = true) - private String wikiapiUrl; - - /** - * The language of the wiki installation. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) - private String language; - - /** - * Whether the reader outputs plain text or wiki markup. - */ - public static final String PARAM_OUTPUT_PLAIN_TEXT = "outputPlainText"; - @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true") - private boolean outputPlainText; - - /** - * Which page titles should be retrieved. - */ - public static final String PARAM_PAGE_TITLES = "pageTitles"; - @ConfigurationParameter(name = PARAM_PAGE_TITLES, mandatory = true) - private String[] pageTitles; - - private List listOfPages; - private int pageOffset = 0; - - private SimpleWikiConfiguration config; - private Compiler compiler; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - User user = new User("", "", wikiapiUrl); - user.login(); - - try { - config = new SimpleWikiConfiguration(WikiConstants.SWEBLE_CONFIG); - } - catch (FileNotFoundException e) { - throw new ResourceInitializationException(e); - } - catch (JAXBException e) { - throw new ResourceInitializationException(e); - } - compiler = new Compiler(config); - - listOfPages = user.queryContent(pageTitles); - } - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - return pageOffset < listOfPages.size(); - } - - @Override - public void getNext(JCas jcas) - throws IOException, CollectionException - { - Page page = listOfPages.get(pageOffset); - - DocumentMetaData dmd = new DocumentMetaData(jcas); - dmd.setDocumentTitle(page.getTitle()); - dmd.setDocumentUri(wikiapiUrl + "?title=" + page.getTitle()); - dmd.setDocumentId(page.getPageid()); - dmd.setDocumentBaseUri(wikiapiUrl); - dmd.setCollectionId(page.getPageid()); - dmd.addToIndexes(); - - jcas.setDocumentLanguage(language); - - if (outputPlainText) { - try { - jcas.setDocumentText(getPlainText(page)); - } - catch (CASRuntimeException e) { - throw new CollectionException(e); - } - catch (WikiApiException e) { - throw new CollectionException(e); - } - } - else { - jcas.setDocumentText(page.getCurrentContent()); - } - - pageOffset++; - } - - @Override - public Progress[] getProgress() - { - return new Progress[] { new ProgressImpl(Long.valueOf(pageOffset).intValue(), Long.valueOf( - listOfPages.size()).intValue(), Progress.ENTITIES) }; - } - - /** - *

- * Returns the Wikipedia article as plain text using the SwebleParser with a - * SimpleWikiConfiguration and the PlainTextConverter.
- * If you have different needs regarding the plain text, you can use getParsedPage(Visitor v) - * and provide your own Sweble-Visitor. Examples are in the - * de.tudarmstadt.ukp.wikipedia.api.sweble package or on http://www.sweble.org - *

- * - *

- * Alternatively, use Page.getText() to return the Wikipedia article with all Wiki markup. You - * can then use the old JWPL MediaWiki parser for creating a plain text version. The JWPL parser - * is now located in a separate project de.tudarmstad.ukp.wikipedia.parser. Please - * refer to the JWPL Google Code project page for further reference. - *

- * - * @return The plain text of a Wikipedia article - */ - private String getPlainText(Page page) - throws WikiApiException - { - return (String) parsePage(page, new PlainTextConverter()); - } - - /** - * Parses the page with the Sweble parser using a SimpleWikiConfiguration and the provided - * visitor. For further information about the visitor concept, look at the examples in the - * de.tudarmstadt.ukp.wikipedia.api.sweble package, or on - * http://www.sweble.org or on the JWPL Google Code project page. - * - * @return the parsed page. The actual return type depends on the provided visitor. You have to - * cast the return type according to the return type of the go() method of your visitor. - */ - private Object parsePage(Page page, AstVisitor v) - throws WikiApiException - { - // Use the provided visitor to parse the page - return v.go(getCompiledPage(page).getPage()); - } - - /** - * a Returns CompiledPage produced by the SWEBLE parser using the SimpleWikiConfiguration. - * - * @return the parsed page - */ - private CompiledPage getCompiledPage(Page page) - throws WikiApiException - { - CompiledPage cp; - try { - - PageTitle pageTitle = PageTitle.make(config, page.getTitle()); - PageId pageId = new PageId(pageTitle, -1); - - // Compile the retrieved page - cp = compiler.postprocess(pageId, page.getCurrentContent(), null); - } - catch (Exception e) { - throw new WikiApiException(e); - } - return cp; - } -} \ No newline at end of file +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.bliki; + +import java.io.IOException; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CASRuntimeException; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.sweble.wikitext.engine.CompiledPage; +import org.sweble.wikitext.engine.Compiler; +import org.sweble.wikitext.engine.PageId; +import org.sweble.wikitext.engine.PageTitle; +import org.sweble.wikitext.engine.utils.SimpleWikiConfiguration; + +import de.fau.cs.osr.ptk.common.AstVisitor; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.wikipedia.api.WikiConstants; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; +import de.tudarmstadt.ukp.wikipedia.api.sweble.PlainTextConverter; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import info.bliki.api.Page; +import info.bliki.api.User; + +/** + * Bliki-based Wikipedia reader. + */ +@Component(value = OperationType.READER) +@ResourceMetaData(name = "Bliki-based Wikipedia reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) +public class BlikiWikipediaReader + extends JCasCollectionReader_ImplBase +{ + /** + * Wikiapi URL E.g. for the English Wikipedia it should be: http://en.wikipedia.org/w/api.php + */ + public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION; + @ConfigurationParameter(name = PARAM_SOURCE_LOCATION, mandatory = true) + private String wikiapiUrl; + + /** + * The language of the wiki installation. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) + private String language; + + /** + * Whether the reader outputs plain text or wiki markup. + */ + public static final String PARAM_OUTPUT_PLAIN_TEXT = "outputPlainText"; + @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true") + private boolean outputPlainText; + + /** + * Which page titles should be retrieved. + */ + public static final String PARAM_PAGE_TITLES = "pageTitles"; + @ConfigurationParameter(name = PARAM_PAGE_TITLES, mandatory = true) + private String[] pageTitles; + + private List listOfPages; + private int pageOffset = 0; + + private SimpleWikiConfiguration config; + private Compiler compiler; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + User user = new User("", "", wikiapiUrl); + user.login(); + + try { + config = new SimpleWikiConfiguration(WikiConstants.SWEBLE_CONFIG); + } + catch (Exception e) { + throw new ResourceInitializationException(e); + } + compiler = new Compiler(config); + + listOfPages = user.queryContent(pageTitles); + } + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + return pageOffset < listOfPages.size(); + } + + @Override + public void getNext(JCas jcas) + throws IOException, CollectionException + { + Page page = listOfPages.get(pageOffset); + + DocumentMetaData dmd = new DocumentMetaData(jcas); + dmd.setDocumentTitle(page.getTitle()); + dmd.setDocumentUri(wikiapiUrl + "?title=" + page.getTitle()); + dmd.setDocumentId(page.getPageid()); + dmd.setDocumentBaseUri(wikiapiUrl); + dmd.setCollectionId(page.getPageid()); + dmd.addToIndexes(); + + jcas.setDocumentLanguage(language); + + if (outputPlainText) { + try { + jcas.setDocumentText(getPlainText(page)); + } + catch (CASRuntimeException e) { + throw new CollectionException(e); + } + catch (WikiApiException e) { + throw new CollectionException(e); + } + } + else { + jcas.setDocumentText(page.getCurrentContent()); + } + + pageOffset++; + } + + @Override + public Progress[] getProgress() + { + return new Progress[] { new ProgressImpl(Long.valueOf(pageOffset).intValue(), Long.valueOf( + listOfPages.size()).intValue(), Progress.ENTITIES) }; + } + + /** + *

+ * Returns the Wikipedia article as plain text using the SwebleParser with a + * SimpleWikiConfiguration and the PlainTextConverter.
+ * If you have different needs regarding the plain text, you can use getParsedPage(Visitor v) + * and provide your own Sweble-Visitor. Examples are in the + * de.tudarmstadt.ukp.wikipedia.api.sweble package or on http://www.sweble.org + *

+ * + *

+ * Alternatively, use Page.getText() to return the Wikipedia article with all Wiki markup. You + * can then use the old JWPL MediaWiki parser for creating a plain text version. The JWPL parser + * is now located in a separate project de.tudarmstad.ukp.wikipedia.parser. Please + * refer to the JWPL Google Code project page for further reference. + *

+ * + * @return The plain text of a Wikipedia article + */ + private String getPlainText(Page page) + throws WikiApiException + { + return (String) parsePage(page, new PlainTextConverter()); + } + + /** + * Parses the page with the Sweble parser using a SimpleWikiConfiguration and the provided + * visitor. For further information about the visitor concept, look at the examples in the + * de.tudarmstadt.ukp.wikipedia.api.sweble package, or on + * http://www.sweble.org or on the JWPL Google Code project page. + * + * @return the parsed page. The actual return type depends on the provided visitor. You have to + * cast the return type according to the return type of the go() method of your visitor. + */ + private Object parsePage(Page page, AstVisitor v) + throws WikiApiException + { + // Use the provided visitor to parse the page + return v.go(getCompiledPage(page).getPage()); + } + + /** + * a Returns CompiledPage produced by the SWEBLE parser using the SimpleWikiConfiguration. + * + * @return the parsed page + */ + private CompiledPage getCompiledPage(Page page) + throws WikiApiException + { + CompiledPage cp; + try { + + PageTitle pageTitle = PageTitle.make(config, page.getTitle()); + PageId pageId = new PageId(pageTitle, -1); + + // Compile the retrieved page + cp = compiler.postprocess(pageId, page.getCurrentContent(), null); + } + catch (Exception e) { + throw new WikiApiException(e); + } + return cp; + } +} diff --git a/dkpro-core-io-bliki-asl/src/main/java/org/dkpro/core/io/bliki/package-info.java b/dkpro-core-io-bliki-asl/src/main/java/org/dkpro/core/io/bliki/package-info.java new file mode 100644 index 0000000000..e419cb8d6b --- /dev/null +++ b/dkpro-core-io-bliki-asl/src/main/java/org/dkpro/core/io/bliki/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Reading Wikipedia pages (or other pages retrievable via the mediawiki-API) directly from the Web. + * Should only used for low volume queries. + * For big queries use io.jwpl and a offline database. + */ +package org.dkpro.core.io.bliki; diff --git a/dkpro-core-io-bliki-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/BlikiWikipediaReaderTest.java b/dkpro-core-io-bliki-asl/src/test/java/org/dkpro/core/io/bliki/BlikiWikipediaReaderTest.java similarity index 95% rename from dkpro-core-io-bliki-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/BlikiWikipediaReaderTest.java rename to dkpro-core-io-bliki-asl/src/test/java/org/dkpro/core/io/bliki/BlikiWikipediaReaderTest.java index 4dde885b2e..82c64a4917 100644 --- a/dkpro-core-io-bliki-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bliki/BlikiWikipediaReaderTest.java +++ b/dkpro-core-io-bliki-asl/src/test/java/org/dkpro/core/io/bliki/BlikiWikipediaReaderTest.java @@ -1,99 +1,100 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bliki; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Test; - -public class BlikiWikipediaReaderTest -{ - @Ignore("May fail due to Wikipedia API problems.") - @Test - public void wikipediaReaderTestPlainText() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - BlikiWikipediaReader.class, - BlikiWikipediaReader.PARAM_SOURCE_LOCATION, "http://en.wikipedia.org/w/api.php", - BlikiWikipediaReader.PARAM_LANGUAGE, "en", - BlikiWikipediaReader.PARAM_PAGE_TITLES, new String[]{"New York City", "Darmstadt"} - ); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - assertTrue(jcas.getDocumentText().length() > 0); - i++; - } - - assertEquals(2, i); - } - - @Ignore("May fail due to Wikipedia API problems.") - @Test - public void wikipediaReaderTestMarkup() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - BlikiWikipediaReader.class, - BlikiWikipediaReader.PARAM_OUTPUT_PLAIN_TEXT, false, - BlikiWikipediaReader.PARAM_SOURCE_LOCATION, "http://en.wikipedia.org/w/api.php", - BlikiWikipediaReader.PARAM_LANGUAGE, "en", - BlikiWikipediaReader.PARAM_PAGE_TITLES, new String[]{"New York City", "Darmstadt"} - ); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - assertTrue(jcas.getDocumentText().length() > 0); - i++; - } - - assertEquals(2, i); - } - - @Ignore("May fail due to Wikipedia API problems.") - @Test - public void wikipediaReaderUnknownPage() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - BlikiWikipediaReader.class, - BlikiWikipediaReader.PARAM_SOURCE_LOCATION, "http://en.wikipedia.org/w/api.php", - BlikiWikipediaReader.PARAM_LANGUAGE, "en", - BlikiWikipediaReader.PARAM_PAGE_TITLES, new String[]{"humbelgrpf"} - ); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - assertTrue(jcas.getDocumentText().length() == 0); - i++; - } - - assertEquals(1, i); - } -} +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.bliki; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.bliki.BlikiWikipediaReader; +import org.junit.Ignore; +import org.junit.Test; + +public class BlikiWikipediaReaderTest +{ + @Ignore("May fail due to Wikipedia API problems.") + @Test + public void wikipediaReaderTestPlainText() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + BlikiWikipediaReader.class, + BlikiWikipediaReader.PARAM_SOURCE_LOCATION, "http://en.wikipedia.org/w/api.php", + BlikiWikipediaReader.PARAM_LANGUAGE, "en", + BlikiWikipediaReader.PARAM_PAGE_TITLES, new String[]{"New York City", "Darmstadt"} + ); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + assertTrue(jcas.getDocumentText().length() > 0); + i++; + } + + assertEquals(2, i); + } + + @Ignore("May fail due to Wikipedia API problems.") + @Test + public void wikipediaReaderTestMarkup() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + BlikiWikipediaReader.class, + BlikiWikipediaReader.PARAM_OUTPUT_PLAIN_TEXT, false, + BlikiWikipediaReader.PARAM_SOURCE_LOCATION, "http://en.wikipedia.org/w/api.php", + BlikiWikipediaReader.PARAM_LANGUAGE, "en", + BlikiWikipediaReader.PARAM_PAGE_TITLES, new String[]{"New York City", "Darmstadt"} + ); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + assertTrue(jcas.getDocumentText().length() > 0); + i++; + } + + assertEquals(2, i); + } + + @Ignore("May fail due to Wikipedia API problems.") + @Test + public void wikipediaReaderUnknownPage() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + BlikiWikipediaReader.class, + BlikiWikipediaReader.PARAM_SOURCE_LOCATION, "http://en.wikipedia.org/w/api.php", + BlikiWikipediaReader.PARAM_LANGUAGE, "en", + BlikiWikipediaReader.PARAM_PAGE_TITLES, new String[]{"humbelgrpf"} + ); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + assertTrue(jcas.getDocumentText().length() == 0); + i++; + } + + assertEquals(1, i); + } +} diff --git a/dkpro-core-io-bnc-asl/pom.xml b/dkpro-core-io-bnc-asl/pom.xml index dccc2c19c2..11f0027f28 100644 --- a/dkpro-core-io-bnc-asl/pom.xml +++ b/dkpro-core-io-bnc-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.bnc-asl + dkpro-core-io-bnc-asl jar DKPro Core ASL - IO - British National Corpus + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -40,32 +41,36 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.xml-asl + org.dkpro.core + dkpro-core-io-xml-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -73,8 +78,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-io-bnc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/BncReader.java b/dkpro-core-io-bnc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/BncReader.java deleted file mode 100644 index d41a7bed82..0000000000 --- a/dkpro-core-io-bnc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/BncReader.java +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bnc; - -import static org.apache.commons.lang3.StringUtils.isNotBlank; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.tcas.Annotation; -import org.apache.uima.resource.ResourceInitializationException; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.xml.XmlTextReader; - -/** - * Reader for the British National Corpus (XML version). - */ -@ResourceMetaData(name="British National Corpus (BNC) XML Reader") -@MimeTypeCapability(MimeTypes.APPLICATION_XML) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) -public class BncReader - extends XmlTextReader -{ - /** - * (character) contains a significant punctuation mark as identified by the CLAWS tagger. - */ - private static final String TAG_CHARACTER = "c"; - - /** - * (word) represents a grammatical (not necessarily orthographic) word. - */ - private static final String TAG_WORD = "w"; - - /** - * (s-unit) contains a sentence-like division of a text. - */ - private static final String TAG_SUNIT = "s"; - - /** - * contains a single spoken text, i.e. a transcription or collection of transcriptions from a - * single source. - */ - private static final String TAG_STEXT = "stext"; - - /** - * contains a single written text. - */ - private static final String TAG_WTEXT = "wtext"; - - /** - * contains the full title of a work of any kind. - */ - private static final String TAG_TITLE = "title"; - - /** - * the root tag - */ - private static final String TAG_BNC_DOC = "bncDoc"; - - private static final String ATTR_C5 = "c5"; - - private static final String ATTR_HEADWORD = "hw"; - - /** - * Location of the mapping file for part-of-speech tags to UIMA types. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. - */ - public static final String PARAM_POS_TAGSET = ComponentParameters.PARAM_POS_TAG_SET; - @ConfigurationParameter(name = PARAM_POS_TAGSET, mandatory = false) - protected String posTagset; - - private MappingProvider posMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); - posMappingProvider.setDefault("pos.tagset", "c5"); - } - - @Override - protected void initCas(CAS aCas, Resource aResource, String aQualifier) throws RuntimeException - { - super.initCas(aCas, aResource, aQualifier); - try { - posMappingProvider.configure(aCas); - } - catch (AnalysisEngineProcessException e) { - throw new RuntimeException(e); - } - } - - @Override - protected Handler newSaxHandler() - { - return new BncHandler(); - } - - public class BncHandler - extends TextExtractor - { - private String documentId = null; - private boolean captureText = false; - private int sentenceStart = -1; - private int tokenStart = -1; - private String c5Tag = null; - private String headword = null; - private boolean complete = false; - - @Override - public void startElement(String aUri, String aLocalName, String aName, - Attributes aAttributes) - throws SAXException - { - if (TAG_BNC_DOC.equals(aName)) { - documentId = aAttributes.getValue("xml:id"); - } - else if (TAG_TITLE.equals(aName)) { - captureText = true; - } - else if (TAG_STEXT.equals(aName) || TAG_WTEXT.equals(aName)) { - captureText = true; - } - else if (TAG_SUNIT.equals(aName)) { - sentenceStart = getBuffer().length(); - } - else if (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName)) { - tokenStart = getBuffer().length(); - c5Tag = aAttributes.getValue(ATTR_C5); - headword = aAttributes.getValue(ATTR_HEADWORD); - } - } - - @Override - public void endElement(String aUri, String aLocalName, String aName) - throws SAXException - { - if (TAG_TITLE.equals(aName)) { - DocumentMetaData.get(getJCas()).setDocumentTitle(getBuffer().toString().trim()); - DocumentMetaData.get(getJCas()).setDocumentId(documentId); - getBuffer().setLength(0); - captureText = false; - } - else if (TAG_STEXT.equals(aName) || TAG_WTEXT.equals(aName)) { - captureText = false; - complete = true; - } - else if (TAG_SUNIT.equals(aName)) { - new Sentence(getJCas(), sentenceStart, getBuffer().length()).addToIndexes(); - sentenceStart = -1; - } - else if (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName)) { - if (isNotBlank(getBuffer().substring(tokenStart, getBuffer().length()))) { - Token token = new Token(getJCas(), tokenStart, getBuffer().length()); - trim(token); - - if (c5Tag != null) { - Type posTag = posMappingProvider.getTagType(c5Tag); - POS pos = (POS) getJCas().getCas().createAnnotation(posTag, - token.getBegin(), token.getEnd()); - pos.setPosValue(c5Tag.intern()); - POSUtils.assignCoarseValue(pos); - pos.addToIndexes(); - token.setPos(pos); - } - - if (headword != null) { - Lemma lemma = new Lemma(getJCas(), token.getBegin(), token.getEnd()); - lemma.setValue(headword); - lemma.addToIndexes(); - token.setLemma(lemma); - } - - token.addToIndexes(); - } - - tokenStart = -1; - } - } - - @Override - public void characters(char[] aCh, int aStart, int aLength) - throws SAXException - { - if (complete) { - throw new SAXException("Extra content after stext is not permitted."); - } - - if (captureText) { - super.characters(aCh, aStart, aLength); - } - } - - @Override - public void ignorableWhitespace(char[] aCh, int aStart, int aLength) - throws SAXException - { - if (complete) { - throw new SAXException("Extra content after stext is not permitted."); - } - - if (captureText) { - super.ignorableWhitespace(aCh, aStart, aLength); - } - } - - private void trim(Annotation aAnnotation) - { - StringBuilder buffer = getBuffer(); - int s = aAnnotation.getBegin(); - int e = aAnnotation.getEnd(); - while (Character.isWhitespace(buffer.charAt(s))) { - s++; - } - while ((e > s+1) && Character.isWhitespace(buffer.charAt(e-1))) { - e--; - } - aAnnotation.setBegin(s); - aAnnotation.setEnd(e); - } - } -} diff --git a/dkpro-core-io-bnc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/package-info.java b/dkpro-core-io-bnc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/package-info.java deleted file mode 100644 index 26fd22b111..0000000000 --- a/dkpro-core-io-bnc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for British National Corpus XML Edition. - */ -package de.tudarmstadt.ukp.dkpro.core.io.bnc; diff --git a/dkpro-core-io-bnc-asl/src/main/java/org/dkpro/core/io/bnc/BncReader.java b/dkpro-core-io-bnc-asl/src/main/java/org/dkpro/core/io/bnc/BncReader.java new file mode 100644 index 0000000000..7930905e2f --- /dev/null +++ b/dkpro-core-io-bnc-asl/src/main/java/org/dkpro/core/io/bnc/BncReader.java @@ -0,0 +1,281 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.bnc; + +import static org.apache.commons.lang3.StringUtils.isNotBlank; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.io.xml.XmlTextReader; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reader for the British National Corpus (XML version). + */ +@ResourceMetaData(name = "British National Corpus (BNC) XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability(MimeTypes.APPLICATION_X_BNC) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) +public class BncReader + extends XmlTextReader +{ + /** + * (character) contains a significant punctuation mark as identified by the CLAWS tagger. + */ + private static final String TAG_CHARACTER = "c"; + + /** + * (word) represents a grammatical (not necessarily orthographic) word. + */ + private static final String TAG_WORD = "w"; + + /** + * (s-unit) contains a sentence-like division of a text. + */ + private static final String TAG_SUNIT = "s"; + + /** + * contains a single spoken text, i.e. a transcription or collection of transcriptions from a + * single source. + */ + private static final String TAG_STEXT = "stext"; + + /** + * contains a single written text. + */ + private static final String TAG_WTEXT = "wtext"; + + /** + * contains the full title of a work of any kind. + */ + private static final String TAG_TITLE = "title"; + + /** + * the root tag + */ + private static final String TAG_BNC_DOC = "bncDoc"; + + private static final String ATTR_C5 = "c5"; + + private static final String ATTR_HEADWORD = "hw"; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the + * tag set defined as part of the model meta data. + */ + public static final String PARAM_POS_TAGSET = ComponentParameters.PARAM_POS_TAG_SET; + @ConfigurationParameter(name = PARAM_POS_TAGSET, mandatory = false) + protected String posTagset; + + private MappingProvider posMappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + posMappingProvider = MappingProviderFactory.createPosMappingProvider(this, + posMappingLocation, posTagset, getLanguage()); + posMappingProvider.setDefault("pos.tagset", "c5"); + } + + @Override + protected void initCas(CAS aCas, Resource aResource, String aQualifier) throws RuntimeException + { + super.initCas(aCas, aResource, aQualifier); + try { + posMappingProvider.configure(aCas); + } + catch (AnalysisEngineProcessException e) { + throw new RuntimeException(e); + } + } + + @Override + protected Handler newSaxHandler() + { + return new BncHandler(); + } + + public class BncHandler + extends TextExtractor + { + private String documentId = null; + private boolean captureText = false; + private int sentenceStart = -1; + private int tokenStart = -1; + private String c5Tag = null; + private String headword = null; + private boolean complete = false; + + @Override + public void startElement(String aUri, String aLocalName, String aName, + Attributes aAttributes) + throws SAXException + { + if (TAG_BNC_DOC.equals(aName)) { + documentId = aAttributes.getValue("xml:id"); + } + else if (TAG_TITLE.equals(aName)) { + captureText = true; + } + else if (TAG_STEXT.equals(aName) || TAG_WTEXT.equals(aName)) { + captureText = true; + } + else if (TAG_SUNIT.equals(aName)) { + sentenceStart = getBuffer().length(); + } + else if (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName)) { + tokenStart = getBuffer().length(); + c5Tag = aAttributes.getValue(ATTR_C5); + headword = aAttributes.getValue(ATTR_HEADWORD); + } + } + + @Override + public void endElement(String aUri, String aLocalName, String aName) + throws SAXException + { + if (TAG_TITLE.equals(aName)) { + DocumentMetaData.get(getJCas()).setDocumentTitle(getBuffer().toString().trim()); + DocumentMetaData.get(getJCas()).setDocumentId(documentId); + getBuffer().setLength(0); + captureText = false; + } + else if (TAG_STEXT.equals(aName) || TAG_WTEXT.equals(aName)) { + captureText = false; + complete = true; + } + else if (TAG_SUNIT.equals(aName)) { + new Sentence(getJCas(), sentenceStart, getBuffer().length()).addToIndexes(); + sentenceStart = -1; + } + else if (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName)) { + if (isNotBlank(getBuffer().substring(tokenStart, getBuffer().length()))) { + Token token = new Token(getJCas(), tokenStart, getBuffer().length()); + trim(token); + + if (c5Tag != null) { + Type posTag = posMappingProvider.getTagType(c5Tag); + POS pos = (POS) getJCas().getCas().createAnnotation(posTag, + token.getBegin(), token.getEnd()); + pos.setPosValue(c5Tag != null ? c5Tag.intern() : null); + POSUtils.assignCoarseValue(pos); + pos.addToIndexes(); + token.setPos(pos); + } + + if (headword != null) { + Lemma lemma = new Lemma(getJCas(), token.getBegin(), token.getEnd()); + lemma.setValue(headword); + lemma.addToIndexes(); + token.setLemma(lemma); + } + + token.addToIndexes(); + } + + tokenStart = -1; + } + } + + @Override + public void characters(char[] aCh, int aStart, int aLength) + throws SAXException + { + if (complete) { + throw new SAXException("Extra content after stext is not permitted."); + } + + if (captureText) { + super.characters(aCh, aStart, aLength); + } + } + + @Override + public void ignorableWhitespace(char[] aCh, int aStart, int aLength) + throws SAXException + { + if (complete) { + throw new SAXException("Extra content after stext is not permitted."); + } + + if (captureText) { + super.ignorableWhitespace(aCh, aStart, aLength); + } + } + + private void trim(Annotation aAnnotation) + { + StringBuilder buffer = getBuffer(); + int s = aAnnotation.getBegin(); + int e = aAnnotation.getEnd(); + while (Character.isWhitespace(buffer.charAt(s))) { + s++; + } + while ((e > s + 1) && Character.isWhitespace(buffer.charAt(e - 1))) { + e--; + } + aAnnotation.setBegin(s); + aAnnotation.setEnd(e); + } + } +} diff --git a/dkpro-core-io-bnc-asl/src/main/java/org/dkpro/core/io/bnc/package-info.java b/dkpro-core-io-bnc-asl/src/main/java/org/dkpro/core/io/bnc/package-info.java new file mode 100644 index 0000000000..587e6ec125 --- /dev/null +++ b/dkpro-core-io-bnc-asl/src/main/java/org/dkpro/core/io/bnc/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for British National Corpus XML Edition. + */ +package org.dkpro.core.io.bnc; diff --git a/dkpro-core-io-bnc-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/BncReaderTest.java b/dkpro-core-io-bnc-asl/src/test/java/org/dkpro/core/io/bnc/BncReaderTest.java similarity index 81% rename from dkpro-core-io-bnc-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/BncReaderTest.java rename to dkpro-core-io-bnc-asl/src/test/java/org/dkpro/core/io/bnc/BncReaderTest.java index 008daed69e..fdd43c24c0 100644 --- a/dkpro-core-io-bnc-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/bnc/BncReaderTest.java +++ b/dkpro-core-io-bnc-asl/src/test/java/org/dkpro/core/io/bnc/BncReaderTest.java @@ -15,28 +15,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.bnc; +package org.dkpro.core.io.bnc; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.*; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.io.bnc.BncReader; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class BncReaderTest { - @Test - public void test() throws Exception - { + @Test + public void test() throws Exception + { testOneWay( createReaderDescription(BncReader.class, BncReader.PARAM_LANGUAGE, "en"), "FX8.xml.dump", "FX8.xml"); - } - + } + @Rule public DkproTestContext testContext = new DkproTestContext(); } diff --git a/dkpro-core-io-bnc-asl/src/test/resources/FX8.xml.dump b/dkpro-core-io-bnc-asl/src/test/resources/FX8.xml.dump index 957e769113..593ee81bce 100644 --- a/dkpro-core-io-bnc-asl/src/test/resources/FX8.xml.dump +++ b/dkpro-core-io-bnc-asl/src/test/resources/FX8.xml.dump @@ -7,7 +7,7 @@ DocumentMetaData begin: 0 end: 635 language: "en" - documentTitle: "General practitioner's surgery: medical consultation. Sample containing about 125 words speech recorded in public context" + documentTitle: "General practitioner's surgery: medical consultation. Sample containing about 12..." documentId: "FX8" isLastSegment: false @@ -62,6 +62,7 @@ Token end: 3 PosValue: "ITJ" coarseValue: "INTJ" + order: 0 [there] POS_ADV sofa: _InitialView @@ -91,6 +92,7 @@ Token end: 9 PosValue: "AV0" coarseValue: "ADV" + order: 0 [we] POS_PRON sofa: _InitialView @@ -120,6 +122,7 @@ Token end: 12 PosValue: "PNP" coarseValue: "PRON" + order: 0 [are] POS_VERB sofa: _InitialView @@ -149,6 +152,7 @@ Token end: 16 PosValue: "VBB" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -167,6 +171,7 @@ Token end: 17 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -185,6 +190,7 @@ Token end: 18 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [Right abdominal wound, she's a wee bit confused.] Sentence sofa: _InitialView @@ -219,6 +225,7 @@ Token end: 24 PosValue: "AV0" coarseValue: "ADV" + order: 0 [abdominal] POS_ADJ sofa: _InitialView @@ -248,6 +255,7 @@ Token end: 35 PosValue: "AJ0" coarseValue: "ADJ" + order: 0 [wound] POS_NOUN sofa: _InitialView @@ -277,6 +285,7 @@ Token end: 41 PosValue: "NN1" coarseValue: "NOUN" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -295,6 +304,7 @@ Token end: 42 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [she] POS_PRON sofa: _InitialView @@ -324,6 +334,7 @@ Token end: 46 PosValue: "PNP" coarseValue: "PRON" + order: 0 ['s] POS_VERB sofa: _InitialView @@ -353,6 +364,7 @@ Token end: 48 PosValue: "VBZ" coarseValue: "VERB" + order: 0 [a] POS_DET sofa: _InitialView @@ -382,6 +394,7 @@ Token end: 50 PosValue: "AT0" coarseValue: "DET" + order: 0 [wee] POS_ADJ sofa: _InitialView @@ -411,6 +424,7 @@ Token end: 54 PosValue: "AJ0-NN1" coarseValue: "ADJ" + order: 0 [bit] POS_NOUN sofa: _InitialView @@ -440,6 +454,7 @@ Token end: 58 PosValue: "NN1" coarseValue: "NOUN" + order: 0 [confused] POS_VERB sofa: _InitialView @@ -469,6 +484,7 @@ Token end: 68 PosValue: "VVN-AJ0" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -487,6 +503,7 @@ Token end: 69 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [She didn't bother to tell me that she'd only got to call you, right?] Sentence sofa: _InitialView @@ -521,6 +538,7 @@ Token end: 73 PosValue: "PNP" coarseValue: "PRON" + order: 0 [did] POS_VERB sofa: _InitialView @@ -550,6 +568,7 @@ Token end: 77 PosValue: "VDD" coarseValue: "VERB" + order: 0 [n't] POS_PART sofa: _InitialView @@ -579,6 +598,7 @@ Token end: 80 PosValue: "XX0" coarseValue: "PART" + order: 0 [bother] POS_VERB sofa: _InitialView @@ -608,13 +628,13 @@ Token end: 87 PosValue: "VVI" coarseValue: "VERB" + order: 0 [to] -POS_X +POS sofa: _InitialView begin: 88 end: 90 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -631,12 +651,12 @@ Token begin: 88 end: 90 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 88 end: 90 PosValue: "TO0" - coarseValue: "X" + order: 0 [tell] POS_VERB sofa: _InitialView @@ -666,6 +686,7 @@ Token end: 95 PosValue: "VVI" coarseValue: "VERB" + order: 0 [me] POS_PRON sofa: _InitialView @@ -695,6 +716,7 @@ Token end: 98 PosValue: "PNP" coarseValue: "PRON" + order: 0 [that] POS_SCONJ sofa: _InitialView @@ -724,6 +746,7 @@ Token end: 103 PosValue: "CJT" coarseValue: "SCONJ" + order: 0 [she] POS_PRON sofa: _InitialView @@ -753,6 +776,7 @@ Token end: 107 PosValue: "PNP" coarseValue: "PRON" + order: 0 ['d] POS_VERB sofa: _InitialView @@ -782,6 +806,7 @@ Token end: 109 PosValue: "VHD" coarseValue: "VERB" + order: 0 [only] POS_ADV sofa: _InitialView @@ -811,6 +836,7 @@ Token end: 114 PosValue: "AV0" coarseValue: "ADV" + order: 0 [got] POS_VERB sofa: _InitialView @@ -840,13 +866,13 @@ Token end: 118 PosValue: "VVN" coarseValue: "VERB" + order: 0 [to] -POS_X +POS sofa: _InitialView begin: 120 end: 122 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -863,12 +889,12 @@ Token begin: 120 end: 122 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 120 end: 122 PosValue: "TO0" - coarseValue: "X" + order: 0 [call] POS_VERB sofa: _InitialView @@ -898,6 +924,7 @@ Token end: 127 PosValue: "VVI" coarseValue: "VERB" + order: 0 [you] POS_PRON sofa: _InitialView @@ -927,6 +954,7 @@ Token end: 131 PosValue: "PNP" coarseValue: "PRON" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -945,6 +973,7 @@ Token end: 132 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [right] POS_ADV sofa: _InitialView @@ -974,6 +1003,7 @@ Token end: 138 PosValue: "AV0" coarseValue: "ADV" + order: 0 [?] POS_PUNCT sofa: _InitialView @@ -992,6 +1022,7 @@ Token end: 139 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [Erm she wasn't in her nightdress but she only dressed herself, she said ] Sentence sofa: _InitialView @@ -1026,6 +1057,7 @@ Token end: 143 PosValue: "UNC" coarseValue: "X" + order: 0 [she] POS_PRON sofa: _InitialView @@ -1055,6 +1087,7 @@ Token end: 147 PosValue: "PNP" coarseValue: "PRON" + order: 0 [was] POS_VERB sofa: _InitialView @@ -1084,6 +1117,7 @@ Token end: 151 PosValue: "VBD" coarseValue: "VERB" + order: 0 [n't] POS_PART sofa: _InitialView @@ -1113,6 +1147,7 @@ Token end: 154 PosValue: "XX0" coarseValue: "PART" + order: 0 [in] POS_ADP sofa: _InitialView @@ -1142,6 +1177,7 @@ Token end: 157 PosValue: "PRP" coarseValue: "ADP" + order: 0 [her] POS_DET sofa: _InitialView @@ -1171,6 +1207,7 @@ Token end: 161 PosValue: "DPS" coarseValue: "DET" + order: 0 [nightdress] POS_NOUN sofa: _InitialView @@ -1200,6 +1237,7 @@ Token end: 172 PosValue: "NN1" coarseValue: "NOUN" + order: 0 [but] POS_CONJ sofa: _InitialView @@ -1229,6 +1267,7 @@ Token end: 176 PosValue: "CJC" coarseValue: "CONJ" + order: 0 [she] POS_PRON sofa: _InitialView @@ -1258,6 +1297,7 @@ Token end: 180 PosValue: "PNP" coarseValue: "PRON" + order: 0 [only] POS_ADV sofa: _InitialView @@ -1287,6 +1327,7 @@ Token end: 185 PosValue: "AV0" coarseValue: "ADV" + order: 0 [dressed] POS_VERB sofa: _InitialView @@ -1316,13 +1357,13 @@ Token end: 193 PosValue: "VVD" coarseValue: "VERB" + order: 0 [herself] -POS_X +POS sofa: _InitialView begin: 194 end: 201 PosValue: "PNX" - coarseValue: "X" [herself] Lemma sofa: _InitialView @@ -1339,12 +1380,12 @@ Token begin: 194 end: 201 value: "herself" - pos: POS_X + pos: POS sofa: _InitialView begin: 194 end: 201 PosValue: "PNX" - coarseValue: "X" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -1363,6 +1404,7 @@ Token end: 202 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [she] POS_PRON sofa: _InitialView @@ -1392,6 +1434,7 @@ Token end: 206 PosValue: "PNP" coarseValue: "PRON" + order: 0 [said] POS_VERB sofa: _InitialView @@ -1421,6 +1464,7 @@ Token end: 211 PosValue: "VVD" coarseValue: "VERB" + order: 0 [And you ] Sentence sofa: _InitialView @@ -1455,6 +1499,7 @@ Token end: 217 PosValue: "CJC" coarseValue: "CONJ" + order: 0 [you] POS_PRON sofa: _InitialView @@ -1484,6 +1529,7 @@ Token end: 221 PosValue: "PNP" coarseValue: "PRON" + order: 0 [She said she went to buy something herself, she phoned the clinic and the clinic .] Sentence sofa: _InitialView @@ -1518,6 +1564,7 @@ Token end: 227 PosValue: "PNP" coarseValue: "PRON" + order: 0 [said] POS_VERB sofa: _InitialView @@ -1547,6 +1594,7 @@ Token end: 232 PosValue: "VVD" coarseValue: "VERB" + order: 0 [she] POS_PRON sofa: _InitialView @@ -1576,6 +1624,7 @@ Token end: 236 PosValue: "PNP" coarseValue: "PRON" + order: 0 [went] POS_VERB sofa: _InitialView @@ -1605,13 +1654,13 @@ Token end: 241 PosValue: "VVD" coarseValue: "VERB" + order: 0 [to] -POS_X +POS sofa: _InitialView begin: 242 end: 244 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -1628,12 +1677,12 @@ Token begin: 242 end: 244 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 242 end: 244 PosValue: "TO0" - coarseValue: "X" + order: 0 [buy] POS_VERB sofa: _InitialView @@ -1663,6 +1712,7 @@ Token end: 248 PosValue: "VVI" coarseValue: "VERB" + order: 0 [something] POS_PRON sofa: _InitialView @@ -1692,13 +1742,13 @@ Token end: 258 PosValue: "PNI" coarseValue: "PRON" + order: 0 [herself] -POS_X +POS sofa: _InitialView begin: 260 end: 267 PosValue: "PNX" - coarseValue: "X" [herself] Lemma sofa: _InitialView @@ -1715,12 +1765,12 @@ Token begin: 260 end: 267 value: "herself" - pos: POS_X + pos: POS sofa: _InitialView begin: 260 end: 267 PosValue: "PNX" - coarseValue: "X" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -1739,6 +1789,7 @@ Token end: 268 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [she] POS_PRON sofa: _InitialView @@ -1768,6 +1819,7 @@ Token end: 272 PosValue: "PNP" coarseValue: "PRON" + order: 0 [phoned] POS_VERB sofa: _InitialView @@ -1797,6 +1849,7 @@ Token end: 279 PosValue: "VVD" coarseValue: "VERB" + order: 0 [the] POS_DET sofa: _InitialView @@ -1826,6 +1879,7 @@ Token end: 283 PosValue: "AT0" coarseValue: "DET" + order: 0 [clinic] POS_NOUN sofa: _InitialView @@ -1855,6 +1909,7 @@ Token end: 290 PosValue: "NN1" coarseValue: "NOUN" + order: 0 [and] POS_CONJ sofa: _InitialView @@ -1884,6 +1939,7 @@ Token end: 294 PosValue: "CJC" coarseValue: "CONJ" + order: 0 [the] POS_DET sofa: _InitialView @@ -1913,6 +1969,7 @@ Token end: 298 PosValue: "AT0" coarseValue: "DET" + order: 0 [clinic] POS_NOUN sofa: _InitialView @@ -1942,6 +1999,7 @@ Token end: 305 PosValue: "NN1" coarseValue: "NOUN" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -1960,6 +2018,7 @@ Token end: 307 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [She's here and says she should be fortnightly .] Sentence sofa: _InitialView @@ -1994,6 +2053,7 @@ Token end: 311 PosValue: "PNP" coarseValue: "PRON" + order: 0 ['s] POS_VERB sofa: _InitialView @@ -2023,6 +2083,7 @@ Token end: 313 PosValue: "VBZ" coarseValue: "VERB" + order: 0 [here] POS_ADV sofa: _InitialView @@ -2052,6 +2113,7 @@ Token end: 319 PosValue: "AV0" coarseValue: "ADV" + order: 0 [and] POS_CONJ sofa: _InitialView @@ -2081,6 +2143,7 @@ Token end: 323 PosValue: "CJC" coarseValue: "CONJ" + order: 0 [says] POS_VERB sofa: _InitialView @@ -2110,6 +2173,7 @@ Token end: 329 PosValue: "VVZ" coarseValue: "VERB" + order: 0 [she] POS_PRON sofa: _InitialView @@ -2139,6 +2203,7 @@ Token end: 333 PosValue: "PNP" coarseValue: "PRON" + order: 0 [should] POS_AUX sofa: _InitialView @@ -2168,6 +2233,7 @@ Token end: 340 PosValue: "VM0" coarseValue: "AUX" + order: 0 [be] POS_VERB sofa: _InitialView @@ -2197,6 +2263,7 @@ Token end: 343 PosValue: "VBI" coarseValue: "VERB" + order: 0 [fortnightly] POS_ADV sofa: _InitialView @@ -2226,6 +2293,7 @@ Token end: 356 PosValue: "AV0" coarseValue: "ADV" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -2244,6 +2312,7 @@ Token end: 358 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [So I don't know whether you want to go and see her rather than, I could get a doctor to go and see her and phone,.] Sentence sofa: _InitialView @@ -2278,6 +2347,7 @@ Token end: 361 PosValue: "AV0" coarseValue: "ADV" + order: 0 [I] POS_PRON sofa: _InitialView @@ -2307,6 +2377,7 @@ Token end: 363 PosValue: "PNP" coarseValue: "PRON" + order: 0 [do] POS_VERB sofa: _InitialView @@ -2336,6 +2407,7 @@ Token end: 366 PosValue: "VDB" coarseValue: "VERB" + order: 0 [n't] POS_PART sofa: _InitialView @@ -2365,6 +2437,7 @@ Token end: 369 PosValue: "XX0" coarseValue: "PART" + order: 0 [know] POS_VERB sofa: _InitialView @@ -2394,6 +2467,7 @@ Token end: 374 PosValue: "VVI" coarseValue: "VERB" + order: 0 [whether] POS_SCONJ sofa: _InitialView @@ -2423,6 +2497,7 @@ Token end: 382 PosValue: "CJS" coarseValue: "SCONJ" + order: 0 [you] POS_PRON sofa: _InitialView @@ -2452,6 +2527,7 @@ Token end: 386 PosValue: "PNP" coarseValue: "PRON" + order: 0 [want] POS_VERB sofa: _InitialView @@ -2481,13 +2557,13 @@ Token end: 391 PosValue: "VVB" coarseValue: "VERB" + order: 0 [to] -POS_X +POS sofa: _InitialView begin: 392 end: 394 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -2504,12 +2580,12 @@ Token begin: 392 end: 394 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 392 end: 394 PosValue: "TO0" - coarseValue: "X" + order: 0 [go] POS_VERB sofa: _InitialView @@ -2539,6 +2615,7 @@ Token end: 397 PosValue: "VVI" coarseValue: "VERB" + order: 0 [and] POS_CONJ sofa: _InitialView @@ -2568,6 +2645,7 @@ Token end: 401 PosValue: "CJC" coarseValue: "CONJ" + order: 0 [see] POS_VERB sofa: _InitialView @@ -2597,6 +2675,7 @@ Token end: 405 PosValue: "VVI" coarseValue: "VERB" + order: 0 [her] POS_PRON sofa: _InitialView @@ -2626,6 +2705,7 @@ Token end: 409 PosValue: "PNP" coarseValue: "PRON" + order: 0 [rather] POS_ADV sofa: _InitialView @@ -2655,6 +2735,7 @@ Token end: 416 PosValue: "AV0" coarseValue: "ADV" + order: 0 [than] POS_SCONJ sofa: _InitialView @@ -2684,6 +2765,7 @@ Token end: 421 PosValue: "CJS" coarseValue: "SCONJ" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -2702,6 +2784,7 @@ Token end: 422 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [I] POS_PRON sofa: _InitialView @@ -2731,6 +2814,7 @@ Token end: 424 PosValue: "PNP" coarseValue: "PRON" + order: 0 [could] POS_AUX sofa: _InitialView @@ -2760,6 +2844,7 @@ Token end: 430 PosValue: "VM0" coarseValue: "AUX" + order: 0 [get] POS_VERB sofa: _InitialView @@ -2789,6 +2874,7 @@ Token end: 434 PosValue: "VVI" coarseValue: "VERB" + order: 0 [a] POS_DET sofa: _InitialView @@ -2818,6 +2904,7 @@ Token end: 436 PosValue: "AT0" coarseValue: "DET" + order: 0 [doctor] POS_NOUN sofa: _InitialView @@ -2847,13 +2934,13 @@ Token end: 443 PosValue: "NN1" coarseValue: "NOUN" + order: 0 [to] -POS_X +POS sofa: _InitialView begin: 444 end: 446 PosValue: "TO0" - coarseValue: "X" [to] Lemma sofa: _InitialView @@ -2870,12 +2957,12 @@ Token begin: 444 end: 446 value: "to" - pos: POS_X + pos: POS sofa: _InitialView begin: 444 end: 446 PosValue: "TO0" - coarseValue: "X" + order: 0 [go] POS_VERB sofa: _InitialView @@ -2905,6 +2992,7 @@ Token end: 449 PosValue: "VVI" coarseValue: "VERB" + order: 0 [and] POS_CONJ sofa: _InitialView @@ -2934,6 +3022,7 @@ Token end: 453 PosValue: "CJC" coarseValue: "CONJ" + order: 0 [see] POS_VERB sofa: _InitialView @@ -2963,6 +3052,7 @@ Token end: 457 PosValue: "VVI" coarseValue: "VERB" + order: 0 [her] POS_PRON sofa: _InitialView @@ -2992,6 +3082,7 @@ Token end: 461 PosValue: "PNP" coarseValue: "PRON" + order: 0 [and] POS_CONJ sofa: _InitialView @@ -3021,6 +3112,7 @@ Token end: 465 PosValue: "CJC" coarseValue: "CONJ" + order: 0 [phone] POS_NOUN sofa: _InitialView @@ -3050,6 +3142,7 @@ Token end: 471 PosValue: "NN1-VVB" coarseValue: "NOUN" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -3068,6 +3161,7 @@ Token end: 472 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -3086,6 +3180,7 @@ Token end: 473 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [it's just that I'm never gonna get to up to.] Sentence sofa: _InitialView @@ -3120,6 +3215,7 @@ Token end: 476 PosValue: "PNP" coarseValue: "PRON" + order: 0 ['s] POS_VERB sofa: _InitialView @@ -3149,6 +3245,7 @@ Token end: 478 PosValue: "VBZ" coarseValue: "VERB" + order: 0 [just] POS_ADV sofa: _InitialView @@ -3178,6 +3275,7 @@ Token end: 483 PosValue: "AV0" coarseValue: "ADV" + order: 0 [that] POS_SCONJ sofa: _InitialView @@ -3207,6 +3305,7 @@ Token end: 488 PosValue: "CJT" coarseValue: "SCONJ" + order: 0 [I] POS_PRON sofa: _InitialView @@ -3236,6 +3335,7 @@ Token end: 490 PosValue: "PNP" coarseValue: "PRON" + order: 0 ['m] POS_VERB sofa: _InitialView @@ -3265,6 +3365,7 @@ Token end: 492 PosValue: "VBB" coarseValue: "VERB" + order: 0 [never] POS_ADV sofa: _InitialView @@ -3294,6 +3395,7 @@ Token end: 498 PosValue: "AV0" coarseValue: "ADV" + order: 0 [gon] POS_VERB sofa: _InitialView @@ -3323,13 +3425,13 @@ Token end: 502 PosValue: "VVG" coarseValue: "VERB" + order: 0 [na] -POS_X +POS sofa: _InitialView begin: 502 end: 504 PosValue: "TO0" - coarseValue: "X" [na] Lemma sofa: _InitialView @@ -3346,12 +3448,12 @@ Token begin: 502 end: 504 value: "na" - pos: POS_X + pos: POS sofa: _InitialView begin: 502 end: 504 PosValue: "TO0" - coarseValue: "X" + order: 0 [get] POS_VERB sofa: _InitialView @@ -3381,6 +3483,7 @@ Token end: 508 PosValue: "VVI" coarseValue: "VERB" + order: 0 [to] POS_ADP sofa: _InitialView @@ -3410,13 +3513,13 @@ Token end: 511 PosValue: "PRP" coarseValue: "ADP" + order: 0 [up] -POS_X +POS sofa: _InitialView begin: 512 end: 514 PosValue: "AVP" - coarseValue: "X" [up] Lemma sofa: _InitialView @@ -3433,12 +3536,12 @@ Token begin: 512 end: 514 value: "up" - pos: POS_X + pos: POS sofa: _InitialView begin: 512 end: 514 PosValue: "AVP" - coarseValue: "X" + order: 0 [to] POS_ADP sofa: _InitialView @@ -3468,6 +3571,7 @@ Token end: 517 PosValue: "PRP" coarseValue: "ADP" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -3486,6 +3590,7 @@ Token end: 518 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [?] POS_PUNCT sofa: _InitialView @@ -3509,6 +3614,7 @@ Token end: 520 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [Yeah.] Sentence sofa: _InitialView @@ -3543,6 +3649,7 @@ Token end: 525 PosValue: "ITJ" coarseValue: "INTJ" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -3561,6 +3668,7 @@ Token end: 526 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [Okay.] Sentence sofa: _InitialView @@ -3595,6 +3703,7 @@ Token end: 531 PosValue: "AV0" coarseValue: "ADV" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -3613,6 +3722,7 @@ Token end: 532 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [Yeah.] Sentence sofa: _InitialView @@ -3647,6 +3757,7 @@ Token end: 537 PosValue: "ITJ" coarseValue: "INTJ" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -3665,6 +3776,7 @@ Token end: 538 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [erm, first twelve weeks pregnant so should I mark at the bottom when she types .] Sentence sofa: _InitialView @@ -3699,6 +3811,7 @@ Token end: 542 PosValue: "UNC" coarseValue: "X" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -3717,6 +3830,7 @@ Token end: 543 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [first] POS_NUM sofa: _InitialView @@ -3746,6 +3860,7 @@ Token end: 549 PosValue: "ORD" coarseValue: "NUM" + order: 0 [twelve] POS_NUM sofa: _InitialView @@ -3775,6 +3890,7 @@ Token end: 557 PosValue: "CRD" coarseValue: "NUM" + order: 0 [weeks] POS_NOUN sofa: _InitialView @@ -3804,6 +3920,7 @@ Token end: 563 PosValue: "NN2" coarseValue: "NOUN" + order: 0 [pregnant] POS_ADJ sofa: _InitialView @@ -3833,6 +3950,7 @@ Token end: 572 PosValue: "AJ0" coarseValue: "ADJ" + order: 0 [so] POS_ADV sofa: _InitialView @@ -3862,6 +3980,7 @@ Token end: 575 PosValue: "AV0" coarseValue: "ADV" + order: 0 [should] POS_AUX sofa: _InitialView @@ -3891,6 +4010,7 @@ Token end: 582 PosValue: "VM0" coarseValue: "AUX" + order: 0 [I] POS_PRON sofa: _InitialView @@ -3920,6 +4040,7 @@ Token end: 584 PosValue: "PNP" coarseValue: "PRON" + order: 0 [mark] POS_VERB sofa: _InitialView @@ -3949,6 +4070,7 @@ Token end: 589 PosValue: "VVI" coarseValue: "VERB" + order: 0 [at] POS_ADP sofa: _InitialView @@ -3978,6 +4100,7 @@ Token end: 592 PosValue: "PRP" coarseValue: "ADP" + order: 0 [the] POS_DET sofa: _InitialView @@ -4007,6 +4130,7 @@ Token end: 596 PosValue: "AT0" coarseValue: "DET" + order: 0 [bottom] POS_NOUN sofa: _InitialView @@ -4036,6 +4160,7 @@ Token end: 603 PosValue: "NN1-AJ0" coarseValue: "NOUN" + order: 0 [when] POS_SCONJ sofa: _InitialView @@ -4065,6 +4190,7 @@ Token end: 608 PosValue: "CJS" coarseValue: "SCONJ" + order: 0 [she] POS_PRON sofa: _InitialView @@ -4094,6 +4220,7 @@ Token end: 612 PosValue: "PNP" coarseValue: "PRON" + order: 0 [types] POS_VERB sofa: _InitialView @@ -4123,6 +4250,7 @@ Token end: 618 PosValue: "VVZ" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -4141,6 +4269,7 @@ Token end: 620 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 [Erm this one.] Sentence sofa: _InitialView @@ -4175,6 +4304,7 @@ Token end: 624 PosValue: "UNC" coarseValue: "X" + order: 0 [this] POS_DET sofa: _InitialView @@ -4204,6 +4334,7 @@ Token end: 630 PosValue: "DT0" coarseValue: "DET" + order: 0 [one] POS_PRON sofa: _InitialView @@ -4233,6 +4364,7 @@ Token end: 634 PosValue: "PNI" coarseValue: "PRON" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -4251,6 +4383,7 @@ Token end: 635 PosValue: "PUN" coarseValue: "PUNCT" + order: 0 -------- View _InitialView end ---------------------------------- ======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-bnc-asl/src/test/resources/log4j.properties b/dkpro-core-io-bnc-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-bnc-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-bnc-asl/src/test/resources/log4j2.xml b/dkpro-core-io-bnc-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-bnc-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-brat-asl/pom.xml b/dkpro-core-io-brat-asl/pom.xml index a60aa017d2..3ba10766c9 100644 --- a/dkpro-core-io-brat-asl/pom.xml +++ b/dkpro-core-io-brat-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.brat-asl + dkpro-core-io-brat-asl jar DKPro Core ASL - IO - brat file format + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -43,6 +44,18 @@ org.apache.commons commons-lang3 + + org.apache.commons + commons-collections4 + + + commons-logging + commons-logging-api + + + org.dkpro.core + dkpro-core-api-segmentation-asl + org.springframework spring-core @@ -50,15 +63,26 @@ com.fasterxml.jackson.core jackson-core - 2.4.2 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-io-asl + + + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -66,13 +90,23 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.assertj + assertj-core + test + + + org.dkpro.core + dkpro-core-testing-asl + test + + + org.dkpro.core + dkpro-core-io-conll-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.conll-asl + org.dkpro.core + dkpro-core-api-ner-asl test diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratReader.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratReader.java deleted file mode 100644 index 39c2a549e4..0000000000 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratReader.java +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.net.URL; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import org.apache.commons.io.FilenameUtils; -import org.apache.commons.io.IOUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.util.FSUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotationDocument; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAttribute; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventArgument; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratRelationAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratTextAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.RelationParam; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.TextAnnotationParam; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.TypeMapping; - -/** - * Reader for the brat format. - * - * @see brat standoff format - * @see brat configuration format - */ -@ResourceMetaData(name="Brat Reader") -public class BratReader - extends JCasResourceCollectionReader_ImplBase -{ - /** - * Name of configuration parameter that contains the character encoding used by the input files. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String sourceEncoding; - - /** - * Types that are relations. It is mandatory to provide the type name followed by two feature - * names that represent Arg1 and Arg2 separated by colons, e.g. - * de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent{A}. - * Additionally, a subcategorization feature may be specified. - */ - public static final String PARAM_RELATION_TYPES = "relationTypes"; - @ConfigurationParameter(name = PARAM_RELATION_TYPES, mandatory = true, defaultValue = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent{A}" - }) - private Set relationTypes; - private Map parsedRelationTypes; - - /** - * Types that are text annotations. It is mandatory to provide the type name which can - * optionally be followed by a subcategorization feature. Using this parameter is - * only necessary to specify a subcategorization feature. Otherwise, text annotation types are - * automatically detected. - */ - public static final String PARAM_TEXT_ANNOTATION_TYPES = "textAnnotationTypes"; - @ConfigurationParameter(name = PARAM_TEXT_ANNOTATION_TYPES, mandatory = true, defaultValue = {}) - private Set textAnnotationTypes; - private Map parsedTextAnnotationTypes; - - public static final String PARAM_TYPE_MAPPINGS = "typeMappings"; - @ConfigurationParameter(name = PARAM_TYPE_MAPPINGS, mandatory = false, defaultValue = { -// "Token -> de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", -// "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization", -// "Location -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location" - }) - private String[] typeMappings; - private TypeMapping typeMapping; - - private Map spanIdMap; - - private Set warnings; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - parsedRelationTypes = new HashMap<>(); - for (String rel : relationTypes) { - RelationParam p = RelationParam.parse(rel); - parsedRelationTypes.put(p.getType(), p); - } - - parsedTextAnnotationTypes = new HashMap<>(); - for (String rel : textAnnotationTypes) { - TextAnnotationParam p = TextAnnotationParam.parse(rel); - parsedTextAnnotationTypes.put(p.getType(), p); - } - - typeMapping = new TypeMapping(typeMappings); - - warnings = new LinkedHashSet(); - } - - @Override - public void close() - throws IOException - { - super.close(); - - for (String warning : warnings) { - getLogger().warn(warning); - } - } - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - spanIdMap = new HashMap<>(); - - Resource res = nextFile(); - initCas(aJCas, res); - - readText(aJCas, res); - readAnnotations(aJCas, res); - } - - private void readAnnotations(JCas aJCas, Resource aRes) - throws IOException - { - BratAnnotationDocument doc; - try (Reader r = new InputStreamReader(aRes.getInputStream(), sourceEncoding)) { - doc = BratAnnotationDocument.read(r); - } - - CAS cas = aJCas.getCas(); - TypeSystem ts = aJCas.getTypeSystem(); - - List relations = new ArrayList<>(); - List events = new ArrayList<>(); - for (BratAnnotation anno : doc.getAnnotations()) { - Type type = typeMapping.getUimaType(ts, anno); - if (anno instanceof BratTextAnnotation) { - create(cas, type, (BratTextAnnotation) anno); - } - else if (anno instanceof BratRelationAnnotation) { - relations.add((BratRelationAnnotation) anno); - } - else if (anno instanceof BratEventAnnotation) { - create(cas, type, (BratEventAnnotation) anno); - events.add((BratEventAnnotation) anno); - } - else { - throw new IllegalStateException("Annotation type [" + anno.getClass() - + "] is currently not supported."); - } - } - - // Go through the relations now - for (BratRelationAnnotation rel : relations) { - Type type = typeMapping.getUimaType(ts, rel); - create(cas, type, rel); - } - - // Go through the events again and handle the slots - for (BratEventAnnotation e : events) { - Type type = typeMapping.getUimaType(ts, e); - fillSlots(cas, type, doc, e); - } - } - - private void readText(JCas aJCas, Resource res) - throws IOException - { - String annUrl = res.getResource().getURL().toString(); - String textUrl = FilenameUtils.removeExtension(annUrl) + ".txt"; - - try (InputStream is = new BufferedInputStream(new URL(textUrl).openStream())) { - aJCas.setDocumentText(IOUtils.toString(is, sourceEncoding)); - } - } - - private void create(CAS aCAS, Type aType, BratTextAnnotation aAnno) - { - TextAnnotationParam param = parsedTextAnnotationTypes.get(aType.getName()); - - AnnotationFS anno = aCAS.createAnnotation(aType, aAnno.getBegin(), aAnno.getEnd()); - - if (param != null && param.getSubcat() != null) { - anno.setStringValue(getFeature(anno, param.getSubcat()), aAnno.getType()); - } - - fillAttributes(anno, aAnno.getAttributes()); - aCAS.addFsToIndexes(anno); - spanIdMap.put(aAnno.getId(), anno); - } - - private void create(CAS aCAS, Type aType, BratEventAnnotation aAnno) - { - AnnotationFS anno = aCAS.createAnnotation(aType, - aAnno.getTriggerAnnotation().getBegin(), aAnno.getTriggerAnnotation().getEnd()); - fillAttributes(anno, aAnno.getAttributes()); - - // Slots cannot be handled yet because they might point to events that have not been - // created yet. - - aCAS.addFsToIndexes(anno); - spanIdMap.put(aAnno.getId(), anno); - } - - private void create(CAS aCAS, Type aType, BratRelationAnnotation aAnno) - { - RelationParam param = parsedRelationTypes.get(aType.getName()); - - AnnotationFS arg1 = spanIdMap.get(aAnno.getArg1Target()); - AnnotationFS arg2 = spanIdMap.get(aAnno.getArg2Target()); - - FeatureStructure anno = aCAS.createFS(aType); - - anno.setFeatureValue(getFeature(anno, param.getArg1()), arg1); - anno.setFeatureValue(getFeature(anno, param.getArg2()), arg2); - - AnnotationFS anchor = null; - if (param.getFlags1().contains(RelationParam.FLAG_ANCHOR) && - param.getFlags2().contains(RelationParam.FLAG_ANCHOR)) { - throw new IllegalStateException("Only one argument can be the anchor."); - } - else if (param.getFlags1().contains(RelationParam.FLAG_ANCHOR)) { - anchor = arg1; - } - else if (param.getFlags2().contains(RelationParam.FLAG_ANCHOR)) { - anchor = arg2; - } - - if (param.getSubcat() != null) { - anno.setStringValue(getFeature(anno, param.getSubcat()), aAnno.getType()); - } - - if (anchor != null) { - anno.setIntValue(anno.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_BEGIN), - anchor.getBegin()); - anno.setIntValue(anno.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END), - anchor.getEnd()); - } - else { - TypeSystem ts = aCAS.getTypeSystem(); - if (ts.subsumes(ts.getType(CAS.TYPE_NAME_ANNOTATION), anno.getType())) { - warnings.add("Relation type [" + aType.getName() - + "] has offsets but no anchor is specified."); - } - } - - fillAttributes(anno, aAnno.getAttributes()); - - aCAS.addFsToIndexes(anno); - } - - private void fillAttributes(FeatureStructure aAnno, Collection aAttributes) - { - for (BratAttribute attr : aAttributes) { - // Try treating the attribute name as an unqualified name, then as a qualified name. - Feature feat = aAnno.getType().getFeatureByBaseName(attr.getName()); - if (feat == null) { - String featName = attr.getName().replace('_', ':'); - featName = featName.substring(featName.indexOf(TypeSystem.FEATURE_SEPARATOR) + 1); - feat = aAnno.getType().getFeatureByBaseName(featName); - } - - // FIXME HACK! We may not find a "role" feature from slot links in the target type - // because it should be in the link type. This here is a bad hack, but it should work - // as long as the target type doesn't define a "role" feature itself. - if ((("role".equals(attr.getName())) || attr.getName().endsWith("_role")) - && feat == null) { - return; - } - - if (feat == null) { - throw new IllegalStateException("Type [" + aAnno.getType().getName() - + "] has no feature named [" + attr.getName() + "]"); - } - - if (attr.getValues().length == 0) { - // Nothing to do - } - else if (attr.getValues().length == 1) { - aAnno.setFeatureValueFromString(feat, attr.getValues()[0]); - } - else { - throw new IllegalStateException("Multi-valued attributes currently not supported"); - } - } - } - - private void fillSlots(CAS aCas, Type aType, BratAnnotationDocument aDoc, BratEventAnnotation aE) - { - AnnotationFS event = spanIdMap.get(aE.getId()); - Map> groupedArgs = aE.getGroupedArguments(); - - for (Entry> slot : groupedArgs.entrySet()) { - // Resolve the target IDs to feature structures - List targets = new ArrayList<>(); - - // Lets see if there is a multi-valued feature by the name of the slot - if (FSUtil.hasFeature(event, slot.getKey()) - && FSUtil.isMultiValuedFeature(event, slot.getKey())) { - for (BratEventArgument arg : slot.getValue()) { - FeatureStructure target = spanIdMap.get(arg.getTarget()); - if (target == null) { - throw new IllegalStateException("Unable to resolve id [" + arg.getTarget() - + "]"); - } - - // Handle WebAnno-style slot links - // FIXME It would be better if the link type could be configured, e.g. what - // is the name of the link feature and what is the name of the role feature... - // but right now we just keep it hard-coded to the values that are used - // in the DKPro Core SemArgLink and that are also hard-coded in WebAnno - Type componentType = event.getType().getFeatureByBaseName(slot.getKey()) - .getRange().getComponentType(); - if (CAS.TYPE_NAME_TOP - .equals(aCas.getTypeSystem().getParent(componentType).getName())) { - BratAnnotation targetAnno = aDoc.getAnnotation(arg.getTarget()); - BratAttribute roleAttr = targetAnno.getAttribute("role"); - if (roleAttr == null) { - roleAttr = targetAnno.getAttribute( - target.getType().getName().replace('.', '-') + "_role"); - } - FeatureStructure link = aCas.createFS(componentType); - FSUtil.setFeature(link, "role", roleAttr.getValues()); - FSUtil.setFeature(link, "target", target); - target = link; - } - - targets.add(target); - } - FSUtil.setFeature(event, slot.getKey(), targets); - } - // Lets see if there is a single-valued feature by the name of the slot - else if (FSUtil.hasFeature(event, slot.getKey())) { - for (BratEventArgument arg : slot.getValue()) { - AnnotationFS target = spanIdMap.get(arg.getTarget()); - if (target == null) { - throw new IllegalStateException("Unable to resolve id [" + arg.getTarget() - + "]"); - } - - String fname = arg.getSlot() + (arg.getIndex() > 0 ? arg.getIndex() : ""); - if (FSUtil.hasFeature(event, fname)) { - FSUtil.setFeature(event, fname, target); - } - else { - throw new IllegalStateException("Type [" + event.getType().getName() - + "] has no feature naemd [" + fname + "]"); - } - } - } - else { - throw new IllegalStateException("Type [" + event.getType().getName() - + "] has no feature naemd [" + slot.getKey() + "]"); - } - } - } - - private Feature getFeature(FeatureStructure aFS, String aName) - { - Feature f = aFS.getType().getFeatureByBaseName(aName); - if (f == null) { - throw new IllegalArgumentException("Type [" + aFS.getType().getName() - + "] has no feature called [" + aName + "]"); - } - return f; - } -} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratWriter.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratWriter.java deleted file mode 100644 index 0234846f46..0000000000 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratWriter.java +++ /dev/null @@ -1,730 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat; - -import static org.apache.uima.fit.util.JCasUtil.selectAll; - -import java.io.IOException; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.io.StringWriter; -import java.io.Writer; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.util.FSUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotationDocument; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAttributeDecl; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratConfiguration; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratConstants; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventAnnotationDecl; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventArgument; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventArgumentDecl; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratRelationAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratTextAnnotation; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratTextAnnotationDrawingDecl; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.RelationParam; -import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.TypeMapping; - -/** - * Writer for the brat annotation format. - * - *

Known issues:

- * - * - * @see brat standoff format - * @see brat configuration format - */ -@ResourceMetaData(name="Brat Writer") -public class BratWriter extends JCasFileWriter_ImplBase -{ - /** - * Specify the suffix of text output files. Default value .txt. If the suffix is not - * needed, provide an empty string as value. - */ - public static final String PARAM_TEXT_FILENAME_EXTENSION = "textFilenameExtension"; - @ConfigurationParameter(name = PARAM_TEXT_FILENAME_EXTENSION, mandatory = true, defaultValue = ".txt") - private String textFilenameExtension; - - /** - * Specify the suffix of output files. Default value .ann. If the suffix is not - * needed, provide an empty string as value. - */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".ann") - private String filenameSuffix; - - /** - * Types that will not be written to the exported file. - */ - public static final String PARAM_EXCLUDE_TYPES = "excludeTypes"; - @ConfigurationParameter(name = PARAM_EXCLUDE_TYPES, mandatory = true, defaultValue = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) - private Set excludeTypes; - - /** - * Types that are text annotations (aka entities or spans). - */ - public static final String PARAM_TEXT_ANNOTATION_TYPES = "spanTypes"; - @ConfigurationParameter(name = PARAM_TEXT_ANNOTATION_TYPES, mandatory = true, defaultValue = { -// "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", -// "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", -// "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", -// "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", -// "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem", -// "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk", -// "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", -// "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg", -// "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred" - }) - private Set spanTypes; - - /** - * Types that are relations. It is mandatory to provide the type name followed by two feature - * names that represent Arg1 and Arg2 separated by colons, e.g. - * de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent. - */ - public static final String PARAM_RELATION_TYPES = "relationTypes"; - @ConfigurationParameter(name = PARAM_RELATION_TYPES, mandatory = true, defaultValue = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent" - }) - private Set relationTypes; - private Map parsedRelationTypes; - -// /** -// * Types that are events. Optionally, multiple slot features can be specified. -// * my.type.Event:location:participant. -// */ -// public static final String PARAM_EVENT_TYPES = "eventTypes"; -// @ConfigurationParameter(name = PARAM_EVENT_TYPES, mandatory = true, defaultValue = { }) -// private Set eventTypes; -// private Map parsedEventTypes; - - /** - * Enable type mappings. - */ - public static final String PARAM_ENABLE_TYPE_MAPPINGS = "enableTypeMappings"; - @ConfigurationParameter(name = PARAM_ENABLE_TYPE_MAPPINGS, mandatory = true, defaultValue = "false") - private boolean enableTypeMappings; - - /** - * FIXME - */ - public static final String PARAM_TYPE_MAPPINGS = "typeMappings"; - @ConfigurationParameter(name = PARAM_TYPE_MAPPINGS, mandatory = false, defaultValue = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.(\\w+) -> $1", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.(\\w+) -> $1", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.(\\w+) -> $1", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.(\\w+) -> $1" - }) - private String[] typeMappings; - private TypeMapping typeMapping; - - /** - * The brat web application can currently not handle attributes on relations, thus they are - * disabled by default. Here they can be enabled again. - */ - public static final String PARAM_WRITE_RELATION_ATTRIBUTES = "writeRelationAttributes"; - @ConfigurationParameter(name = PARAM_WRITE_RELATION_ATTRIBUTES, mandatory = true, defaultValue = "false") - private boolean writeRelationAttributes; - - /** - * Enable writing of features with null values. - */ - public static final String PARAM_WRITE_NULL_ATTRIBUTES = "writeNullAttributes"; - @ConfigurationParameter(name = PARAM_WRITE_NULL_ATTRIBUTES, mandatory = true, defaultValue = "false") - private boolean writeNullAttributes; - - /** - * Colors to be used for the visual configuration that is generated for brat. - */ - public static final String PARAM_PALETTE = "palette"; - @ConfigurationParameter(name = PARAM_PALETTE, mandatory = false, defaultValue = { "#8dd3c7", - "#ffffb3", "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5", "#d9d9d9", - "#bc80bd", "#ccebc5", "#ffed6f" }) - private String[] palette; - - /** - * Whether to render attributes by their short name or by their qualified name. - */ - public static final String PARAM_SHORT_ATTRIBUTE_NAMES = "shortAttributeNames"; - @ConfigurationParameter(name = PARAM_SHORT_ATTRIBUTE_NAMES, mandatory = true, defaultValue = "false") - private boolean shortAttributeNames; - - private int nextEventAnnotationId; - private int nextTextAnnotationId; - private int nextRelationAnnotationId; - private int nextAttributeId; - private int nextPaletteIndex; - private Map spanIdMap; - - private BratConfiguration conf; - - private Set warnings; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - conf = new BratConfiguration(); - - warnings = new LinkedHashSet(); - - parsedRelationTypes = new HashMap<>(); - for (String rel : relationTypes) { - RelationParam p = RelationParam.parse(rel); - parsedRelationTypes.put(p.getType(), p); - } - -// parsedEventTypes = new HashMap<>(); -// for (String rel : eventTypes) { -// EventParam p = EventParam.parse(rel); -// parsedEventTypes.put(p.getType(), p); -// } - - if (enableTypeMappings) { - typeMapping = new TypeMapping(typeMappings); - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - nextEventAnnotationId = 1; - nextTextAnnotationId = 1; - nextRelationAnnotationId = 1; - nextAttributeId = 1; - nextPaletteIndex = 0; - spanIdMap = new HashMap<>(); - - try { - if (".ann".equals(filenameSuffix)) { - writeText(aJCas); - } - writeAnnotations(aJCas); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - if (!".ann".equals(filenameSuffix)) { - return; - } - - try { - writeAnnotationConfiguration(); - writeVisualConfiguration(); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - - for (String warning : warnings) { - getLogger().warn(warning); - } - } - - private void writeAnnotationConfiguration() - throws IOException - { - try (Writer out = new OutputStreamWriter(getOutputStream("annotation", ".conf"), "UTF-8")) { - conf.writeAnnotationConfiguration(out); - } - } - - private void writeVisualConfiguration() - throws IOException - { - try (Writer out = new OutputStreamWriter(getOutputStream("visual", ".conf"), "UTF-8")) { - conf.writeVisualConfiguration(out); - } - } - - private void writeAnnotations(JCas aJCas) - throws IOException - { - BratAnnotationDocument doc = new BratAnnotationDocument(); - - List relationFS = new ArrayList<>(); - - Map eventFS = new LinkedHashMap<>(); - - // Go through all the annotations but only handle the ones that have no references to - // other annotations. - for (FeatureStructure fs : selectAll(aJCas)) { - // Skip document annotation - if (fs == aJCas.getDocumentAnnotationFs()) { - continue; - } - - // Skip excluded types - if (excludeTypes.contains(fs.getType().getName())) { - getLogger().debug("Excluding [" + fs.getType().getName() + "]"); - continue; - } - - if (spanTypes.contains(fs.getType().getName())) { - writeTextAnnotation(doc, (AnnotationFS) fs); - } - else if (parsedRelationTypes.containsKey(fs.getType().getName())) { - relationFS.add(fs); - } - else if (hasNonPrimitiveFeatures(fs) && (fs instanceof AnnotationFS)) { -// else if (parsedEventTypes.containsKey(fs.getType().getName())) { - BratEventAnnotation event = writeEventAnnotation(doc, (AnnotationFS) fs); - eventFS.put(event, fs); - } - else if (fs instanceof AnnotationFS) { - warnings.add("Assuming annotation type ["+fs.getType().getName()+"] is span"); - writeTextAnnotation(doc, (AnnotationFS) fs); - } - else { - warnings.add("Skipping annotation with type ["+fs.getType().getName()+"]"); - } - } - - // Handle relations now since now we can resolve their targets to IDs. - for (FeatureStructure fs : relationFS) { - writeRelationAnnotation(doc, fs); - } - - // Handle event slots now since now we can resolve their targets to IDs. - for (Entry e : eventFS.entrySet()) { - writeSlots(doc, e.getKey(), e.getValue()); - } - - switch (filenameSuffix) { - case ".ann": - try (Writer out = new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), "UTF-8")) { - doc.write(out); - break; - } - case ".html": - case ".json": - String template ; - if (filenameSuffix.equals(".html")) { - template = IOUtils.toString(getClass().getResource("html/template.html")); - } - else { - template = "{ \"collData\" : ##COLL-DATA## , \"docData\" : ##DOC-DATA## }"; - } - - JsonFactory jfactory = new JsonFactory(); - try (Writer out = new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), "UTF-8")) { - String docData; - try (StringWriter buf = new StringWriter()) { - try (JsonGenerator jg = jfactory.createGenerator(buf)) { - jg.useDefaultPrettyPrinter(); - doc.write(jg, aJCas.getDocumentText()); - } - docData = buf.toString(); - } - - String collData; - try (StringWriter buf = new StringWriter()) { - try (JsonGenerator jg = jfactory.createGenerator(buf)) { - jg.useDefaultPrettyPrinter(); - conf.write(jg); - } - collData = buf.toString(); - } - - template = StringUtils.replaceEach(template, - new String[] {"##COLL-DATA##", "##DOC-DATA##"}, - new String[] {collData, docData}); - - out.write(template); - } - conf = new BratConfiguration(); - break; - default: - throw new IllegalArgumentException("Unknown file format: [" + filenameSuffix + "]"); - } - } - - /** - * Checks if the feature structure has non-default non-primitive properties. - */ - private boolean hasNonPrimitiveFeatures(FeatureStructure aFS) - { - for (Feature f : aFS.getType().getFeatures()) { - if (CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())) { - continue; - } - - if (!f.getRange().isPrimitive()) { - return true; - } - } - - return false; - } - - private String getBratType(Type aType) - { - if (enableTypeMappings) { - return typeMapping.getBratType(aType); - } - else { - return aType.getName().replace('.', '-'); - } - } - - private BratEventAnnotation writeEventAnnotation(BratAnnotationDocument aDoc, AnnotationFS aFS) - { - // Write trigger annotation - BratTextAnnotation trigger = new BratTextAnnotation(nextTextAnnotationId, - getBratType(aFS.getType()), aFS.getBegin(), aFS.getEnd(), aFS.getCoveredText()); - nextTextAnnotationId++; - - // Write event annotation - BratEventAnnotation event = new BratEventAnnotation(nextEventAnnotationId, - getBratType(aFS.getType()), trigger.getId()); - spanIdMap.put(aFS, event.getId()); - nextEventAnnotationId++; - - // We do not add the trigger annotations to the document - they are owned by the event - //aDoc.addAnnotation(trigger); - event.setTriggerAnnotation(trigger); - - // Write attributes - writeAttributes(event, aFS); - - // Slots are written later after we know all the span/event IDs - - conf.addLabelDecl(event.getType(), aFS.getType().getShortName(), aFS.getType() - .getShortName().substring(0, 1)); - - if (!conf.hasDrawingDecl(event.getType())) { - conf.addDrawingDecl(new BratTextAnnotationDrawingDecl(event.getType(), "black", - palette[nextPaletteIndex % palette.length])); - nextPaletteIndex++; - } - - aDoc.addAnnotation(event); - return event; - } - - private void writeSlots(BratAnnotationDocument aDoc, BratEventAnnotation aEvent, - FeatureStructure aFS) - { - String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); - String type = getBratType(aFS.getType()); - - assert type.equals(aEvent.getType()); - - BratEventAnnotationDecl decl = conf.getEventDecl(type); - if (decl == null) { - decl = new BratEventAnnotationDecl(superType, type); - conf.addEventDecl(decl); - } - - Map> slots = new LinkedHashMap<>(); - for (Feature feat : aFS.getType().getFeatures()) { - if (!isSlotFeature(aFS, feat)) { - continue; - } - String slot = feat.getShortName(); - - List args = slots.get(slot); - if (args == null) { - args = new ArrayList<>(); - slots.put(slot, args); - } - - if ( - FSUtil.isMultiValuedFeature(aFS, feat) && - CAS.TYPE_NAME_TOP.equals(aFS.getCAS().getTypeSystem().getParent(feat.getRange().getComponentType()).getName()) && - (feat.getRange().getComponentType().getFeatureByBaseName("target") != null) && - (feat.getRange().getComponentType().getFeatureByBaseName("role") != null) - ) { - // Handle WebAnno-style slot links - // FIXME It would be better if the link type could be configured, e.g. what - // is the name of the link feature and what is the name of the role feature... - // but right now we just keep it hard-coded to the values that are used - // in the DKPro Core SemArgLink and that are also hard-coded in WebAnno - BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, - BratConstants.CARD_ZERO_OR_MORE); - decl.addSlot(slotDecl); - - FeatureStructure[] links = FSUtil.getFeature(aFS, feat, FeatureStructure[].class); - if (links != null) { - for (FeatureStructure link : links) { - FeatureStructure target = FSUtil.getFeature(link, "target", - FeatureStructure.class); - Feature roleFeat = link.getType().getFeatureByBaseName("role"); - BratEventArgument arg = new BratEventArgument(slot, args.size(), - spanIdMap.get(target)); - args.add(arg); - - // Attach the role attribute to the target span - BratAnnotation targetAnno = aDoc.getAnnotation(spanIdMap.get(target)); - writePrimitiveAttribute(targetAnno, link, roleFeat); - } - } - } - else if (FSUtil.isMultiValuedFeature(aFS, feat)) { - // Handle normal multi-valued features - BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, - BratConstants.CARD_ZERO_OR_MORE); - decl.addSlot(slotDecl); - - FeatureStructure[] targets = FSUtil.getFeature(aFS, feat, FeatureStructure[].class); - if (targets != null) { - for (FeatureStructure target : targets) { - BratEventArgument arg = new BratEventArgument(slot, args.size(), - spanIdMap.get(target)); - args.add(arg); - } - } - } - else { - // Handle normal single-valued features - BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, - BratConstants.CARD_OPTIONAL); - decl.addSlot(slotDecl); - - FeatureStructure target = FSUtil.getFeature(aFS, feat, FeatureStructure.class); - if (target != null) { - BratEventArgument arg = new BratEventArgument(slot, args.size(), - spanIdMap.get(target)); - args.add(arg); - } - } - } - - aEvent.setArguments(slots.values().stream().flatMap(args -> args.stream()) - .collect(Collectors.toList())); - } - - private boolean isSlotFeature(FeatureStructure aFS, Feature aFeature) - { - return !isInternalFeature(aFeature) - && (FSUtil.isMultiValuedFeature(aFS, aFeature) || !aFeature.getRange() - .isPrimitive()); - } - - private void writeRelationAnnotation(BratAnnotationDocument aDoc, FeatureStructure aFS) - { - RelationParam rel = parsedRelationTypes.get(aFS.getType().getName()); - - FeatureStructure arg1 = aFS.getFeatureValue(aFS.getType().getFeatureByBaseName( - rel.getArg1())); - FeatureStructure arg2 = aFS.getFeatureValue(aFS.getType().getFeatureByBaseName( - rel.getArg2())); - - if (arg1 == null || arg2 == null) { - throw new IllegalArgumentException("Dangling relation"); - } - - String arg1Id = spanIdMap.get(arg1); - String arg2Id = spanIdMap.get(arg2); - - if (arg1Id == null || arg2Id == null) { - throw new IllegalArgumentException("Unknown targets!"); - } - - String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); - String type = getBratType(aFS.getType()); - - BratRelationAnnotation anno = new BratRelationAnnotation(nextRelationAnnotationId, - type, rel.getArg1(), arg1Id, rel.getArg2(), arg2Id); - nextRelationAnnotationId++; - - conf.addRelationDecl(superType, type, rel.getArg1(), rel.getArg2()); - - conf.addLabelDecl(anno.getType(), aFS.getType().getShortName(), aFS.getType() - .getShortName().substring(0, 1)); - - aDoc.addAnnotation(anno); - - // brat doesn't support attributes on relations - // https://github.com/nlplab/brat/issues/791 - if (writeRelationAttributes) { - writeAttributes(anno, aFS); - } - } - - private void writeTextAnnotation(BratAnnotationDocument aDoc, AnnotationFS aFS) - { - String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); - String type = getBratType(aFS.getType()); - - BratTextAnnotation anno = new BratTextAnnotation(nextTextAnnotationId, type, - aFS.getBegin(), aFS.getEnd(), aFS.getCoveredText()); - nextTextAnnotationId++; - - conf.addEntityDecl(superType, type); - - conf.addLabelDecl(anno.getType(), aFS.getType().getShortName(), aFS.getType() - .getShortName().substring(0, 1)); - - if (!conf.hasDrawingDecl(anno.getType())) { - conf.addDrawingDecl(new BratTextAnnotationDrawingDecl(anno.getType(), "black", - palette[nextPaletteIndex % palette.length])); - nextPaletteIndex++; - } - - aDoc.addAnnotation(anno); - - writeAttributes(anno, aFS); - - spanIdMap.put(aFS, anno.getId()); - } - - private boolean isInternalFeature(Feature aFeature) - { - // https://issues.apache.org/jira/browse/UIMA-4565 - return "uima.cas.AnnotationBase:sofa".equals(aFeature.getName()); - // return CAS.FEATURE_FULL_NAME_SOFA.equals(aFeature.getName()); - } - - private void writeAttributes(BratAnnotation aAnno, FeatureStructure aFS) - { - for (Feature feat : aFS.getType().getFeatures()) { - // Skip Sofa feature - if (isInternalFeature(feat)) { - continue; - } - - // No need to write begin / end, they are already on the text annotation - if (CAS.FEATURE_FULL_NAME_BEGIN.equals(feat.getName()) || - CAS.FEATURE_FULL_NAME_END.equals(feat.getName())) { - continue; - } - - // No need to write link endpoints again, they are already on the relation annotation - RelationParam relParam = parsedRelationTypes.get(aFS.getType().getName()); - if (relParam != null) { - if (relParam.getArg1().equals(feat.getShortName()) - || relParam.getArg2().equals(feat.getShortName())) { - continue; - } - } - - if (feat.getRange().isPrimitive()) { - writePrimitiveAttribute(aAnno, aFS, feat); - } - // The following warning is not relevant for event annotations because these render such - // features as slots. - else if (!(aAnno instanceof BratEventAnnotation)) { - warnings.add( - "Unable to render feature [" + feat.getName() + "] with range [" - + feat.getRange().getName() + "] as attribute"); - } - } - } - - private void writePrimitiveAttribute(BratAnnotation aAnno, FeatureStructure aFS, Feature feat) - { - String featureValue = aFS.getFeatureValueAsString(feat); - - // Do not write attributes with null values unless this is explicitly enabled - if (featureValue == null && !writeNullAttributes) { - return; - } - - String attributeName = shortAttributeNames ? feat.getShortName() - : aAnno.getType() + '_' + feat.getShortName(); - - aAnno.addAttribute(nextAttributeId, attributeName, featureValue); - nextAttributeId++; - - // Do not write certain values to the visual/annotation configuration because - // they are not compatible with the brat annotation file format. The values are - // still maintained in the ann file. - if (isValidFeatureValue(featureValue)) { - // Features are inherited to subtypes in UIMA. By storing the attribute under - // the name of the type that declares the feature (domain) instead of the name - // of the actual instance we are processing, we make sure not to maintain - // multiple value sets for the same feature. - BratAttributeDecl attrDecl = conf.addAttributeDecl( - aAnno.getType(), - getAllSubtypes(aFS.getCAS().getTypeSystem(), feat.getDomain()), - attributeName, featureValue); - conf.addDrawingDecl(attrDecl); - } - } - - // This generates lots of types as well that we may not otherwise have in declared in the - // brat configuration files, but brat doesn't seem to mind. - private Set getAllSubtypes(TypeSystem aTS, Type aType) - { - Set types = new LinkedHashSet<>(); - aTS.getProperlySubsumedTypes(aType).stream().forEach(t -> types.add(getBratType(t))); - return types; - } - - /** - * Some feature values do not need to be registered or cannot be registered because brat does - * not support them. - */ - private boolean isValidFeatureValue(String aFeatureValue) - { - // https://github.com/nlplab/brat/issues/1149 - return !(aFeatureValue == null || aFeatureValue.length() == 0 || aFeatureValue.equals(",")); - } - - private void writeText(JCas aJCas) - throws IOException - { - try (OutputStream docOS = getOutputStream(aJCas, textFilenameExtension)) { - IOUtils.write(aJCas.getDocumentText(), docOS); - } - } -} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotation.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotation.java deleted file mode 100644 index 2f36401e64..0000000000 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotation.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; - -import java.io.IOException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import com.fasterxml.jackson.core.JsonGenerator; - -public class BratTextAnnotation - extends BratAnnotation -{ - private static final Pattern PATTERN = Pattern.compile( - "(?T[0-9]+)\\t" + - "(?[a-zA-Z_][a-zA-Z0-9_\\-]+) " + - "(?[0-9]+) " + - "(?[0-9]+)\\t" + - "(?.*)"); - - private static final String ID = "ID"; - private static final String TYPE = "TYPE"; - private static final String BEGIN = "BEGIN"; - private static final String END = "END"; - private static final String TEXT = "TEXT"; - - private final int begin; - private final int end; - private final String text; - - public BratTextAnnotation(int aId, String aType, int aBegin, int aEnd, String aText) - { - this("T" + aId, aType, aBegin, aEnd, aText); - } - - public BratTextAnnotation(String aId, String aType, int aBegin, int aEnd, String aText) - { - super(aId, aType); - begin = aBegin; - end = aEnd; - text = aText; - } - - public int getBegin() - { - return begin; - } - - public int getEnd() - { - return end; - } - - public String getText() - { - return text; - } - - @Override - public void write(JsonGenerator aJG) - throws IOException - { - // Format: [${ID}, ${TYPE}, [[${START}, ${END}]]] - // note that range of the offsets are [${START},${END}) - // ['T1', 'Person', [[0, 11]]] - - aJG.writeStartArray(); - aJG.writeString(getId()); - aJG.writeString(getType()); - aJG.writeStartArray(); - aJG.writeStartArray(); - aJG.writeNumber(begin); - aJG.writeNumber(end); - aJG.writeEndArray(); - aJG.writeEndArray(); - aJG.writeEndArray(); - } - - @Override - public String toString() - { - return getId() + '\t' + getType() + ' ' + begin + ' ' + end + '\t' + text; - } - - public static BratTextAnnotation parse(String aLine) - { - Matcher m = PATTERN.matcher(aLine); - - if (!m.matches()) { - throw new IllegalArgumentException("Illegal text annotation format ["+aLine+"]"); - } - - return new BratTextAnnotation(m.group(ID), m.group(TYPE), Integer.valueOf(m.group(BEGIN)), - Integer.valueOf(m.group(END)), m.group(TEXT)); - } -} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/MappingParam.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/MappingParam.java deleted file mode 100644 index 5cdb2fa8fb..0000000000 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/MappingParam.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.lang3.StringUtils; - -public class MappingParam -{ - public static final String SEP = "->"; - - private final Pattern pattern; - private final String replacement; - - private Matcher matcher; - - public MappingParam(String aPattern, String aReplacement) - { - super(); - pattern = Pattern.compile("^" + aPattern.trim() + "$"); - replacement = aReplacement.trim(); - } - - public boolean matches(String aType) - { - matcher = pattern.matcher(aType); - return matcher.matches(); - } - - public String apply() - { - return matcher.replaceFirst(replacement); - } - - public static MappingParam parse(String aMapping) - { - int sep = StringUtils.lastIndexOf(aMapping, SEP); - return new MappingParam(aMapping.substring(0, sep), aMapping.substring(sep + SEP.length())); - } -} \ No newline at end of file diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/RelationParam.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/RelationParam.java deleted file mode 100644 index a4d6f5c6b3..0000000000 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/RelationParam.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class RelationParam -{ - public static final String FLAG_ANCHOR = "A"; - - private static final Pattern PATTERN = Pattern.compile( - "(?[a-zA-Z_][a-zA-Z0-9_\\-.]+):" + - "(?[a-zA-Z][a-zA-Z0-9]+)(?\\{A?\\})?:" + - "(?[a-zA-Z][a-zA-Z0-9]+)(?\\{A?\\})?" + - "(?:[:](?[a-zA-Z][a-zA-Z0-9]+))?"); - - private static final String TYPE = "TYPE"; - private static final String ARG1 = "ARG1"; - private static final String FLAGS1 = "FLAGS1"; - private static final String ARG2 = "ARG2"; - private static final String FLAGS2 = "FLAGS2"; - private static final String SUBCAT = "SUBCAT"; - - private final String type; - private final String arg1; - private final String flags1; - private final String arg2; - private final String flags2; - private final String subcat; - - public RelationParam(String aType, String aArg1, String aFlags1, String aArg2, String aFlags2, - String aSubCat) - { - super(); - type = aType; - arg1 = aArg1; - flags1 = aFlags1; - arg2 = aArg2; - flags2 = aFlags2; - subcat = aSubCat; - } - - public String getType() - { - return type; - } - - public String getArg1() - { - return arg1; - } - - public String getFlags1() - { - return flags1 != null ? flags1 : ""; - } - - public String getArg2() - { - return arg2; - } - - public String getFlags2() - { - return flags2 != null ? flags2 : ""; - } - - public String getSubcat() - { - return subcat; - } - - public static RelationParam parse(String aValue) - { - Matcher m = PATTERN.matcher(aValue); - - if (!m.matches()) { - throw new IllegalArgumentException("Illegal relation parameter format [" + aValue + "]"); - } - - return new RelationParam(m.group(TYPE), m.group(ARG1), m.group(FLAGS1), m.group(ARG2), - m.group(FLAGS2), m.group(SUBCAT)); - } -} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/TextAnnotationParam.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/TextAnnotationParam.java deleted file mode 100644 index d9f39af30b..0000000000 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/TextAnnotationParam.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class TextAnnotationParam -{ - public static final String FLAG_ANCHOR = "A"; - - private static final Pattern PATTERN = Pattern.compile( - "(?[a-zA-Z_][a-zA-Z0-9_\\-.]+)" + - "(?:[:](?[a-zA-Z][a-zA-Z0-9]+))?"); - - private static final String TYPE = "TYPE"; - private static final String SUBCAT = "SUBCAT"; - - private final String type; - private final String subcat; - - public TextAnnotationParam(String aType, String aSubCat) - { - super(); - type = aType; - subcat = aSubCat; - } - - public String getType() - { - return type; - } - - public String getSubcat() - { - return subcat; - } - - public static TextAnnotationParam parse(String aValue) - { - Matcher m = PATTERN.matcher(aValue); - - if (!m.matches()) { - throw new IllegalArgumentException( - "Illegal text annotation parameter format [" + aValue + "]"); - } - - return new TextAnnotationParam(m.group(TYPE), m.group(SUBCAT)); - } -} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/TypeMapping.java b/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/TypeMapping.java deleted file mode 100644 index a4a391c9fd..0000000000 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/TypeMapping.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; - -public class TypeMapping -{ - private final List parsedMappings; - private final Map brat2UimaMappingCache; - private final Map uima2BratMappingCache; - - public TypeMapping(String... aMappings) - { - parsedMappings = new ArrayList<>(); - for (String m : aMappings) { - parsedMappings.add(MappingParam.parse(m)); - } - - brat2UimaMappingCache = new HashMap<>(); - uima2BratMappingCache = new HashMap<>(); - } - - private String apply(String aType) - { - String type = aType; - for (MappingParam m : parsedMappings) { - if (m.matches(aType)) { - type = m.apply(); - break; - } - } - return type; - } - - public Type getUimaType(TypeSystem aTs, BratAnnotation aAnno) - { - Type t = brat2UimaMappingCache.get(aAnno.getType()); - - if (t == null) { - // brat doesn't like dots in name names, so we had replaced them with dashes. Now revert. - String type = apply(aAnno.getType().replace("-", ".")); - t = aTs.getType(type); - - // if the lookup didn't work with replacing the dashes, try without, e.g. because the - // brat name *really* contains dashes and we only resolve them through mapping - if (t == null) { - type = apply(aAnno.getType()); - t = aTs.getType(type); - } - - brat2UimaMappingCache.put(aAnno.getType(), t); - } - - if (t == null) { - throw new IllegalStateException("Unable to find appropriate UIMA type for brat type [" - + aAnno.getType() + "]"); - } - - return t; - } - - public String getBratType(Type aType) - { - String bratType = uima2BratMappingCache.get(aType.getName()); - - if (bratType == null) { - String uimaType = aType.getName(); - - for (MappingParam m : parsedMappings) { - if (m.matches(aType.getName())) { - uimaType = m.apply(); - break; - } - } - - // brat doesn't like dots in name names, so we had replaced them with dashes. - bratType = uimaType.replace(".", "-"); - uima2BratMappingCache.put(uimaType, bratType); - } - - return bratType; - } -} diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/BratReader.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/BratReader.java new file mode 100644 index 0000000000..076a2f2d3f --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/BratReader.java @@ -0,0 +1,570 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat; + +import static java.util.stream.Collectors.toList; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.util.FSUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.brat.internal.mapping.CommentMapping; +import org.dkpro.core.io.brat.internal.mapping.Mapping; +import org.dkpro.core.io.brat.internal.mapping.RelationMapping; +import org.dkpro.core.io.brat.internal.mapping.SpanMapping; +import org.dkpro.core.io.brat.internal.mapping.TypeMapping; +import org.dkpro.core.io.brat.internal.mapping.TypeMappings; +import org.dkpro.core.io.brat.internal.model.BratAnnotation; +import org.dkpro.core.io.brat.internal.model.BratAnnotationDocument; +import org.dkpro.core.io.brat.internal.model.BratAttribute; +import org.dkpro.core.io.brat.internal.model.BratEventAnnotation; +import org.dkpro.core.io.brat.internal.model.BratEventArgument; +import org.dkpro.core.io.brat.internal.model.BratNoteAnnotation; +import org.dkpro.core.io.brat.internal.model.BratRelationAnnotation; +import org.dkpro.core.io.brat.internal.model.BratTextAnnotation; +import org.dkpro.core.io.brat.internal.model.Offsets; + +import com.fasterxml.jackson.annotation.JsonSetter; +import com.fasterxml.jackson.annotation.Nulls; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reader for the brat format. + * + * @see brat standoff format + * @see brat configuration format + */ +@ResourceMetaData(name = "Brat Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({ MimeTypes.APPLICATION_X_BRAT }) +public class BratReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * Name of configuration parameter that contains the character encoding used by the input files. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + /** + * Types that are relations. It is mandatory to provide the type name followed by two feature + * names that represent Arg1 and Arg2 separated by colons, e.g. + * + *
+     * 
+     * de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent{A}
+     * 
+     * 
+ * + * Additionally, a subcategorization feature may be specified. + */ + @Deprecated + public static final String PARAM_RELATION_TYPES = "relationTypes"; + @Deprecated + @ConfigurationParameter(name = PARAM_RELATION_TYPES, mandatory = false, defaultValue = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent{A}" }) + private Set relationTypes; + + /** + * Using this parameter is only necessary to specify a subcategorization feature for text and + * event annotation types. It is mandatory to provide the type name which can optionally be + * followed by a subcategorization feature. + */ + @Deprecated + public static final String PARAM_TEXT_ANNOTATION_TYPES = "textAnnotationTypes"; + @Deprecated + @ConfigurationParameter(name = PARAM_TEXT_ANNOTATION_TYPES, mandatory = false, defaultValue = {}) + private Set textAnnotationTypes; + + /** + * Mapping of brat text annotations (entities or events) to UIMA types, e.g. : + * + *
+     * 
+     * Country -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location
+     * 
+     * 
+ */ + @Deprecated + public static final String PARAM_TEXT_ANNOTATION_TYPE_MAPPINGS = "textAnnotationTypeMappings"; + @Deprecated + @ConfigurationParameter(name = PARAM_TEXT_ANNOTATION_TYPE_MAPPINGS, mandatory = false) + private String[] textAnnotationTypeMappings; + + /** + * Mapping of brat relation annotations to UIMA types, e.g. : + * + *
+     * 
+     * SUBJ -> de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+     * 
+     * 
+ */ + @Deprecated + public static final String PARAM_RELATION_TYPE_MAPPINGS = "relationTypeMappings"; + @Deprecated + @ConfigurationParameter(name = PARAM_RELATION_TYPE_MAPPINGS, mandatory = false) + private String[] relationTypeMappings; + + /** + * Mapping of brat notes to particular features. + */ + @Deprecated + public static final String PARAM_NOTE_MAPPINGS = "noteMappings"; + @Deprecated + @ConfigurationParameter(name = PARAM_NOTE_MAPPINGS, mandatory = false, defaultValue = {}) + private Set noteMappings; + + /** + * Configuration + */ + public static final String PARAM_MAPPING = "mapping"; + @ConfigurationParameter(name = PARAM_MAPPING, mandatory = false) + private String mappingJson; + + private Mapping mapping; + + private Map idMap; + + private Set warnings; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + if (mappingJson != null) { + ObjectMapper mapper = new ObjectMapper(); + mapper.setDefaultSetterInfo(JsonSetter.Value.forContentNulls(Nulls.AS_EMPTY)); + mapper.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true); + try { + mapping = mapper.readValue(mappingJson, Mapping.class); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + } + else { + Map parsedRelationTypes = new HashMap<>(); + for (String rel : relationTypes) { + RelationMapping p = RelationMapping.parse(rel); + parsedRelationTypes.put(p.getType(), p); + } + + Map parsedTextAnnotationTypes = new HashMap<>(); + for (String rel : textAnnotationTypes) { + SpanMapping p = SpanMapping.parse(rel); + parsedTextAnnotationTypes.put(p.getType(), p); + } + + TypeMappings textAnnotationTypeMapping = new TypeMappings(textAnnotationTypeMappings); + TypeMappings relationTypeMapping = new TypeMappings(relationTypeMappings); + + mapping = new Mapping(textAnnotationTypeMapping, relationTypeMapping, + textAnnotationTypes.stream().map(SpanMapping::parse).collect(toList()), + relationTypes.stream().map(RelationMapping::parse).collect(Collectors.toList()), + noteMappings.stream().map(CommentMapping::parse).collect(toList())); + } + + warnings = new LinkedHashSet(); + } + + @Override + public void close() throws IOException + { + super.close(); + + for (String warning : warnings) { + getLogger().warn(warning); + } + } + + @Override + public void getNext(JCas aJCas) throws IOException, CollectionException + { + idMap = new HashMap<>(); + + Resource res = nextFile(); + initCas(aJCas, res); + + readText(aJCas, res); + readAnnotations(aJCas, res); + } + + private void readAnnotations(JCas aJCas, Resource aRes) throws IOException + { + BratAnnotationDocument doc; + try (Reader r = new InputStreamReader(aRes.getInputStream(), sourceEncoding)) { + doc = BratAnnotationDocument.read(r); + } + + CAS cas = aJCas.getCas(); + TypeSystem ts = aJCas.getTypeSystem(); + + List relations = new ArrayList<>(); + List events = new ArrayList<>(); + List notes = new ArrayList<>(); + for (BratAnnotation anno : doc.getAnnotations()) { + if (anno instanceof BratTextAnnotation) { + Type type = mapping.getTextTypeMapppings().getUimaType(ts, anno); + create(cas, type, (BratTextAnnotation) anno); + } + else if (anno instanceof BratRelationAnnotation) { + relations.add((BratRelationAnnotation) anno); + } + else if (anno instanceof BratNoteAnnotation) { + notes.add((BratNoteAnnotation) anno); + } + else if (anno instanceof BratEventAnnotation) { + Type type = mapping.getTextTypeMapppings().getUimaType(ts, anno); + create(cas, type, (BratEventAnnotation) anno); + events.add((BratEventAnnotation) anno); + } + else { + throw new IllegalStateException( + "Annotation type [" + anno.getClass() + "] is currently not supported."); + } + } + + // Go through the relations now + for (BratRelationAnnotation rel : relations) { + Type type = mapping.getRelationTypeMapppings().getUimaType(ts, rel); + create(cas, type, rel); + } + + // Go through the events again and handle the slots + for (BratEventAnnotation e : events) { + Type type = mapping.getTextTypeMapppings().getUimaType(ts, e); + fillSlots(cas, type, doc, e); + } + + // Finally go through the notes and map them to features (if configured to do so) + for (BratNoteAnnotation n : notes) { + FeatureStructure anno = idMap.get(n.getTarget()); + + Type type = anno.getType(); + Collection mappings = mapping.getCommentMapping(type.getName()); + + if (mappings.isEmpty()) { + warnings.add("No comment mappings defined for note type [" + n.getType() + + "] on annotation type [" + type.getName() + "]"); + continue; + } + + List attrs = new ArrayList<>(); + for (CommentMapping m : mappings) { + if (m.matches(n.getNote())) { + attrs.add(new BratAttribute(-1, m.getFeature(), n.getTarget(), m.apply())); + } + } + fillAttributes(anno, attrs); + } + } + + private void readText(JCas aJCas, Resource res) throws IOException + { + String annUrl = res.getResource().getURL().toString(); + String textUrl = FilenameUtils.removeExtension(annUrl) + ".txt"; + + try (InputStream is = new BufferedInputStream(new URL(textUrl).openStream())) { + aJCas.setDocumentText(IOUtils.toString(is, sourceEncoding)); + } + } + + private void create(CAS aCAS, Type aType, BratTextAnnotation aAnno) + { + SpanMapping param = mapping.getSpanMapping(aType.getName()); + TypeMapping tmap = mapping.getTextTypeMapppings().getMappingByBratType(aAnno.getType()); + + for (Offsets offset : aAnno.getOffsets()) { + AnnotationFS anno = aCAS.createAnnotation(aType, offset.getBegin(), offset.getEnd()); + + if (tmap != null) { + fillDefaultAttributes(anno, tmap.getDefaultFeatureValues()); + } + + if (param != null) { + fillDefaultAttributes(anno, param.getDefaultFeatureValues()); + } + + fillAttributes(anno, aAnno.getAttributes()); + + if (param != null && param.getSubcat() != null) { + anno.setStringValue(getFeature(anno, param.getSubcat()), aAnno.getType()); + } + + aCAS.addFsToIndexes(anno); + idMap.put(aAnno.getId(), anno); + } + } + + private void create(CAS aCAS, Type aType, BratEventAnnotation aAnno) + { + SpanMapping param = mapping.getSpanMapping(aType.getName()); + TypeMapping tmap = mapping.getTextTypeMapppings().getMappingByBratType(aAnno.getType()); + + for (Offsets offset : aAnno.getTriggerAnnotation().getOffsets()) { + AnnotationFS anno = aCAS.createAnnotation(aType, offset.getBegin(), offset.getEnd()); + + if (tmap != null) { + fillDefaultAttributes(anno, tmap.getDefaultFeatureValues()); + } + + if (param != null) { + fillDefaultAttributes(anno, param.getDefaultFeatureValues()); + } + + fillAttributes(anno, aAnno.getAttributes()); + + if (param != null && param.getSubcat() != null) { + anno.setStringValue(getFeature(anno, param.getSubcat()), aAnno.getType()); + } + + // Slots cannot be handled yet because they might point to events that have not been + // created yet. + + aCAS.addFsToIndexes(anno); + idMap.put(aAnno.getId(), anno); + } + } + + private void create(CAS aCAS, Type aType, BratRelationAnnotation aAnno) + { + RelationMapping param = mapping.getRelationMapping(aType.getName()); + TypeMapping tmap = mapping.getRelationTypeMapppings().getMappingByBratType(aAnno.getType()); + + AnnotationFS arg1 = idMap.get(aAnno.getArg1Target()); + AnnotationFS arg2 = idMap.get(aAnno.getArg2Target()); + + AnnotationFS anno = aCAS.createFS(aType); + + anno.setFeatureValue(getFeature(anno, param.getArg1()), arg1); + anno.setFeatureValue(getFeature(anno, param.getArg2()), arg2); + + AnnotationFS anchor = null; + if (param.getFlags1().contains(RelationMapping.FLAG_ANCHOR) + && param.getFlags2().contains(RelationMapping.FLAG_ANCHOR)) { + throw new IllegalStateException("Only one argument can be the anchor."); + } + else if (param.getFlags1().contains(RelationMapping.FLAG_ANCHOR)) { + anchor = arg1; + } + else if (param.getFlags2().contains(RelationMapping.FLAG_ANCHOR)) { + anchor = arg2; + } + + if (tmap != null) { + fillDefaultAttributes(anno, tmap.getDefaultFeatureValues()); + } + + if (param != null) { + fillDefaultAttributes(anno, param.getDefaultFeatureValues()); + } + + fillAttributes(anno, aAnno.getAttributes()); + + if (param.getSubcat() != null) { + anno.setStringValue(getFeature(anno, param.getSubcat()), aAnno.getType()); + } + + if (anchor != null) { + anno.setIntValue(anno.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_BEGIN), + anchor.getBegin()); + anno.setIntValue(anno.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END), + anchor.getEnd()); + } + else { + TypeSystem ts = aCAS.getTypeSystem(); + if (ts.subsumes(ts.getType(CAS.TYPE_NAME_ANNOTATION), anno.getType())) { + warnings.add("Relation type [" + aType.getName() + + "] has offsets but no anchor is specified."); + } + } + + aCAS.addFsToIndexes(anno); + idMap.put(aAnno.getId(), anno); + } + + private void fillDefaultAttributes(FeatureStructure aAnno, Map aValues) + { + for (Entry e : aValues.entrySet()) { + Feature feat = aAnno.getType().getFeatureByBaseName(e.getKey()); + + if (feat == null) { + throw new IllegalStateException("Type [" + aAnno.getType().getName() + + "] has no feature named [" + e.getKey() + "]"); + } + + aAnno.setFeatureValueFromString(feat, e.getValue()); + } + } + + private void fillAttributes(FeatureStructure aAnno, Collection aAttributes) + { + for (BratAttribute attr : aAttributes) { + // Try treating the attribute name as an unqualified name, then as a qualified name. + Feature feat = aAnno.getType().getFeatureByBaseName(attr.getName()); + if (feat == null) { + String featName = attr.getName().replace('_', ':'); + featName = featName.substring(featName.indexOf(TypeSystem.FEATURE_SEPARATOR) + 1); + feat = aAnno.getType().getFeatureByBaseName(featName); + } + + // FIXME HACK! We may not find a "role" feature from slot links in the target type + // because it should be in the link type. This here is a bad hack, but it should work + // as long as the target type doesn't define a "role" feature itself. + if ((("role".equals(attr.getName())) || attr.getName().endsWith("_role")) + && feat == null) { + return; + } + + if (feat == null) { + throw new IllegalStateException("Type [" + aAnno.getType().getName() + + "] has no feature named [" + attr.getName() + "]"); + } + + if (attr.getValues().length == 0) { + // Nothing to do + } + else if (attr.getValues().length == 1) { + aAnno.setFeatureValueFromString(feat, attr.getValues()[0]); + } + else { + throw new IllegalStateException("Multi-valued attributes currently not supported"); + } + } + } + + private void fillSlots(CAS aCas, Type aType, BratAnnotationDocument aDoc, + BratEventAnnotation aE) + { + AnnotationFS event = idMap.get(aE.getId()); + Map> groupedArgs = aE.getGroupedArguments(); + + for (Entry> slot : groupedArgs.entrySet()) { + // Resolve the target IDs to feature structures + List targets = new ArrayList<>(); + + // Lets see if there is a multi-valued feature by the name of the slot + if (FSUtil.hasFeature(event, slot.getKey()) + && FSUtil.isMultiValuedFeature(event, slot.getKey())) { + for (BratEventArgument arg : slot.getValue()) { + FeatureStructure target = idMap.get(arg.getTarget()); + if (target == null) { + throw new IllegalStateException( + "Unable to resolve id [" + arg.getTarget() + "]"); + } + + // Handle WebAnno-style slot links + // FIXME It would be better if the link type could be configured, e.g. what + // is the name of the link feature and what is the name of the role feature... + // but right now we just keep it hard-coded to the values that are used + // in the DKPro Core SemArgLink and that are also hard-coded in WebAnno + Type componentType = event.getType().getFeatureByBaseName(slot.getKey()) + .getRange().getComponentType(); + if (CAS.TYPE_NAME_TOP + .equals(aCas.getTypeSystem().getParent(componentType).getName())) { + BratAnnotation targetAnno = aDoc.getAnnotation(arg.getTarget()); + BratAttribute roleAttr = targetAnno.getAttribute("role"); + if (roleAttr == null) { + roleAttr = targetAnno.getAttribute( + target.getType().getName().replace('.', '-') + "_role"); + } + FeatureStructure link = aCas.createFS(componentType); + if (roleAttr != null) { + FSUtil.setFeature(link, "role", roleAttr.getValues()); + } + FSUtil.setFeature(link, "target", target); + target = link; + } + + targets.add(target); + } + FSUtil.setFeature(event, slot.getKey(), targets); + } + // Lets see if there is a single-valued feature by the name of the slot + else if (FSUtil.hasFeature(event, slot.getKey())) { + for (BratEventArgument arg : slot.getValue()) { + AnnotationFS target = idMap.get(arg.getTarget()); + if (target == null) { + throw new IllegalStateException( + "Unable to resolve id [" + arg.getTarget() + "]"); + } + + String fname = arg.getSlot() + (arg.getIndex() > 0 ? arg.getIndex() : ""); + if (FSUtil.hasFeature(event, fname)) { + FSUtil.setFeature(event, fname, target); + } + else { + throw new IllegalStateException("Type [" + event.getType().getName() + + "] has no feature named [" + fname + "]"); + } + } + } + else { + throw new IllegalStateException("Type [" + event.getType().getName() + + "] has no feature named [" + slot.getKey() + "]"); + } + } + } + + private Feature getFeature(FeatureStructure aFS, String aName) + { + Feature f = aFS.getType().getFeatureByBaseName(aName); + if (f == null) { + throw new IllegalArgumentException( + "Type [" + aFS.getType().getName() + "] has no feature named [" + aName + "]"); + } + return f; + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/BratWriter.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/BratWriter.java new file mode 100644 index 0000000000..84c65319fc --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/BratWriter.java @@ -0,0 +1,322 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.StringWriter; +import java.io.Writer; +import java.util.Collection; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.brat.internal.mapping.RelationMapping; +import org.dkpro.core.io.brat.internal.mapping.TypeMappings; +import org.dkpro.core.io.brat.internal.model.BratAnnotationDocument; +import org.dkpro.core.io.brat.internal.model.BratConfiguration; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Writer for the brat annotation format. + * + *

+ * Known issues: + *

+ * + * + * @see brat standoff format + * @see brat configuration format + */ +@ResourceMetaData(name = "Brat Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({ MimeTypes.APPLICATION_X_BRAT }) +public class BratWriter + extends JCasFileWriter_ImplBase +{ + /** + * Specify the suffix of text output files. Default value .txt. If the suffix is + * not needed, provide an empty string as value. + */ + public static final String PARAM_TEXT_FILENAME_EXTENSION = "textFilenameExtension"; + @ConfigurationParameter(name = PARAM_TEXT_FILENAME_EXTENSION, mandatory = true, defaultValue = ".txt") + private String textFilenameExtension; + + /** + * Specify the suffix of output files. Default value .ann. If the suffix is not + * needed, provide an empty string as value. + */ + public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".ann") + private String filenameSuffix; + + /** + * Types that will not be written to the exported file. + */ + public static final String PARAM_EXCLUDE_TYPES = "excludeTypes"; + @ConfigurationParameter(name = PARAM_EXCLUDE_TYPES, mandatory = true, defaultValue = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) + private Set excludeTypes; + + /** + * Types that are text annotations (aka entities or spans). + */ + public static final String PARAM_TEXT_ANNOTATION_TYPES = "spanTypes"; + @ConfigurationParameter(name = PARAM_TEXT_ANNOTATION_TYPES, mandatory = true, defaultValue = { + // "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + // "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + // "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + // "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + // "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem", + // "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk", + // "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + // "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg", + // "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred" + }) + private Set spanTypes; + + /** + * Types that are relations. It is mandatory to provide the type name followed by two feature + * names that represent Arg1 and Arg2 separated by colons, e.g. + * de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent + * . + */ + public static final String PARAM_RELATION_TYPES = "relationTypes"; + @ConfigurationParameter(name = PARAM_RELATION_TYPES, mandatory = true, defaultValue = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent" }) + private Set relationTypes; + + // /** + // * Types that are events. Optionally, multiple slot features can be specified. + // * my.type.Event:location:participant. + // */ + // public static final String PARAM_EVENT_TYPES = "eventTypes"; + // @ConfigurationParameter(name = PARAM_EVENT_TYPES, mandatory = true, defaultValue = { }) + // private Set eventTypes; + // private Map parsedEventTypes; + + /** + * Enable type mappings. + */ + public static final String PARAM_ENABLE_TYPE_MAPPINGS = "enableTypeMappings"; + @ConfigurationParameter(name = PARAM_ENABLE_TYPE_MAPPINGS, mandatory = true, defaultValue = "false") + private boolean enableTypeMappings; + + /** + * FIXME + */ + public static final String PARAM_TYPE_MAPPINGS = "typeMappings"; + @ConfigurationParameter(name = PARAM_TYPE_MAPPINGS, mandatory = false, defaultValue = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.(\\w+) -> $1", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.(\\w+) -> $1", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.(\\w+) -> $1", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.(\\w+) -> $1" }) + private String[] typeMappings; + + /** + * The brat web application can currently not handle attributes on relations, thus they are + * disabled by default. Here they can be enabled again. + */ + public static final String PARAM_WRITE_RELATION_ATTRIBUTES = "writeRelationAttributes"; + @ConfigurationParameter(name = PARAM_WRITE_RELATION_ATTRIBUTES, mandatory = true, defaultValue = "false") + private boolean writeRelationAttributes; + + /** + * Enable writing of features with null values. + */ + public static final String PARAM_WRITE_NULL_ATTRIBUTES = "writeNullAttributes"; + @ConfigurationParameter(name = PARAM_WRITE_NULL_ATTRIBUTES, mandatory = true, defaultValue = "false") + private boolean writeNullAttributes; + + /** + * Colors to be used for the visual configuration that is generated for brat. + */ + public static final String PARAM_PALETTE = "palette"; + @ConfigurationParameter(name = PARAM_PALETTE, mandatory = false, defaultValue = { "#8dd3c7", + "#ffffb3", "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5", "#d9d9d9", + "#bc80bd", "#ccebc5", "#ffed6f" }) + private String[] palette; + + /** + * Whether to render attributes by their short name or by their qualified name. + */ + public static final String PARAM_SHORT_ATTRIBUTE_NAMES = "shortAttributeNames"; + @ConfigurationParameter(name = PARAM_SHORT_ATTRIBUTE_NAMES, mandatory = true, defaultValue = "false") + private boolean shortAttributeNames; + + private BratConfiguration conf; + private DKPro2Brat converter; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + // parsedEventTypes = new HashMap<>(); + // for (String rel : eventTypes) { + // EventParam p = EventParam.parse(rel); + // parsedEventTypes.put(p.getType(), p); + // } + + conf = new BratConfiguration(); + converter = new DKPro2Brat(conf); + converter.setWriteNullAttributes(writeNullAttributes); + converter.setWriteRelationAttributes(writeRelationAttributes); + converter.setShortAttributeNames(shortAttributeNames); + converter.setPalette(palette); + converter.setExcludeTypes(excludeTypes); + converter.setSpanTypes(spanTypes); + converter.setRelationTypes( + relationTypes.stream().map(RelationMapping::parse).collect(Collectors.toList())); + if (enableTypeMappings) { + converter.setTypeMapping(new TypeMappings(typeMappings)); + } + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + try { + if (".ann".equals(filenameSuffix)) { + writeText(aJCas); + } + writeAnnotations(aJCas); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException + { + if (!".ann".equals(filenameSuffix)) { + return; + } + + try { + writeAnnotationConfiguration(); + writeVisualConfiguration(); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + private void writeAnnotationConfiguration() throws IOException + { + try (Writer out = new OutputStreamWriter(getOutputStream("annotation", ".conf"), "UTF-8")) { + conf.writeAnnotationConfiguration(out); + } + } + + private void writeVisualConfiguration() throws IOException + { + try (Writer out = new OutputStreamWriter(getOutputStream("visual", ".conf"), "UTF-8")) { + conf.writeVisualConfiguration(out); + } + } + + private void writeAnnotations(JCas aJCas) throws IOException + { + BratAnnotationDocument doc = new BratAnnotationDocument(); + + Collection warnings = converter.convert(aJCas, doc); + + for (String warning : warnings) { + getLogger().warn(warning); + } + + switch (filenameSuffix) { + case ".ann": + try (Writer out = new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), + "UTF-8")) { + doc.write(out); + break; + } + case ".html": + case ".json": + String template; + if (filenameSuffix.equals(".html")) { + template = IOUtils.toString(getClass().getResource("html/template.html")); + } + else { + template = "{ \"collData\" : ##COLL-DATA## , \"docData\" : ##DOC-DATA## }"; + } + + JsonFactory jfactory = new JsonFactory(); + try (Writer out = new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), + "UTF-8")) { + String docData; + try (StringWriter buf = new StringWriter()) { + try (JsonGenerator jg = jfactory.createGenerator(buf)) { + jg.useDefaultPrettyPrinter(); + doc.write(jg, aJCas.getDocumentText()); + } + docData = buf.toString(); + } + + String collData; + try (StringWriter buf = new StringWriter()) { + try (JsonGenerator jg = jfactory.createGenerator(buf)) { + jg.useDefaultPrettyPrinter(); + conf.write(jg); + } + collData = buf.toString(); + } + + template = StringUtils.replaceEach(template, + new String[] { "##COLL-DATA##", "##DOC-DATA##" }, + new String[] { collData, docData }); + + out.write(template); + } + conf = new BratConfiguration(); + break; + default: + throw new IllegalArgumentException("Unknown file format: [" + filenameSuffix + "]"); + } + } + + private void writeText(JCas aJCas) throws IOException + { + try (OutputStream docOS = getOutputStream(aJCas, textFilenameExtension)) { + IOUtils.write(aJCas.getDocumentText(), docOS); + } + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/DKPro2Brat.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/DKPro2Brat.java new file mode 100644 index 0000000000..18392e2253 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/DKPro2Brat.java @@ -0,0 +1,629 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat; + +import static org.apache.uima.cas.CAS.TYPE_NAME_BYTE; +import static org.apache.uima.cas.CAS.TYPE_NAME_DOUBLE; +import static org.apache.uima.cas.CAS.TYPE_NAME_FLOAT; +import static org.apache.uima.cas.CAS.TYPE_NAME_INTEGER; +import static org.apache.uima.cas.CAS.TYPE_NAME_LONG; +import static org.apache.uima.cas.CAS.TYPE_NAME_SHORT; +import static org.apache.uima.fit.util.JCasUtil.selectAll; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.util.FSUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.brat.internal.mapping.RelationMapping; +import org.dkpro.core.io.brat.internal.mapping.TypeMappings; +import org.dkpro.core.io.brat.internal.model.BratAnnotation; +import org.dkpro.core.io.brat.internal.model.BratAnnotationDocument; +import org.dkpro.core.io.brat.internal.model.BratAttributeDecl; +import org.dkpro.core.io.brat.internal.model.BratConfiguration; +import org.dkpro.core.io.brat.internal.model.BratConstants; +import org.dkpro.core.io.brat.internal.model.BratEventAnnotation; +import org.dkpro.core.io.brat.internal.model.BratEventAnnotationDecl; +import org.dkpro.core.io.brat.internal.model.BratEventArgument; +import org.dkpro.core.io.brat.internal.model.BratEventArgumentDecl; +import org.dkpro.core.io.brat.internal.model.BratRelationAnnotation; +import org.dkpro.core.io.brat.internal.model.BratTextAnnotation; +import org.dkpro.core.io.brat.internal.model.BratTextAnnotationDrawingDecl; +import org.dkpro.core.io.brat.internal.model.Offsets; + +public class DKPro2Brat +{ + private final Log log = LogFactory.getLog(getClass()); + + private final static Pattern NEWLINE_EXTRACT_PATTERN = Pattern.compile("(.+?)(?:\\R|$)+"); + + private final BratConfiguration conf; + + private int nextEventAnnotationId; + private int nextTextAnnotationId; + private int nextRelationAnnotationId; + private int nextAttributeId; + private int nextPaletteIndex; + private Map spanIdMap; + + private Set warnings; + + private String[] palette = new String[] { "#8dd3c7", "#ffffb3", "#bebada", "#fb8072", "#80b1d3", + "#fdb462", "#b3de69", "#fccde5", "#d9d9d9", "#bc80bd", "#ccebc5", "#ffed6f" }; + private Set excludeTypes = Collections + .singleton("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"); + private Set spanTypes = new HashSet<>(); + private Map parsedRelationTypes = new HashMap<>(); + private TypeMappings typeMapping; + + private boolean writeRelationAttributes; + private boolean writeNullAttributes; + private boolean shortAttributeNames; + + public DKPro2Brat(BratConfiguration aConf) + { + super(); + conf = aConf; + } + + public boolean isWriteRelationAttributes() + { + return writeRelationAttributes; + } + + public void setWriteRelationAttributes(boolean aWriteRelationAttributes) + { + writeRelationAttributes = aWriteRelationAttributes; + } + + public boolean isWriteNullAttributes() + { + return writeNullAttributes; + } + + public void setWriteNullAttributes(boolean aWriteNullAttributes) + { + writeNullAttributes = aWriteNullAttributes; + } + + public boolean isShortAttributeNames() + { + return shortAttributeNames; + } + + public void setShortAttributeNames(boolean aShortAttributeNames) + { + shortAttributeNames = aShortAttributeNames; + } + + + + public String[] getPalette() + { + return palette; + } + + public void setPalette(String[] aPalette) + { + palette = aPalette; + } + + public Set getExcludeTypes() + { + return excludeTypes; + } + + public void setExcludeTypes(Set aExcludeTypes) + { + excludeTypes = aExcludeTypes; + } + + public Map getRelationTypes() + { + return parsedRelationTypes; + } + + public void setRelationTypes(Collection aRelationTypes) + { + aRelationTypes.stream().forEachOrdered(p -> parsedRelationTypes.put(p.getType(), p)); + } + + public Set getSpanTypes() + { + return spanTypes; + } + + public void setSpanTypes(Set aSpanTypes) + { + spanTypes = aSpanTypes; + } + + public TypeMappings getTypeMapping() + { + return typeMapping; + } + + public void setTypeMapping(TypeMappings aTypeMapping) + { + typeMapping = aTypeMapping; + } + + private void init() + { + nextEventAnnotationId = 1; + nextTextAnnotationId = 1; + nextRelationAnnotationId = 1; + nextAttributeId = 1; + nextPaletteIndex = 0; + spanIdMap = new HashMap<>(); + warnings = new LinkedHashSet<>(); + } + + public Set convert(JCas aJCas, BratAnnotationDocument doc) + { + init(); + + List relationFS = new ArrayList<>(); + + Map eventFS = new LinkedHashMap<>(); + + // Go through all the annotations but only handle the ones that have no references to + // other annotations. + for (FeatureStructure fs : selectAll(aJCas)) { + // Skip document annotation + if (fs == aJCas.getDocumentAnnotationFs()) { + continue; + } + + // Skip excluded types + if (excludeTypes.contains(fs.getType().getName())) { + log.debug("Excluding [" + fs.getType().getName() + "]"); + continue; + } + + if (spanTypes.contains(fs.getType().getName())) { + writeTextAnnotation(doc, (AnnotationFS) fs); + } + else if (parsedRelationTypes.containsKey(fs.getType().getName())) { + relationFS.add(fs); + } + else if (hasNonPrimitiveFeatures(fs) && (fs instanceof AnnotationFS)) { +// else if (parsedEventTypes.containsKey(fs.getType().getName())) { + BratEventAnnotation event = writeEventAnnotation(doc, (AnnotationFS) fs); + eventFS.put(event, fs); + } + else if (fs instanceof AnnotationFS) { + warnings.add("Assuming annotation type [" + fs.getType().getName() + "] is span"); + writeTextAnnotation(doc, (AnnotationFS) fs); + } + else { + warnings.add("Skipping annotation with type [" + fs.getType().getName() + "]"); + } + } + + // Handle relations now since now we can resolve their targets to IDs. + for (FeatureStructure fs : relationFS) { + writeRelationAnnotation(doc, fs); + } + + // Handle event slots now since now we can resolve their targets to IDs. + for (Entry e : eventFS.entrySet()) { + writeSlots(doc, e.getKey(), e.getValue()); + } + + return warnings; + } + + /** + * Checks if the feature structure has non-default non-primitive properties. + */ + private boolean hasNonPrimitiveFeatures(FeatureStructure aFS) + { + for (Feature f : aFS.getType().getFeatures()) { + if (CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())) { + continue; + } + + if (!f.getRange().isPrimitive()) { + return true; + } + } + + return false; + } + + private BratEventAnnotation writeEventAnnotation(BratAnnotationDocument aDoc, AnnotationFS aFS) + { + + // Write trigger annotation + BratTextAnnotation trigger = splitNewline(aFS); + + nextTextAnnotationId++; + + // Write event annotation + BratEventAnnotation event = new BratEventAnnotation(nextEventAnnotationId, + getBratType(aFS.getType()), trigger.getId()); + spanIdMap.put(aFS, event.getId()); + nextEventAnnotationId++; + + // We do not add the trigger annotations to the document - they are owned by the event + //aDoc.addAnnotation(trigger); + event.setTriggerAnnotation(trigger); + + // Write attributes + writeAttributes(event, aFS); + + // Slots are written later after we know all the span/event IDs + + conf.addLabelDecl(event.getType(), aFS.getType().getShortName(), aFS.getType() + .getShortName().substring(0, 1)); + + if (!conf.hasDrawingDecl(event.getType())) { + conf.addDrawingDecl(new BratTextAnnotationDrawingDecl(event.getType(), "black", + palette[nextPaletteIndex % palette.length])); + nextPaletteIndex++; + } + + aDoc.addAnnotation(event); + return event; + } + + private void writeTextAnnotation(BratAnnotationDocument aDoc, AnnotationFS aFS) + { + String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); + String type = getBratType(aFS.getType()); + BratTextAnnotation anno = splitNewline(aFS); + + nextTextAnnotationId++; + + conf.addEntityDecl(superType, type); + + conf.addLabelDecl(anno.getType(), aFS.getType().getShortName(), aFS.getType() + .getShortName().substring(0, 1)); + + if (!conf.hasDrawingDecl(anno.getType())) { + conf.addDrawingDecl(new BratTextAnnotationDrawingDecl(anno.getType(), "black", + palette[nextPaletteIndex % palette.length])); + nextPaletteIndex++; + } + + aDoc.addAnnotation(anno); + + writeAttributes(anno, aFS); + + spanIdMap.put(aFS, anno.getId()); + } + + private void writeRelationAnnotation(BratAnnotationDocument aDoc, FeatureStructure aFS) + { + RelationMapping rel = parsedRelationTypes.get(aFS.getType().getName()); + + FeatureStructure arg1 = aFS.getFeatureValue(aFS.getType().getFeatureByBaseName( + rel.getArg1())); + FeatureStructure arg2 = aFS.getFeatureValue(aFS.getType().getFeatureByBaseName( + rel.getArg2())); + + if (arg1 == null || arg2 == null) { + throw new IllegalArgumentException("Dangling relation"); + } + + String arg1Id = spanIdMap.get(arg1); + String arg2Id = spanIdMap.get(arg2); + + if (arg1Id == null || arg2Id == null) { + throw new IllegalArgumentException("Unknown targets!"); + } + + String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); + String type = getBratType(aFS.getType()); + + BratRelationAnnotation anno = new BratRelationAnnotation(nextRelationAnnotationId, + type, rel.getArg1(), arg1Id, rel.getArg2(), arg2Id); + nextRelationAnnotationId++; + + conf.addRelationDecl(superType, type, rel.getArg1(), rel.getArg2()); + + conf.addLabelDecl(anno.getType(), aFS.getType().getShortName(), aFS.getType() + .getShortName().substring(0, 1)); + + aDoc.addAnnotation(anno); + + // brat doesn't support attributes on relations + // https://github.com/nlplab/brat/issues/791 + if (writeRelationAttributes) { + writeAttributes(anno, aFS); + } + } + + private void writeAttributes(BratAnnotation aAnno, FeatureStructure aFS) + { + for (Feature feat : aFS.getType().getFeatures()) { + // Skip Sofa feature + if (isInternalFeature(feat)) { + continue; + } + + // No need to write begin / end, they are already on the text annotation + if (CAS.FEATURE_FULL_NAME_BEGIN.equals(feat.getName()) || + CAS.FEATURE_FULL_NAME_END.equals(feat.getName())) { + continue; + } + + // No need to write link endpoints again, they are already on the relation annotation + RelationMapping relParam = parsedRelationTypes.get(aFS.getType().getName()); + if (relParam != null) { + if (relParam.getArg1().equals(feat.getShortName()) + || relParam.getArg2().equals(feat.getShortName())) { + continue; + } + } + + if (feat.getRange().isPrimitive()) { + writePrimitiveAttribute(aAnno, aFS, feat); + } + // The following warning is not relevant for event annotations because these render such + // features as slots. + else if (!(aAnno instanceof BratEventAnnotation)) { + warnings.add( + "Unable to render feature [" + feat.getName() + "] with range [" + + feat.getRange().getName() + "] as attribute"); + } + } + } + + private void writeSlots(BratAnnotationDocument aDoc, BratEventAnnotation aEvent, + FeatureStructure aFS) + { + String superType = getBratType(aFS.getCAS().getTypeSystem().getParent(aFS.getType())); + String type = getBratType(aFS.getType()); + + assert type.equals(aEvent.getType()); + + BratEventAnnotationDecl decl = conf.getEventDecl(type); + if (decl == null) { + decl = new BratEventAnnotationDecl(superType, type); + conf.addEventDecl(decl); + } + + Map> slots = new LinkedHashMap<>(); + for (Feature feat : aFS.getType().getFeatures()) { + if (!isSlotFeature(aFS, feat)) { + continue; + } + String slot = feat.getShortName(); + + List args = slots.get(slot); + if (args == null) { + args = new ArrayList<>(); + slots.put(slot, args); + } + + if ( + FSUtil.isMultiValuedFeature(aFS, feat) + // this can only be true for array types + && feat.getRange().getComponentType() != null + // Avoid calling getParent on TOP + && !CAS.TYPE_NAME_TOP.equals(feat.getRange().getComponentType().getName()) + && CAS.TYPE_NAME_TOP.equals(aFS.getCAS().getTypeSystem() + .getParent(feat.getRange().getComponentType()).getName()) + && (feat.getRange().getComponentType().getFeatureByBaseName("target") != null) + && (feat.getRange().getComponentType().getFeatureByBaseName("role") != null) + ) { + // Handle WebAnno-style slot links + // FIXME It would be better if the link type could be configured, e.g. what + // is the name of the link feature and what is the name of the role feature... + // but right now we just keep it hard-coded to the values that are used + // in the DKPro Core SemArgLink and that are also hard-coded in WebAnno + BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, + BratConstants.CARD_ZERO_OR_MORE); + decl.addSlot(slotDecl); + + FeatureStructure[] links = FSUtil.getFeature(aFS, feat, FeatureStructure[].class); + if (links != null) { + for (FeatureStructure link : links) { + FeatureStructure target = FSUtil.getFeature(link, "target", + FeatureStructure.class); + Feature roleFeat = link.getType().getFeatureByBaseName("role"); + BratEventArgument arg = new BratEventArgument(slot, args.size(), + spanIdMap.get(target)); + args.add(arg); + + // Attach the role attribute to the target span + BratAnnotation targetAnno = aDoc.getAnnotation(spanIdMap.get(target)); + writePrimitiveAttribute(targetAnno, link, roleFeat); + } + } + } + else if (FSUtil.isMultiValuedFeature(aFS, feat)) { + // Handle normal multi-valued features + BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, + BratConstants.CARD_ZERO_OR_MORE); + decl.addSlot(slotDecl); + + FeatureStructure[] targets = FSUtil.getFeature(aFS, feat, FeatureStructure[].class); + if (targets != null) { + for (FeatureStructure target : targets) { + BratEventArgument arg = new BratEventArgument(slot, args.size(), + spanIdMap.get(target)); + args.add(arg); + } + } + } + else { + // Handle normal single-valued features + BratEventArgumentDecl slotDecl = new BratEventArgumentDecl(slot, + BratConstants.CARD_OPTIONAL); + decl.addSlot(slotDecl); + + FeatureStructure target = FSUtil.getFeature(aFS, feat, FeatureStructure.class); + if (target != null) { + BratEventArgument arg = new BratEventArgument(slot, args.size(), + spanIdMap.get(target)); + args.add(arg); + } + } + } + + aEvent.setArguments(slots.values().stream().flatMap(args -> args.stream()) + .collect(Collectors.toList())); + } + + private boolean isSlotFeature(FeatureStructure aFS, Feature aFeature) + { + return !isInternalFeature(aFeature) + && (FSUtil.isMultiValuedFeature(aFS, aFeature) || !aFeature.getRange() + .isPrimitive()); + } + + + + + private boolean isInternalFeature(Feature aFeature) + { + // https://issues.apache.org/jira/browse/UIMA-4565 + return "uima.cas.AnnotationBase:sofa".equals(aFeature.getName()); + // return CAS.FEATURE_FULL_NAME_SOFA.equals(aFeature.getName()); + } + + private void writePrimitiveAttribute(BratAnnotation aAnno, FeatureStructure aFS, Feature feat) + { + String featureValue = aFS.getFeatureValueAsString(feat); + String rangeType = feat.getRange().getName(); + + // Do not write attributes with null values unless this is explicitly enabled + if ( + !writeNullAttributes + && + ( + // null value + featureValue == null + || + ( + // zero value for integer values + "0".equals(featureValue) + && + ( + TYPE_NAME_BYTE.equals(rangeType) || + TYPE_NAME_SHORT.equals(rangeType) || + TYPE_NAME_INTEGER.equals(rangeType) || + TYPE_NAME_LONG.equals(rangeType) + ) + ) + // zero value for float values + || + ( + TYPE_NAME_DOUBLE.equals(rangeType) && + aFS.getDoubleValue(feat) == 0.0d + ) + || + ( + TYPE_NAME_FLOAT.equals(rangeType) && + aFS.getFloatValue(feat) == 0.0f + ) + ) + ) { + return; + } + + String attributeName = shortAttributeNames ? feat.getShortName() + : aAnno.getType() + '_' + feat.getShortName(); + + aAnno.addAttribute(nextAttributeId, attributeName, featureValue); + nextAttributeId++; + + // Do not write certain values to the visual/annotation configuration because + // they are not compatible with the brat annotation file format. The values are + // still maintained in the ann file. + if (isValidFeatureValue(featureValue)) { + // Features are inherited to subtypes in UIMA. By storing the attribute under + // the name of the type that declares the feature (domain) instead of the name + // of the actual instance we are processing, we make sure not to maintain + // multiple value sets for the same feature. + BratAttributeDecl attrDecl = conf.addAttributeDecl( + aAnno.getType(), + getAllSubtypes(aFS.getCAS().getTypeSystem(), feat.getDomain()), + attributeName, featureValue); + conf.addDrawingDecl(attrDecl); + } + } + + // This generates lots of types as well that we may not otherwise have in declared in the + // brat configuration files, but brat doesn't seem to mind. + private Set getAllSubtypes(TypeSystem aTS, Type aType) + { + Set types = new LinkedHashSet<>(); + aTS.getProperlySubsumedTypes(aType).stream().forEach(t -> types.add(getBratType(t))); + return types; + } + + /** + * Some feature values do not need to be registered or cannot be registered because brat does + * not support them. + */ + private boolean isValidFeatureValue(String aFeatureValue) + { + // https://github.com/nlplab/brat/issues/1149 + return !(aFeatureValue == null || aFeatureValue.length() == 0 || aFeatureValue.equals(",")); + } + + private BratTextAnnotation splitNewline(AnnotationFS aFS) + { + + // extract all but newlines as groups + Matcher m = NEWLINE_EXTRACT_PATTERN.matcher(aFS.getCoveredText()); + List offsets = new ArrayList<>(); + while (m.find()) { + Offsets offset = new Offsets(m.start(1) + aFS.getBegin(), m.end(1) + aFS.getBegin() ); + offsets.add(offset); + } + // replaces any group of newline by one space + String[] texts = new String[] { aFS.getCoveredText().replaceAll("\\R+", " ") }; + return new BratTextAnnotation(nextTextAnnotationId, getBratType(aFS.getType()), offsets, + texts); + } + + private String getBratType(Type aType) + { + if (typeMapping != null) { + return typeMapping.getBratType(aType); + } + else { + return aType.getName().replace('.', '-'); + } + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/CommentMapping.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/CommentMapping.java new file mode 100644 index 0000000000..00705d68d8 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/CommentMapping.java @@ -0,0 +1,104 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.mapping; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +public class CommentMapping +{ + private static final Pattern PATTERN = Pattern.compile( + "(?[a-zA-Z_][a-zA-Z0-9_\\-.]+)" + + "[:](?[a-zA-Z][a-zA-Z0-9]+)"); + + private static final String TYPE = "TYPE"; + private static final String FEAT = "FEAT"; + + private final String type; + private final String feature; + private final Pattern pattern; + private final String replacement; + + private Matcher matcher; + private String value; + + /** + * Jackson requires this constructor - even if it is private - do not use! + */ + @SuppressWarnings("unused") + private CommentMapping() + { + this(null, null, null, null); + } + + @JsonCreator + public CommentMapping( + @JsonProperty("type") String aType, + @JsonProperty("feature") String aFeature, + @JsonProperty("match") String aMatch, + @JsonProperty("replace") String aReplace) + { + type = aType; + feature = aFeature; + pattern = Pattern.compile(aMatch != null ? aMatch : ".*"); + replacement = aReplace; + } + + public CommentMapping( + @JsonProperty("type") String aType, + @JsonProperty("feature")String aFeature) + { + this(aType, aFeature, null, null); + } + + public String getType() + { + return type; + } + + public String getFeature() + { + return feature; + } + + public boolean matches(String aValue) + { + value = aValue; + matcher = pattern.matcher(aValue); + return matcher.matches(); + } + + public String apply() + { + return replacement != null ? matcher.replaceFirst(replacement) : value; + } + + public static CommentMapping parse(String aValue) + { + Matcher m = PATTERN.matcher(aValue); + + if (!m.matches()) { + throw new IllegalArgumentException("Illegal note mapping parameter format [" + aValue + "]"); + } + + return new CommentMapping(m.group(TYPE), m.group(FEAT)); + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/EventParam.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/EventParam.java similarity index 96% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/EventParam.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/EventParam.java index d0f65cacb1..8f982fca1a 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/EventParam.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/EventParam.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.mapping; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/Mapping.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/Mapping.java new file mode 100644 index 0000000000..4428a40a88 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/Mapping.java @@ -0,0 +1,88 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.mapping; + +import static java.util.Collections.emptyMap; +import static java.util.function.Function.identity; +import static java.util.stream.Collectors.toMap; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import org.apache.commons.collections4.MultiValuedMap; +import org.apache.commons.collections4.multimap.ArrayListValuedHashMap; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Mapping +{ + private final TypeMappings textTypeMapppings; + private final TypeMappings relationTypeMapppings; + private final Map textAnnotations; + private final Map relations; + private final MultiValuedMap comments; + + @JsonCreator + public Mapping( + @JsonProperty(value = "textTypeMapppings") TypeMappings aTextTypeMapppings, + @JsonProperty(value = "relationTypeMapppings") TypeMappings aRelationTypeMapppings, + @JsonProperty(value = "spans") List aTextAnnotations, + @JsonProperty(value = "relations") List aRelations, + @JsonProperty(value = "comments") List aComments) + { + textTypeMapppings = aTextTypeMapppings; + relationTypeMapppings = aRelationTypeMapppings; + + textAnnotations = aTextAnnotations != null ? aTextAnnotations.stream() + .collect(toMap(SpanMapping::getType, identity())) : emptyMap(); + relations = aRelations != null ? aRelations.stream() + .collect(toMap(RelationMapping::getType, identity())) : emptyMap(); + + comments = new ArrayListValuedHashMap<>(); + if (aComments != null) { + aComments.forEach(mapping -> comments.put(mapping.getType(), mapping)); + } + } + + public TypeMappings getTextTypeMapppings() + { + return textTypeMapppings; + } + + public TypeMappings getRelationTypeMapppings() + { + return relationTypeMapppings; + } + + public SpanMapping getSpanMapping(String aType) + { + return textAnnotations.get(aType); + } + + public RelationMapping getRelationMapping(String aType) + { + return relations.get(aType); + } + + public Collection getCommentMapping(String aType) + { + return comments.get(aType); + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/RelationMapping.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/RelationMapping.java new file mode 100644 index 0000000000..0f713e5375 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/RelationMapping.java @@ -0,0 +1,139 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.mapping; + +import java.util.Collections; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +public class RelationMapping +{ + public static final String FLAG_ANCHOR = "A"; + + private static final Pattern PATTERN = Pattern.compile( + "(?[a-zA-Z_][a-zA-Z0-9_\\-.]+):" + + "(?[a-zA-Z][a-zA-Z0-9]+)(?\\{A?\\})?:" + + "(?[a-zA-Z][a-zA-Z0-9]+)(?\\{A?\\})?" + + "(?:[:](?[a-zA-Z][a-zA-Z0-9]+))?"); + + private static final String TYPE = "TYPE"; + private static final String ARG1 = "ARG1"; + private static final String FLAGS1 = "FLAGS1"; + private static final String ARG2 = "ARG2"; + private static final String FLAGS2 = "FLAGS2"; + private static final String SUBCAT = "SUBCAT"; + + private final String type; + private final String arg1; + private final String flags1; + private final String arg2; + private final String flags2; + private final String subcat; + private final Map defaultFeatureValues; + + /** + * Jackson requires this constructor - even if it is private - do not use! + */ + @SuppressWarnings("unused") + private RelationMapping() + { + this(null, null, null, null, null, null); + } + + @JsonCreator + public RelationMapping( + @JsonProperty(value = "type", required = true) String aType, + @JsonProperty(value = "arg1", required = true) String aArg1, + @JsonProperty(value = "flags1") String aFlags1, + @JsonProperty(value = "arg2", required = true) String aArg2, + @JsonProperty(value = "flags2") String aFlags2, + @JsonProperty(value = "subCatFeature") String aSubCat, + @JsonProperty(value = "defaultFeatureValues") Map aDefaults) + { + type = aType; + arg1 = aArg1; + flags1 = aFlags1; + arg2 = aArg2; + flags2 = aFlags2; + subcat = aSubCat; + defaultFeatureValues = aDefaults != null ? aDefaults : Collections.emptyMap(); + } + + public RelationMapping( + @JsonProperty(value = "type", required = true) String aType, + @JsonProperty(value = "arg1", required = true) String aArg1, + @JsonProperty(value = "flags1") String aFlags1, + @JsonProperty(value = "arg2", required = true) String aArg2, + @JsonProperty(value = "flags2") String aFlags2, + @JsonProperty(value = "subCatFeature") String aSubCat) + { + this(aType, aArg1, aFlags1, aArg2, aFlags2, aSubCat, Collections.emptyMap()); + } + + + public String getType() + { + return type; + } + + public String getArg1() + { + return arg1; + } + + public String getFlags1() + { + return flags1 != null ? flags1 : ""; + } + + public String getArg2() + { + return arg2; + } + + public String getFlags2() + { + return flags2 != null ? flags2 : ""; + } + + public String getSubcat() + { + return subcat; + } + + public Map getDefaultFeatureValues() + { + return defaultFeatureValues; + } + + public static RelationMapping parse(String aValue) + { + Matcher m = PATTERN.matcher(aValue); + + if (!m.matches()) { + throw new IllegalArgumentException("Illegal relation parameter format [" + aValue + "]"); + } + + return new RelationMapping(m.group(TYPE), m.group(ARG1), m.group(FLAGS1), m.group(ARG2), + m.group(FLAGS2), m.group(SUBCAT)); + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/SpanMapping.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/SpanMapping.java new file mode 100644 index 0000000000..5d208ae1b5 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/SpanMapping.java @@ -0,0 +1,96 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.mapping; + +import java.util.Collections; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +public class SpanMapping +{ + public static final String FLAG_ANCHOR = "A"; + + private static final Pattern PATTERN = Pattern.compile( + "(?[a-zA-Z_][a-zA-Z0-9_\\-.]+)" + + "(?:[:](?[a-zA-Z][a-zA-Z0-9]+))?"); + + private static final String TYPE = "TYPE"; + private static final String SUBCAT = "SUBCAT"; + + private final String type; + private final String subcat; + private final Map defaultFeatureValues; + + /** + * Jackson requires this constructor - even if it is private - do not use! + */ + @SuppressWarnings("unused") + private SpanMapping() + { + this(null, null); + } + + @JsonCreator + public SpanMapping( + @JsonProperty(value = "type", required = true) String aType, + @JsonProperty(value = "subCatFeature") String aSubCat, + @JsonProperty(value = "defaultFeatureValues") Map aDefaults) + { + type = aType; + subcat = aSubCat; + defaultFeatureValues = aDefaults != null ? aDefaults : Collections.emptyMap(); + } + + public SpanMapping( + @JsonProperty(value = "type", required = true) String aType, + @JsonProperty(value = "subCatFeature") String aSubCat) + { + this(aType, aSubCat, Collections.emptyMap()); + } + + public String getType() + { + return type; + } + + public String getSubcat() + { + return subcat; + } + + public Map getDefaultFeatureValues() + { + return defaultFeatureValues; + } + + public static SpanMapping parse(String aValue) + { + Matcher m = PATTERN.matcher(aValue); + + if (!m.matches()) { + throw new IllegalArgumentException( + "Illegal text annotation parameter format [" + aValue + "]"); + } + + return new SpanMapping(m.group(TYPE), m.group(SUBCAT)); + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/TypeMapping.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/TypeMapping.java new file mode 100644 index 0000000000..13883e6596 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/TypeMapping.java @@ -0,0 +1,86 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.mapping; + +import java.util.Collections; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +public class TypeMapping +{ + private static final Pattern PATTERN = Pattern.compile( + "(?.+?)" + + "\\s*->\\s*" + + "(?.+?)"); + + private static final String BRAT = "BRAT"; + private static final String UIMA = "UIMA"; + + private final Pattern bratTypePattern; + private final String uimaType; + private final Map defaultFeatureValues; + + private Matcher matcher; + + @JsonCreator + public TypeMapping( + @JsonProperty(value = "from", required = true) String aPattern, + @JsonProperty(value = "to", required = true) String aReplacement, + @JsonProperty(value = "defaultFeatureValues") Map aDefaults) + { + bratTypePattern = Pattern.compile("^" + aPattern.trim() + "$"); + uimaType = aReplacement.trim(); + defaultFeatureValues = aDefaults != null ? aDefaults : Collections.emptyMap(); + } + + public TypeMapping(String aPattern, String aReplacement) + { + this(aPattern, aReplacement, Collections.emptyMap()); + } + + public boolean matches(String aType) + { + matcher = bratTypePattern.matcher(aType); + return matcher.matches(); + } + + public String apply() + { + return matcher.replaceFirst(uimaType); + } + + public Map getDefaultFeatureValues() + { + return defaultFeatureValues; + } + + public static TypeMapping parse(String aValue) + { + Matcher m = PATTERN.matcher(aValue); + + if (!m.matches()) { + throw new IllegalArgumentException("Illegal mapping parameter format [" + aValue + "]"); + } + + return new TypeMapping(m.group(BRAT), m.group(UIMA)); + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/TypeMappings.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/TypeMappings.java new file mode 100644 index 0000000000..d19ab4bff6 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/mapping/TypeMappings.java @@ -0,0 +1,128 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.mapping; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.dkpro.core.io.brat.internal.model.BratAnnotation; + +import com.fasterxml.jackson.annotation.JsonCreator; + +public class TypeMappings +{ + private final List parsedMappings; + private final Map brat2UimaMappingCache; + private final Map uima2BratMappingCache; + + @JsonCreator + public TypeMappings(List aMappings) + { + parsedMappings = aMappings; + brat2UimaMappingCache = new HashMap<>(); + uima2BratMappingCache = new HashMap<>(); + } + + public TypeMappings(String... aMappings) + { + parsedMappings = new ArrayList<>(); + + if (aMappings != null) { + for (String m : aMappings) { + parsedMappings.add(TypeMapping.parse(m)); + } + } + + brat2UimaMappingCache = new HashMap<>(); + uima2BratMappingCache = new HashMap<>(); + } + + private String apply(String aType) + { + String type = aType; + for (TypeMapping m : parsedMappings) { + if (m.matches(aType)) { + type = m.apply(); + break; + } + } + return type; + } + + public TypeMapping getMappingByBratType(String aBratType) + { + return parsedMappings.stream() + .filter(mapping -> mapping.matches(aBratType)) + .findFirst() + .orElse(null); + } + + public Type getUimaType(TypeSystem aTs, BratAnnotation aAnno) + { + Type t = brat2UimaMappingCache.get(aAnno.getType()); + + if (t == null) { + // brat doesn't like dots in name names, so we had replaced them with dashes. + // Now revert. + String type = apply(aAnno.getType().replace("-", ".")); + t = aTs.getType(type); + + // if the lookup didn't work with replacing the dashes, try without, e.g. because the + // brat name *really* contains dashes and we only resolve them through mapping + if (t == null) { + type = apply(aAnno.getType()); + t = aTs.getType(type); + } + + brat2UimaMappingCache.put(aAnno.getType(), t); + } + + if (t == null) { + throw new IllegalStateException("Unable to find appropriate UIMA type for brat type [" + + aAnno.getType() + "]"); + } + + return t; + } + + public String getBratType(Type aType) + { + String bratType = uima2BratMappingCache.get(aType.getName()); + + if (bratType == null) { + String uimaType = aType.getName(); + + for (TypeMapping m : parsedMappings) { + if (m.matches(aType.getName())) { + uimaType = m.apply(); + break; + } + } + + // brat doesn't like dots in name names, so we had replaced them with dashes. + bratType = uimaType.replace(".", "-"); + uima2BratMappingCache.put(uimaType, bratType); + } + + return bratType; + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotation.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotation.java similarity index 97% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotation.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotation.java index a2269ce006..11fd56edc8 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotation.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotation.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; import java.util.Collection; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotationDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotationDecl.java similarity index 95% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotationDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotationDecl.java index cd0ac8e0e8..054854b716 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotationDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotationDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.util.LinkedHashSet; import java.util.Set; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotationDocument.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotationDocument.java similarity index 97% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotationDocument.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotationDocument.java index f28b1143be..1ec320a6f9 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAnnotationDocument.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAnnotationDocument.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; import java.io.Reader; @@ -47,6 +47,9 @@ public static BratAnnotationDocument read(Reader aReader) while (lines.hasNext()) { String line = lines.next(); switch (line.charAt(0)) { + case '#': + doc.addAnnotation(BratNoteAnnotation.parse(line)); + break; case 'T': doc.addAnnotation(BratTextAnnotation.parse(line)); break; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttribute.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttribute.java similarity index 97% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttribute.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttribute.java index 5eb3179b9e..b42f235d82 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttribute.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttribute.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; import java.util.regex.Matcher; @@ -117,7 +117,7 @@ public static BratAttribute parse(String aLine) Matcher m = PATTERN.matcher(aLine); if (!m.matches()) { - throw new IllegalArgumentException("Illegal attribute format ["+aLine+"]"); + throw new IllegalArgumentException("Illegal attribute format [" + aLine + "]"); } String values = m.group(VALUES); diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttributeDecl.java similarity index 97% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttributeDecl.java index bb0e91732b..c1a29dda95 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttributeDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.util.Arrays; import java.util.Collection; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeDrawingDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttributeDrawingDecl.java similarity index 96% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeDrawingDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttributeDrawingDecl.java index 8e65b5f0db..593f3ea282 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeDrawingDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratAttributeDrawingDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratConfiguration.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratConfiguration.java similarity index 98% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratConfiguration.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratConfiguration.java index c5a8bd3b2d..520244c1eb 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratConfiguration.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratConfiguration.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; import java.io.Writer; @@ -85,7 +85,8 @@ public void addRelationDecl(String aSuperType, String aType, String aArg1Label, } private void write(Writer aWriter, int aDepth, BratAnnotationDecl aDecl, - Map aAll, Collection aRendered) + Map aAll, + Collection aRendered) throws IOException { // Avoid rendering the same declaration multiple times diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratConstants.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratConstants.java similarity index 93% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratConstants.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratConstants.java index cf60ba4c2b..32699f716d 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratConstants.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratConstants.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; public class BratConstants { diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratDrawingDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratDrawingDecl.java similarity index 95% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratDrawingDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratDrawingDecl.java index db3eec97f2..762a9ace18 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratDrawingDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratDrawingDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventAnnotation.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventAnnotation.java similarity index 96% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventAnnotation.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventAnnotation.java index 9b67826a6b..9c47949a75 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventAnnotation.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventAnnotation.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; import java.util.ArrayList; @@ -53,9 +53,9 @@ public class BratEventAnnotation private static final Pattern PATTERN = Pattern.compile( "(?E[0-9]+)[\\t]" + - "(?[a-zA-Z_][a-zA-Z0-9_-]+):" + + "(?[a-zA-Z0-9_][a-zA-Z0-9_-]+):" + "(?[ET][0-9]+)" + - "(?( [a-zA-Z_][a-zA-Z0-9_-]+:[ET][0-9]+)*)"); + "(?( [a-zA-Z_][a-zA-Z0-9_-]+:[ET][0-9]+)*)[ ]*"); private static final String ID = "ID"; private static final String TYPE = "TYPE"; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventAnnotationDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventAnnotationDecl.java similarity index 96% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventAnnotationDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventAnnotationDecl.java index 3878e4a5a3..17451ae72a 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventAnnotationDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventAnnotationDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.util.Arrays; import java.util.LinkedHashSet; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventArgument.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventArgument.java similarity index 96% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventArgument.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventArgument.java index 57932530f2..48900776d3 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventArgument.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventArgument.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; import java.util.regex.Matcher; @@ -88,7 +88,7 @@ public static BratEventArgument parse(String aLine) Matcher m = PATTERN.matcher(aLine); if (!m.matches()) { - throw new IllegalArgumentException("Illegal event argument format ["+aLine+"]"); + throw new IllegalArgumentException("Illegal event argument format [" + aLine + "]"); } int index = 0; diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventArgumentDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventArgumentDecl.java similarity index 97% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventArgumentDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventArgumentDecl.java index feac463d01..c77b333bbc 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratEventArgumentDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratEventArgumentDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; public class BratEventArgumentDecl { diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratLabelDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratLabelDecl.java similarity index 96% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratLabelDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratLabelDecl.java index f4bdcfa1fb..01d12eefde 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratLabelDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratLabelDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratNoteAnnotation.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratNoteAnnotation.java new file mode 100644 index 0000000000..8d3a093d14 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratNoteAnnotation.java @@ -0,0 +1,95 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.model; + +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.fasterxml.jackson.core.JsonGenerator; + +public class BratNoteAnnotation + extends BratAnnotation +{ + private static final Pattern PATTERN = Pattern.compile( + "(?#[0-9]+)\\t" + + "(?[a-zA-Z_][a-zA-Z0-9_\\-]+) " + + "(?[ETR][0-9]+)\\t" + + "(?.*)"); + + private static final String ID = "ID"; + private static final String TYPE = "TYPE"; + private static final String TARGET = "TARGET"; + private static final String NOTE = "NOTE"; + + private final String target; + private final String note; + + public BratNoteAnnotation(int aId, String aType, String aTarget, String aNote) + { + this("#" + aId, aType, aTarget, aNote); + } + + public BratNoteAnnotation(String aId, String aType, String aTarget, String aNote) + { + super(aId, aType); + target = aTarget; + note = aNote; + } + + public String getTarget() + { + return target; + } + + public String getNote() + { + return note; + } + + @Override + public void write(JsonGenerator aJG) + throws IOException + { + // Format: [${TARGET}, ${TYPE}, ${NOTE}] + // ['T1', 'AnnotatorNotes', 'Hurrah!'] + + aJG.writeStartArray(); + aJG.writeString(getType()); + aJG.writeString(target); + aJG.writeString(note); + aJG.writeEndArray(); + } + + @Override + public String toString() + { + return getId() + '\t' + getType() + ' ' + target + '\t' + note; + } + + public static BratNoteAnnotation parse(String aLine) + { + Matcher m = PATTERN.matcher(aLine); + + if (!m.matches()) { + throw new IllegalArgumentException("Illegal text annotation format [" + aLine + "]"); + } + + return new BratNoteAnnotation(m.group(ID), m.group(TYPE), m.group(TARGET), m.group(NOTE)); + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotation.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotation.java similarity index 92% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotation.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotation.java index 683ba955bf..1f602e67f4 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotation.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotation.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; import java.util.regex.Matcher; @@ -28,7 +28,7 @@ public class BratRelationAnnotation { private static final Pattern PATTERN = Pattern.compile( "(?R[0-9]+)[\\t]" + - "(?[a-zA-Z_][a-zA-Z0-9_-]+) " + + "(?[a-zA-Z0-9_][a-zA-Z0-9_-]+) " + "(?[a-zA-Z][a-zA-Z0-9]+):" + "(?[ET][0-9]+) " + "(?[a-zA-Z][a-zA-Z0-9]+):" + @@ -49,7 +49,7 @@ public class BratRelationAnnotation public BratRelationAnnotation(int aId, String aType, String aArg1Label, String aArg1Target, String aArg2Label, String aArg2Target) { - this("R"+aId, aType, aArg1Label, aArg1Target, aArg2Label, aArg2Target); + this("R" + aId, aType, aArg1Label, aArg1Target, aArg2Label, aArg2Target); } public BratRelationAnnotation(String aId, String aType, String aArg1Label, String aArg1Target, @@ -117,7 +117,8 @@ public static BratRelationAnnotation parse(String aLine) Matcher m = PATTERN.matcher(aLine); if (!m.matches()) { - throw new IllegalArgumentException("Illegal relation annotation format ["+aLine+"]"); + throw new IllegalArgumentException( + "Illegal relation annotation format [" + aLine + "]"); } return new BratRelationAnnotation(m.group(ID), m.group(TYPE), m.group(ARG1_LABEL), diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotationDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotationDecl.java similarity index 96% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotationDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotationDecl.java index 2edd8fc1c6..99de01dc8c 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotationDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotationDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; public class BratRelationAnnotationDecl extends BratAnnotationDecl diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotation.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotation.java new file mode 100644 index 0000000000..3825bb24db --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotation.java @@ -0,0 +1,155 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.model; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.fasterxml.jackson.core.JsonGenerator; + +public class BratTextAnnotation + extends BratAnnotation +{ + private static final Pattern PATTERN = Pattern.compile( + "(?T[0-9]+)\\t" + + "(?[a-zA-Z0-9_][a-zA-Z0-9_\\-]+) " + + "(?[0-9]+ [0-9]+(;[0-9]+ [0-9]+)*)\\t" + + "(?.*)"); + + private static final String ID = "ID"; + private static final String TYPE = "TYPE"; + private static final String OFFSETS = "OFFSETS"; + private static final String TEXT = "TEXT"; + + private final String[] texts; + + private final List offsets; + + public BratTextAnnotation(int aId, String aType, List aOffsets, String[] aTexts) + { + this("T" + aId, aType, aOffsets, aTexts); + } + + public BratTextAnnotation(String aId, String aType, List aOffsets, String[] aTexts) + { + super(aId, aType); + offsets = aOffsets; + texts = aTexts; + } + + private static String[] splitText(String aText, List aOffsets) + { + String[] result = new String[aOffsets.size()]; + String pieceOfText = aText; + for (int i = 0; i < aOffsets.size(); i++) { + int size = aOffsets.get(i).getEnd() - aOffsets.get(i).getBegin(); + result[i] = aText.substring(0, size); + pieceOfText = pieceOfText.substring(size); + } + return result; + } + + public List getOffsets() + { + return offsets; + } + + public String[] getText() + { + return texts; + } + + @Override + public void write(JsonGenerator aJG) throws IOException + { + // Format: [${ID}, ${TYPE}, [[${START}, ${END}]]] + // note that range of the offsets are [${START},${END}) + // ['T1', 'Person', [[0, 11]]] + + aJG.writeStartArray(); + aJG.writeString(getId()); + aJG.writeString(getType()); + aJG.writeStartArray(); + for (int i = 0; i < offsets.size(); i++) { + // handle discontinuous annotations + aJG.writeStartArray(); + aJG.writeNumber(offsets.get(i).getBegin()); + aJG.writeNumber(offsets.get(i).getEnd()); + aJG.writeEndArray(); + } + aJG.writeEndArray(); + aJG.writeEndArray(); + } + + @Override + public String toString() + { + return getId() + '\t' + getType() + ' ' + generateOffset(offsets) + '\t' + + String.join(" ", texts); + } + + private String generateOffset(List aOffsets) + { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < offsets.size(); i++) { + sb.append(String.format("%s %s", offsets.get(i).getBegin(), offsets.get(i).getEnd())); + if (i < offsets.size() - 1) { + sb.append(";"); + } + } + return sb.toString(); + } + + private static List generateOffsetsString(String aOffsetsStr) + { + String[] offsetsArray = aOffsetsStr.split(";"); + List offsetsList = new ArrayList<>(); + for (int i = 0; i < offsetsArray.length; i++) { + String[] beginEnd = offsetsArray[i].split(" "); + int effectiveBegin = Integer.parseInt(beginEnd[0]); + int effectiveEnd = Integer.parseInt(beginEnd[1]); + if (i > 0 && effectiveBegin <= (1 + offsetsList.get(offsetsList.size() - 1).getEnd())) { + // in case of adjacent or overlapping discontinuous annotations, merge the spans + // 1 2;3 4 -> 1 4 + offsetsList.get(offsetsList.size() - 1).setEnd(effectiveEnd); + } + else { + // in case of non-adjacent discontinuous annotation, create two offsets + // 1 2;4 5 -> 1 2 and 4 5 + offsetsList.add(new Offsets(effectiveBegin, effectiveEnd)); + } + } + return offsetsList; + } + + + public static BratTextAnnotation parse(String aLine) + { + Matcher m = PATTERN.matcher(aLine); + + if (!m.matches()) { + throw new IllegalArgumentException("Illegal text annotation format [" + aLine + "]"); + } + List offsetsLocal = generateOffsetsString(m.group(OFFSETS)); + String[] textsLocal = splitText(m.group(TEXT), offsetsLocal); + return new BratTextAnnotation(m.group(ID), m.group(TYPE), offsetsLocal, textsLocal); + } +} diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationDecl.java similarity index 93% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationDecl.java index b08df6b323..2ef26c834a 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; public class BratTextAnnotationDecl extends BratAnnotationDecl diff --git a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationDrawingDecl.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationDrawingDecl.java similarity index 96% rename from dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationDrawingDecl.java rename to dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationDrawingDecl.java index dc65e4b3c4..0562b9e192 100644 --- a/dkpro-core-io-brat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationDrawingDecl.java +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationDrawingDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import java.io.IOException; diff --git a/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/Offsets.java b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/Offsets.java new file mode 100644 index 0000000000..098b166020 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/main/java/org/dkpro/core/io/brat/internal/model/Offsets.java @@ -0,0 +1,50 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.model; + +public class Offsets +{ + private int begin; + private int end; + + public Offsets(int aBegin, int aEnd) + { + begin = aBegin; + end = aEnd; + } + + public int getBegin() + { + return begin; + } + + public int getEnd() + { + return end; + } + + public void setBegin(int aBegin) + { + begin = aBegin; + } + + public void setEnd(int aEnd) + { + end = aEnd; + } +} diff --git a/dkpro-core-io-brat-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/brat/html/template.html b/dkpro-core-io-brat-asl/src/main/resources/org/dkpro/core/io/brat/html/template.html similarity index 100% rename from dkpro-core-io-brat-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/brat/html/template.html rename to dkpro-core-io-brat-asl/src/main/resources/org/dkpro/core/io/brat/html/template.html diff --git a/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratReaderWriterTest.java b/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratReaderWriterTest.java deleted file mode 100644 index ac631e4d1c..0000000000 --- a/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/BratReaderWriterTest.java +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; -import static java.util.Arrays.asList; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; - -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2009Reader; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2012Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -//NOTE: This file contains Asciidoc markers for partial inclusion of this file in the documentation -//Do not remove these tags! -public class BratReaderWriterTest -{ - @Test - public void testConll2009() - throws Exception - { -// tag::testOneWay[] - testOneWay( - createReaderDescription(Conll2009Reader.class), // the reader - createEngineDescription(BratWriter.class, // the writer - BratWriter.PARAM_WRITE_RELATION_ATTRIBUTES, true), - "conll/2009/en-ref.ann", // the reference file for the output - "conll/2009/en-orig.conll"); // the input file for the test -// end::testOneWay[] - } - - @Test - public void testConll2009_2() - throws Exception - { - testRoundTrip( - createReaderDescription(BratReader.class), - createEngineDescription(BratWriter.class, - BratWriter.PARAM_WRITE_RELATION_ATTRIBUTES, true), - "conll/2009/en-ref.ann"); - } - - @Test - public void testConll2012Html() - throws Exception - { - testOneWay( - createReaderDescription(Conll2012Reader.class, - Conll2012Reader.PARAM_USE_HEADER_METADATA, false), - createEngineDescription(BratWriter.class, - BratWriter.PARAM_FILENAME_EXTENSION, ".html"), - "conll/2012/en-ref.html", - "conll/2012/en-orig.conll"); - } - - @Test - public void testConll2012Json() - throws Exception - { - testOneWay( - createReaderDescription(Conll2012Reader.class, - Conll2012Reader.PARAM_USE_HEADER_METADATA, false), - createEngineDescription(BratWriter.class, - BratWriter.PARAM_FILENAME_EXTENSION, ".json"), - "conll/2012/en-ref.json", - "conll/2012/en-orig.conll"); - } - - @Test - public void testConll2012() - throws Exception - { - testOneWay( - createReaderDescription(Conll2012Reader.class, - Conll2012Reader.PARAM_USE_HEADER_METADATA, false), - createEngineDescription(BratWriter.class), - "conll/2012/en-ref.ann", - "conll/2012/en-orig.conll"); - } - - @Ignore("Test largely ok but due to same spans for constituents not stable, thus ignoring") - @Test - public void testConll2012_2() - throws Exception - { - testRoundTrip( - createReaderDescription(BratReader.class), - createEngineDescription(BratWriter.class), - "conll/2012/en-ref.ann"); - } - - @Test - public void testConll2012_3() - throws Exception - { - testOneWay( - createReaderDescription(Conll2012Reader.class, - Conll2012Reader.PARAM_READ_LEMMA, false, - Conll2012Reader.PARAM_READ_NAMED_ENTITY, false, - Conll2012Reader.PARAM_READ_SEMANTIC_PREDICATE, false, - Conll2012Reader.PARAM_READ_COREFERENCE, false, - Conll2012Reader.PARAM_USE_HEADER_METADATA, false), - createEngineDescription(BratWriter.class), - "conll/2012/en-ref-min.ann", - "conll/2012/en-orig.conll"); - } - - @Test - public void testWithShortNames() - throws Exception - { - testRoundTrip( - createReaderDescription(BratReader.class, - BratReader.PARAM_TYPE_MAPPINGS, asList( - "Token -> de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization", - "Location -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location")), - createEngineDescription(BratWriter.class, - BratWriter.PARAM_ENABLE_TYPE_MAPPINGS, true), - "brat/document0a.ann"); - } - - @Test - public void testWithLongNames() - throws Exception - { - testRoundTrip( - createReaderDescription(BratReader.class), - createEngineDescription(BratWriter.class, - BratWriter.PARAM_ENABLE_TYPE_MAPPINGS, false), - "brat/document0b.ann"); - } - - @Test - public void test1() - throws Exception - { - testOneWay( - createReaderDescription(BratReader.class, - BratReader.PARAM_TYPE_MAPPINGS, asList( - "Origin -> de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation", - "Country -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location", - "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization", - "MERGE-ORG -> de.tudarmstadt.ukp.dkpro.core.io.brat.type.MergeOrg"), - BratReader.PARAM_RELATION_TYPES, asList( - "de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation:source:target{A}:value")), - createEngineDescription(BratWriter.class, - BratWriter.PARAM_RELATION_TYPES, asList( - "de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation:source:target{A}:value")), - "brat/document1-ref.ann", - "brat/document1.ann"); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationTest.java b/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationTest.java deleted file mode 100644 index 49b9580b8f..0000000000 --- a/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratTextAnnotationTest.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; - -import static org.junit.Assert.assertEquals; - -import org.junit.Test; - -public class BratTextAnnotationTest -{ - @Test - public void parseTest() - { - final String in = "T1\tOrganization 0 43\tInternational Business Machines Corporation"; - BratTextAnnotation v = BratTextAnnotation.parse(in); - assertEquals(in, v.toString()); - } - - @Test - public void parseTestZeroLength() - { - final String in = "T1\tOrganization 0 0\t"; - BratTextAnnotation v = BratTextAnnotation.parse(in); - assertEquals(in, v.toString()); - } -} diff --git a/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/BratReaderWriterTest.java b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/BratReaderWriterTest.java new file mode 100644 index 0000000000..f1d669b599 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/BratReaderWriterTest.java @@ -0,0 +1,334 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.assertj.core.api.Assertions.contentOf; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; + +import java.io.File; + +import org.dkpro.core.io.conll.Conll2009Reader; +import org.dkpro.core.io.conll.Conll2012Reader; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.ReaderAssert; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +public class BratReaderWriterTest +{ + @Test + public void testConll2009() + throws Exception + { + testOneWay( + createReaderDescription(Conll2009Reader.class), // the reader + createEngineDescription(BratWriter.class, // the writer + BratWriter.PARAM_WRITE_RELATION_ATTRIBUTES, true), + "conll/2009/en-ref.ann", // the reference file for the output + "conll/2009/en-orig.conll"); // the input file for the test + } + + @Test + public void testConll2009_2() + throws Exception + { + testRoundTrip( + createReaderDescription(BratReader.class), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_WRITE_RELATION_ATTRIBUTES, true), + "conll/2009/en-ref.ann"); + } + + @Test + public void testConll2012Html() + throws Exception + { + testOneWay( + createReaderDescription(Conll2012Reader.class, + Conll2012Reader.PARAM_USE_HEADER_METADATA, false), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_FILENAME_EXTENSION, ".html"), + "conll/2012/en-ref.html", + "conll/2012/en-orig.conll"); + } + + @Test + public void testConll2012Json() + throws Exception + { + testOneWay( + createReaderDescription(Conll2012Reader.class, + Conll2012Reader.PARAM_USE_HEADER_METADATA, false), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_FILENAME_EXTENSION, ".json"), + "conll/2012/en-ref.json", + "conll/2012/en-orig.conll"); + } + + @Test + public void testConll2012() + throws Exception + { + testOneWay( + createReaderDescription(Conll2012Reader.class, + Conll2012Reader.PARAM_USE_HEADER_METADATA, false), + createEngineDescription(BratWriter.class), + "conll/2012/en-ref.ann", + "conll/2012/en-orig.conll"); + } + + @Ignore("Test largely ok but due to same spans for constituents not stable, thus ignoring") + @Test + public void testConll2012_2() + throws Exception + { + testRoundTrip( + createReaderDescription(BratReader.class), + createEngineDescription(BratWriter.class), + "conll/2012/en-ref.ann"); + } + + @Test + public void testConll2012_3() + throws Exception + { + testOneWay( + createReaderDescription(Conll2012Reader.class, + Conll2012Reader.PARAM_READ_LEMMA, false, + Conll2012Reader.PARAM_READ_NAMED_ENTITY, false, + Conll2012Reader.PARAM_READ_SEMANTIC_PREDICATE, false, + Conll2012Reader.PARAM_READ_COREFERENCE, false, + Conll2012Reader.PARAM_USE_HEADER_METADATA, false), + createEngineDescription(BratWriter.class), + "conll/2012/en-ref-min.ann", + "conll/2012/en-orig.conll"); + } + + @Test + public void testWithShortNames() + throws Exception + { + testRoundTrip( + createReaderDescription(BratReader.class, + BratReader.PARAM_TEXT_ANNOTATION_TYPE_MAPPINGS, asList( + "Token -> de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization", + "Location -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location")), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_ENABLE_TYPE_MAPPINGS, true), + "brat/document0a.ann"); + } + + @Test + public void testWithLongNames() + throws Exception + { + testRoundTrip( + createReaderDescription(BratReader.class), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_ENABLE_TYPE_MAPPINGS, false), + "brat/document0b.ann"); + } + + @Test + public void test1legacy() + throws Exception + { + testOneWay( + createReaderDescription(BratReader.class, + BratReader.PARAM_TEXT_ANNOTATION_TYPE_MAPPINGS, asList( + "Country -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location", + "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization", + "MERGE-ORG -> de.tudarmstadt.ukp.dkpro.core.io.brat.type.MergeOrg"), + BratReader.PARAM_RELATION_TYPE_MAPPINGS, asList( + "Origin -> de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation"), + BratReader.PARAM_RELATION_TYPES, asList( + "de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation:source:target{A}:value"), + BratReader.PARAM_NOTE_MAPPINGS, asList( + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization:value", + "de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation:comment", + "de.tudarmstadt.ukp.dkpro.core.io.brat.type.MergeOrg:comment")), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_RELATION_TYPES, asList( + "de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation:source:target{A}:value")), + "brat/document1-ref.ann", + "brat/document1.ann"); + } + + @Test + public void test1mapping() + throws Exception + { + String mapping = String.join("\n", + "{", + " 'textTypeMapppings': [", + " {", + " 'from': 'Country',", + " 'to': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location'", + " },", + " {", + " 'from': 'Organization',", + " 'to': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization'", + " },", + " {", + " 'from': 'MERGE-ORG',", + " 'to': 'de.tudarmstadt.ukp.dkpro.core.io.brat.type.MergeOrg'", + " }", + " ],", + " 'relationTypeMapppings': [", + " {", + " 'from': 'Origin',", + " 'to': 'de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation'", + " }", + " ],", + " 'spans': [", + " {", + " 'type': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location',", + " 'defaultFeatureValues': {", + " 'value': 'LOC'", + " }", + " }", + " ],", + " 'relations': [", + " {", + " 'type': 'de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation',", + " 'arg1': 'source',", + " 'arg2': 'target',", + " 'flags2': 'A',", + " 'subCatFeature': 'value'", + " }", + " ],", + " 'comments': [", + " {", + " 'type': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization',", + " 'feature': 'value'", + " },", + " {", + " 'type': 'de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation',", + " 'feature': 'comment'", + " },", + " {", + " 'type': 'de.tudarmstadt.ukp.dkpro.core.io.brat.type.MergeOrg',", + " 'feature': 'comment'", + " }", + " ]", + "}"); + + testOneWay( + createReaderDescription(BratReader.class, + BratReader.PARAM_MAPPING, mapping), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_RELATION_TYPES, asList( + "de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation:source:target{A}:value")), + "brat/document1-ref-mapping.ann", + "brat/document1.ann"); + } + + @Test + public void testTextAnnotationWithSubcategorization() + throws Exception + { + testOneWay( + createReaderDescription(BratReader.class, + BratReader.PARAM_TEXT_ANNOTATION_TYPES, + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity:value", + BratReader.PARAM_TEXT_ANNOTATION_TYPE_MAPPINGS, asList( + "Country -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + "MERGE-ORG -> de.tudarmstadt.ukp.dkpro.core.io.brat.type.MergeOrg"), + BratReader.PARAM_RELATION_TYPE_MAPPINGS, asList( + "Origin -> de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation"), + BratReader.PARAM_RELATION_TYPES, asList( + "de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation:source:target{A}:value")), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_RELATION_TYPES, asList( + "de.tudarmstadt.ukp.dkpro.core.io.brat.type.AnnotationRelation:source:target{A}:value")), + "brat/document1-ref-sub.ann", + "brat/document1.ann"); + } + + @Test + public void testBratWithDiscontinuousFragmentNear() + throws Exception + { + ReaderAssert.assertThat(BratReader.class, + BratReader.PARAM_TEXT_ANNOTATION_TYPE_MAPPINGS, + asList("Token -> de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization", + "Location -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location")) + .readingFrom("src/test/resources/brat/document0c.ann") + .usingWriter(BratWriter.class, + BratWriter.PARAM_ENABLE_TYPE_MAPPINGS, true) + .outputAsString("document0c.ann") + .isEqualToNormalizingNewlines( + contentOf(new File("src/test/resources/brat/document0c.ann"), UTF_8)); + } + + @Test + public void testBratWithDiscontinuousFragmentFar() + throws Exception + { + testOneWay(createReaderDescription(BratReader.class, + BratReader.PARAM_TEXT_ANNOTATION_TYPE_MAPPINGS, + asList("Token -> de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization", + "Location -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location")), + createEngineDescription(BratWriter.class, BratWriter.PARAM_ENABLE_TYPE_MAPPINGS, + true), + "brat/document0d-ref.ann", + "brat/document0d.ann"); + } + + @Test + public void testBratEventWithoutRoleLabel() + throws Exception + { + String mapping = String.join("\n", + "{", + " 'textTypeMapppings': [", + " {", + " 'from': 'Quote',", + " 'to': 'de.tudarmstadt.ukp.dkpro.core.io.brat.type.Quote'", + " },", + " {", + " 'from': 'Speaker',", + " 'to': 'de.tudarmstadt.ukp.dkpro.core.io.brat.type.Speaker'", + " }", + " ]", + "}"); + + testOneWay( + createReaderDescription(BratReader.class, + BratReader.PARAM_MAPPING, mapping), + createEngineDescription(BratWriter.class, + BratWriter.PARAM_WRITE_RELATION_ATTRIBUTES, true, + BratWriter.PARAM_ENABLE_TYPE_MAPPINGS, true, + BratWriter.PARAM_TYPE_MAPPINGS, ".*\\.type\\.(\\w+) -> $1"), + "brat/event-ref.ann", + "brat/event.ann"); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/mapping/MappingParamTest.java b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/mapping/MappingParamTest.java new file mode 100644 index 0000000000..170bcd4842 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/mapping/MappingParamTest.java @@ -0,0 +1,36 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.mapping; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.dkpro.core.io.brat.internal.mapping.TypeMapping; +import org.junit.Test; + +public class MappingParamTest +{ + @Test + public void testParsing() + { + TypeMapping param = TypeMapping + .parse("Country -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"); + + assertThat(param.matches("Country")).isTrue(); + assertThat(param.apply()).isEqualTo("de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"); + } +} diff --git a/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/mapping/MappingTest.java b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/mapping/MappingTest.java new file mode 100644 index 0000000000..66adcad786 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/mapping/MappingTest.java @@ -0,0 +1,95 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.mapping; + +import java.io.IOException; + +import org.junit.Test; + +import com.fasterxml.jackson.annotation.JsonSetter; +import com.fasterxml.jackson.annotation.Nulls; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class MappingTest +{ + @Test + public void testParse() throws Exception + { + String json = String.join("\n", + "{", + " 'textTypeMapppings': [", + " {", + " 'from': 'NamedEntity',", + " 'to': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity',", + " 'defaultFeatureValues': {", + " 'identity': 'none'", + " }", + " }", + " ],", + " 'relationTypeMapppings': [", + " {", + " 'from': 'Dependency',", + " 'to': 'de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency',", + " 'defaultFeatureValues': {", + " 'flavour': 'basic'", + " }", + " }", + " ],", + " 'spans': [", + " {", + " 'type': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity',", + " 'subCatFeature': 'value',", + " 'defaultFeatureValues': {", + " 'identity': 'none'", + " }", + " }", + " ],", + " 'relations': [", + " {", + " 'type': 'de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency',", + " 'arg1': 'source',", + " 'arg2': 'target',", + " 'flags2': 'A',", + " 'subCatFeature': 'DependencyType',", + " 'defaultFeatureValues': {", + " 'flavour': 'basic'", + " }", + " }", + " ],", + " 'comments': [", + " {", + " 'type': 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity',", + " 'feature': 'identifier'", + " }", + " ]", + "}"); + + Mapping mapping = parse(json); + } + + private Mapping parse(String aJson) throws JsonParseException, JsonMappingException, IOException + { + ObjectMapper mapper = new ObjectMapper(); + mapper.setDefaultSetterInfo(JsonSetter.Value.forContentNulls(Nulls.AS_EMPTY)); + mapper.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true); + return mapper.readValue(aJson, Mapping.class); + } +} diff --git a/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeTest.java b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratAttributeTest.java similarity index 92% rename from dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeTest.java rename to dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratAttributeTest.java index 321680bf94..eee50ab713 100644 --- a/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratAttributeTest.java +++ b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratAttributeTest.java @@ -15,10 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import static org.junit.Assert.assertEquals; +import org.dkpro.core.io.brat.internal.model.BratAttribute; import org.junit.Test; public class BratAttributeTest diff --git a/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotationTest.java b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotationTest.java similarity index 89% rename from dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotationTest.java rename to dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotationTest.java index 4d6922a9a3..1d48ec27c4 100644 --- a/dkpro-core-io-brat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/brat/internal/model/BratRelationAnnotationTest.java +++ b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratRelationAnnotationTest.java @@ -15,10 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model; +package org.dkpro.core.io.brat.internal.model; import static org.junit.Assert.assertEquals; +import org.dkpro.core.io.brat.internal.model.BratRelationAnnotation; import org.junit.Test; public class BratRelationAnnotationTest diff --git a/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationTest.java b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationTest.java new file mode 100644 index 0000000000..a3ccd05fd6 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/java/org/dkpro/core/io/brat/internal/model/BratTextAnnotationTest.java @@ -0,0 +1,52 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.brat.internal.model; + +import static org.junit.Assert.assertEquals; + +import org.dkpro.core.io.brat.internal.model.BratTextAnnotation; +import org.junit.Test; + +public class BratTextAnnotationTest +{ + @Test + public void parseTest() + { + final String in = "T1\tOrganization 0 43\tInternational Business Machines Corporation"; + BratTextAnnotation v = BratTextAnnotation.parse(in); + assertEquals(in, v.toString()); + } + + @Test + public void parseTestZeroLength() + { + final String in = "T1\tOrganization 0 0\t"; + BratTextAnnotation v = BratTextAnnotation.parse(in); + assertEquals(in, v.toString()); + } + + @Test + public void parseTestDiscontinousMergeFragments() + { + final String in = "T1\tOrganization 0 13;14 43\tInternational Business Machines Corporation"; + final String out = "T1\tOrganization 0 43\tInternational Business Machines Corporation"; + BratTextAnnotation v = BratTextAnnotation.parse(in); + assertEquals(out, v.toString()); + } + +} diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document0c.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/document0c.ann new file mode 100644 index 0000000000..eaaf0e879a --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document0c.ann @@ -0,0 +1,8 @@ +T1 Token 0 4 This +T2 Token 5 7;8 9 is a +T3 Token 10 14 test +T4 Token 14 15 . +E1 Token:T1 +E2 Token:T2 +E3 Token:T3 +E4 Token:T4 diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document0c.txt b/dkpro-core-io-brat-asl/src/test/resources/brat/document0c.txt new file mode 100644 index 0000000000..eafe8d0650 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document0c.txt @@ -0,0 +1,2 @@ +This is +a test. diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document0d-ref.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/document0d-ref.ann new file mode 100644 index 0000000000..197d6a1624 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document0d-ref.ann @@ -0,0 +1,10 @@ +T1 Token 0 4 This +T2 Token 5 7 is +T3 Token 9 10 n +T4 Token 11 21 other test +T5 Token 21 22 . +E1 Token:T1 +E2 Token:T2 +E3 Token:T3 +E4 Token:T4 +E5 Token:T5 diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document0d.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/document0d.ann new file mode 100644 index 0000000000..ca640f2e4c --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document0d.ann @@ -0,0 +1,8 @@ +T1 Token 0 4 This +T2 Token 5 7;9 10 is n +T3 Token 11 21 other test +T4 Token 21 22 . +E1 Token:T1 +E2 Token:T2 +E3 Token:T3 +E4 Token:T4 diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document0d.txt b/dkpro-core-io-brat-asl/src/test/resources/brat/document0d.txt new file mode 100644 index 0000000000..b09b2313c4 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document0d.txt @@ -0,0 +1,2 @@ +This is +an other test. diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref-mapping.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref-mapping.ann new file mode 100644 index 0000000000..93f2204d58 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref-mapping.ann @@ -0,0 +1,9 @@ +T1 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Organization 0 4 Sony +A1 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Organization_value T1 This is a named entity. +T2 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Organization 33 41 Ericsson +T3 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Location 75 81 Sweden +A2 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Location_value T3 LOC +T4 de-tudarmstadt-ukp-dkpro-core-io-brat-type-MergeOrg 14 27 joint venture +E1 de-tudarmstadt-ukp-dkpro-core-io-brat-type-MergeOrg:T4 Org:T1 Org1:T2 +A3 de-tudarmstadt-ukp-dkpro-core-io-brat-type-MergeOrg_comment E1 This is an event. +R1 de-tudarmstadt-ukp-dkpro-core-io-brat-type-AnnotationRelation source:T2 target:T3 \ No newline at end of file diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref-sub.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref-sub.ann new file mode 100644 index 0000000000..da02fde922 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref-sub.ann @@ -0,0 +1,9 @@ +T1 de-tudarmstadt-ukp-dkpro-core-api-ner-type-NamedEntity 0 4 Sony +A1 de-tudarmstadt-ukp-dkpro-core-api-ner-type-NamedEntity_value T1 Organization +T2 de-tudarmstadt-ukp-dkpro-core-api-ner-type-NamedEntity 33 41 Ericsson +A2 de-tudarmstadt-ukp-dkpro-core-api-ner-type-NamedEntity_value T2 Organization +T3 de-tudarmstadt-ukp-dkpro-core-api-ner-type-NamedEntity 75 81 Sweden +A3 de-tudarmstadt-ukp-dkpro-core-api-ner-type-NamedEntity_value T3 Country +T4 de-tudarmstadt-ukp-dkpro-core-io-brat-type-MergeOrg 14 27 joint venture +E1 de-tudarmstadt-ukp-dkpro-core-io-brat-type-MergeOrg:T4 Org:T1 Org1:T2 +R1 de-tudarmstadt-ukp-dkpro-core-io-brat-type-AnnotationRelation source:T2 target:T3 \ No newline at end of file diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref.ann index d82124842b..10a3a85dcc 100644 --- a/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref.ann +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document1-ref.ann @@ -1,6 +1,8 @@ T1 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Organization 0 4 Sony +A1 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Organization_value T1 This is a named entity. T2 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Organization 33 41 Ericsson T3 de-tudarmstadt-ukp-dkpro-core-api-ner-type-Location 75 81 Sweden T4 de-tudarmstadt-ukp-dkpro-core-io-brat-type-MergeOrg 14 27 joint venture E1 de-tudarmstadt-ukp-dkpro-core-io-brat-type-MergeOrg:T4 Org:T1 Org1:T2 +A2 de-tudarmstadt-ukp-dkpro-core-io-brat-type-MergeOrg_comment E1 This is an event. R1 de-tudarmstadt-ukp-dkpro-core-io-brat-type-AnnotationRelation source:T2 target:T3 \ No newline at end of file diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/document1.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/document1.ann index 2abbe6b849..6d7c20b214 100644 --- a/dkpro-core-io-brat-asl/src/test/resources/brat/document1.ann +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/document1.ann @@ -4,3 +4,6 @@ T3 Organization 33 41 Ericsson E1 MERGE-ORG:T2 Org1:T1 Org2:T3 T4 Country 75 81 Sweden R1 Origin Arg1:T3 Arg2:T4 +#1 AnnotatorNotes T1 This is a named entity. +#2 AnnotatorNotes E1 This is an event. +#3 AnnotatorNotes R1 This is a relation. diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/event-ref.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/event-ref.ann new file mode 100644 index 0000000000..b7c7db743d --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/event-ref.ann @@ -0,0 +1,3 @@ +T1 Quote 0 13 "I am hungry" +T2 Speaker 19 23 John +E1 Speaker:T2 Speak:T1 \ No newline at end of file diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/event.ann b/dkpro-core-io-brat-asl/src/test/resources/brat/event.ann new file mode 100644 index 0000000000..df34dbdd32 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/event.ann @@ -0,0 +1,3 @@ +T1 Speaker 19 23 John +T2 Quote 0 13 "I am hungry" +E1 Speaker:T1 Speak:T2 diff --git a/dkpro-core-io-brat-asl/src/test/resources/brat/event.txt b/dkpro-core-io-brat-asl/src/test/resources/brat/event.txt new file mode 100644 index 0000000000..669f5f5848 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/brat/event.txt @@ -0,0 +1 @@ +"I am hungry" said John. diff --git a/dkpro-core-io-brat-asl/src/test/resources/desc/type/BratTest.xml b/dkpro-core-io-brat-asl/src/test/resources/desc/type/BratTest.xml index 48df9d0013..5bd52422d0 100644 --- a/dkpro-core-io-brat-asl/src/test/resources/desc/type/BratTest.xml +++ b/dkpro-core-io-brat-asl/src/test/resources/desc/type/BratTest.xml @@ -25,6 +25,16 @@ uima.cas.String + + comment + + uima.cas.String + + + flavour + + uima.cas.String + @@ -38,7 +48,47 @@ uima.cas.FSArray uima.tcas.Annotation - + + comment + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.io.brat.type.Quote + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.io.brat.type.Speaker + + uima.tcas.Annotation + + + Speak + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.io.brat.type.SpeakLink + + + + + de.tudarmstadt.ukp.dkpro.core.io.brat.type.SpeakLink + + uima.cas.TOP + + + role + + uima.cas.String + + + target + + uima.tcas.Annotation + + diff --git a/dkpro-core-io-brat-asl/src/test/resources/log4j.properties b/dkpro-core-io-brat-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-brat-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-brat-asl/src/test/resources/log4j2.xml b/dkpro-core-io-brat-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-brat-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-cermine-gpl/pom.xml b/dkpro-core-io-cermine-gpl/pom.xml index c7da070786..d477d7fffb 100644 --- a/dkpro-core-io-cermine-gpl/pom.xml +++ b/dkpro-core-io-cermine-gpl/pom.xml @@ -1,6 +1,6 @@ - - de.tudarmstadt.ukp.dkpro.core-gpl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-gpl + 2.3.0-SNAPSHOT ../dkpro-core-gpl 4.0.0 - org.dkpro.core dkpro-core-io-cermine-gpl jar - DKPro Core GPL - IO - CERMINE - + DKPro Core GPL - IO - CERMINE (v${cermine.version}) (AGPL) + https://dkpro.github.io/dkpro-core/ + + + GNU Affero General Public License v3.0 or later + https://www.gnu.org/licenses/agpl.txt + repo + + + + 1.13 + pl.edu.icm.cermine cermine-impl - 1.13 + ${cermine.version} @@ -53,20 +63,24 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl + + + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + eu.openminted.share.annotations + omtd-share-annotations-api @@ -83,11 +97,22 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test + + + + + com.google.guava + guava + + 25.1-jre + + + @@ -102,7 +127,7 @@ - do require it as a compile dependency and also at runtime, so we - cannot set it to scope provided. Need to tell Maven to ignore it here. --> - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core:dkpro-core-api-parameter-asl diff --git a/dkpro-core-io-cermine-gpl/src/main/java/org/dkpro/core/io/cermine/CerminePdfReader.java b/dkpro-core-io-cermine-gpl/src/main/java/org/dkpro/core/io/cermine/CerminePdfReader.java index 6b5a2fb3f9..795d2682b5 100644 --- a/dkpro-core-io-cermine-gpl/src/main/java/org/dkpro/core/io/cermine/CerminePdfReader.java +++ b/dkpro-core-io-cermine-gpl/src/main/java/org/dkpro/core/io/cermine/CerminePdfReader.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -18,11 +18,9 @@ */ package org.dkpro.core.io.cermine; -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import java.io.IOException; +import java.io.InputStream; + import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; @@ -33,22 +31,25 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.MimeTypes; import org.jdom.Element; import org.jdom.Text; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import eu.openminted.share.annotations.api.DocumentationResource; import pl.edu.icm.cermine.ContentExtractor; import pl.edu.icm.cermine.exception.AnalysisException; -import java.io.IOException; -import java.io.InputStream; - -import static org.apache.commons.io.IOUtils.closeQuietly; - /** * Collection reader for PDF files using CERMINE * https://github.com/CeON/CERMINE. */ @ResourceMetaData(name = "CERMINE PDF Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({ MimeTypes.APPLICATION_PDF }) @TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading", @@ -123,17 +124,15 @@ public void getNext(CAS aCAS) throws IOException, CollectionException Resource res = nextFile(); initCas(aCAS, res); - InputStream is = null; - - try { - is = res.getInputStream(); - + try (InputStream is = res.getInputStream()) { // Process PDF ContentExtractor extractor = new ContentExtractor(); extractor.setPDF(is); Element result = extractor.getContentAsNLM(); nlmHandler.process(result, aCAS); + // FIXME Setting the language below should not be needed- initCas() should already + // be taking care of this. Double-check and remove if not necessary. // Set up language if (getConfigParameterValue(PARAM_LANGUAGE) != null) { aCAS.setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE)); @@ -142,9 +141,6 @@ public void getNext(CAS aCAS) throws IOException, CollectionException catch (AnalysisException e) { throw new IOException("An exception occurred while processing the PDF document.", e); } - finally { - closeQuietly(is); - } } /** @@ -367,7 +363,12 @@ private void makeAnnotation(String annotationType) { if (beginIndex < sb.length()) { Type t = cas.getTypeSystem().getType(annotationType); - AnnotationFS a = cas.createAnnotation(t, beginIndex, sb.length()); + + // Trim leading/trailing whitespace + int[] offsets = {beginIndex, sb.length()}; + TrimUtils.trim(sb, offsets); + + AnnotationFS a = cas.createAnnotation(t, offsets[0], offsets[1]); cas.addFsToIndexes(a); updateCursor(); } @@ -398,10 +399,12 @@ private void parseBack(Element root) protected String normalizeString(String input) { - if (normalizeText) + if (normalizeText) { return input.replaceAll("\\s+", " "); - else + } + else { return input; + } } } } diff --git a/dkpro-core-io-cermine-gpl/src/test/java/org/dkpro/core/io/cermine/CerminePdfReaderTest.java b/dkpro-core-io-cermine-gpl/src/test/java/org/dkpro/core/io/cermine/CerminePdfReaderTest.java index 79647d3b19..e027c33428 100644 --- a/dkpro-core-io-cermine-gpl/src/test/java/org/dkpro/core/io/cermine/CerminePdfReaderTest.java +++ b/dkpro-core-io-cermine-gpl/src/test/java/org/dkpro/core/io/cermine/CerminePdfReaderTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -33,12 +33,12 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.dumper.CasDumpWriter; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.dumper.CasDumpWriter; public class CerminePdfReaderTest @@ -114,8 +114,8 @@ public void testIgnoreCitations() throws Exception CollectionReader reader = createReader(CerminePdfReader.class, CerminePdfReader.PARAM_SOURCE_LOCATION, "src/test/resources/data", CerminePdfReader.PARAM_PATTERNS, "[+]**/*.pdf", - CerminePdfReader.PARAM_NORMALIZE_TEXT, true, CerminePdfReader.PARAM_IGNORE_CITATIONS, - true); + CerminePdfReader.PARAM_NORMALIZE_TEXT, true, + CerminePdfReader.PARAM_IGNORE_CITATIONS, true); AnalysisEngine writer = createEngine(CasDumpWriter.class, CasDumpWriter.PARAM_TARGET_LOCATION, outputFile); @@ -153,4 +153,4 @@ public void testIgnoreReferencesSection() throws Exception @Rule public DkproTestContext testContext = new DkproTestContext(); -} \ No newline at end of file +} diff --git a/dkpro-core-io-cermine-gpl/src/test/resources/log4j.properties b/dkpro-core-io-cermine-gpl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-cermine-gpl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-cermine-gpl/src/test/resources/log4j2.xml b/dkpro-core-io-cermine-gpl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-cermine-gpl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized-nocitations-noRefSection.dump b/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized-nocitations-noRefSection.dump index 6188b12776..8db87cd3b2 100644 --- a/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized-nocitations-noRefSection.dump +++ b/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized-nocitations-noRefSection.dump @@ -13,430 +13,430 @@ DocumentMetaData CAS-Text: Out-of-domain FrameNet Semantic Role Labeling Silvana Hartmann Ilia Kuznetsov Teresa Martin Iryna Gurevych Research Training Group AIPHES Ubiquitous Knowledge Processing (UKP) Lab 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1 - Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications. The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015). In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL 1www.ukp.tu-darmstadt.de/ood-fn-srl on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing. Based on FrameNet , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates . Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains. In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain , despite using a simpler model, and improves on the out-of-domain performance of Semafor. The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2 Related work The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification. Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009). Representation learning has been successfully used to improve on the CoNLL shared task results . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks. Domain dependence of FrameNet SRL The FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains . However, the standard data split used to evaluate modern systems ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data. There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations. Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used. Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data. Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor . The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts. Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec and GloVe ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3 Out-of-domain FrameNet test data This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set. There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av). Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality. YAGS: a new FrameNet test set based on user generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/). YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset. Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html). Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup. YAGS statistics and properties Table 1 presents dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS. The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets. In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4 Domain generalization capabilities of open-source FrameNet SRL To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations. Evaluation script The Semafor evaluation script provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames. The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles. Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets. When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2 This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems. Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below. 2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data. To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets. The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets. To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av. In the next section, we study new methods to improve out-of-domain frame identification. 5 Frame identification with distributed word representations Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance. Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training. From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage. Lexicon-based filtering In the testing stage, the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ). Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly. Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense. 3In our implementation, we use the LightFM package with the WARP option for hybrid matrix factorization. 4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon. 5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/ P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW. Out-of-domain performance We also investi gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs. Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW. The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence. The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable. A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].” This problem is also relevant in generic WSD and benefits from the same solutions, for instance adapting embeddings to a particular domain and efficient use of embeddings . Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015). To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6 Discussion and outlook Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task. In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied. We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work. While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain. A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7 Conclusion Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets. To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes. 6www.ukp.tu-darmstadt.de/ood-fn-srl Acknowledgements This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper. -[ Out-of-domain FrameNet Semantic Role Labeling] +[Out-of-domain FrameNet Semantic Role Labeling] Heading sofa: _InitialView - begin: 0 + begin: 1 end: 46 -[ Silvana Hartmann Ilia Kuznetsov Teresa Martin Iryna Gurevych Research Training Group AIPHES Ubiquitous Knowledge Processing (UKP) Lab 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1] +[Silvana Hartmann Ilia Kuznetsov Teresa Martin Iryna Gurevych Research Training Group AIPHES Ubiquitous Knowledge Processing (UKP) Lab 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1] Paragraph sofa: _InitialView - begin: 46 + begin: 47 end: 1146 -[ - ] +[-] Heading sofa: _InitialView - begin: 1146 - end: 1149 -[ Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications.] + begin: 1147 + end: 1148 +[Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications.] Paragraph sofa: _InitialView - begin: 1149 + begin: 1150 end: 1464 -[ The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015).] +[The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015).] Paragraph sofa: _InitialView - begin: 1464 + begin: 1465 end: 1680 -[ In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL] +[In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL] Paragraph sofa: _InitialView - begin: 1680 + begin: 1681 end: 1776 -[ 1www.ukp.tu-darmstadt.de/ood-fn-srl ] +[1www.ukp.tu-darmstadt.de/ood-fn-srl] Heading sofa: _InitialView - begin: 1776 - end: 1813 -[ on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing.] + begin: 1777 + end: 1812 +[on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing.] Paragraph sofa: _InitialView - begin: 1813 + begin: 1814 end: 1945 -[ Based on FrameNet , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the] +[Based on FrameNet , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the] Paragraph sofa: _InitialView - begin: 1945 + begin: 1946 end: 2531 -[ FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates .] +[FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates .] Paragraph sofa: _InitialView - begin: 2531 + begin: 2532 end: 3215 -[ Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains.] +[Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains.] Paragraph sofa: _InitialView - begin: 3215 + begin: 3216 end: 3428 -[ In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain , despite using a simpler model, and improves on the out-of-domain performance of Semafor.] +[In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain , despite using a simpler model, and improves on the out-of-domain performance of Semafor.] Paragraph sofa: _InitialView - begin: 3428 + begin: 3429 end: 4336 -[ The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2] +[The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2] Paragraph sofa: _InitialView - begin: 4336 + begin: 4337 end: 4905 -[ Related work ] +[Related work] Heading sofa: _InitialView - begin: 4905 - end: 4919 -[ The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification.] + begin: 4906 + end: 4918 +[The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification.] Paragraph sofa: _InitialView - begin: 4919 + begin: 4920 end: 5283 -[ Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009).] +[Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009).] Paragraph sofa: _InitialView - begin: 5283 + begin: 5284 end: 6129 -[ Representation learning has been successfully used to improve on the CoNLL shared task results . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks.] +[Representation learning has been successfully used to improve on the CoNLL shared task results . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks.] Paragraph sofa: _InitialView - begin: 6129 + begin: 6130 end: 6540 -[ Domain dependence of FrameNet SRL The ] +[Domain dependence of FrameNet SRL The] Heading sofa: _InitialView - begin: 6540 - end: 6579 -[ FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains . However, the standard data split used to evaluate modern systems ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data.] + begin: 6541 + end: 6578 +[FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains . However, the standard data split used to evaluate modern systems ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data.] Paragraph sofa: _InitialView - begin: 6579 + begin: 6580 end: 7213 -[ There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations.] +[There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations.] Paragraph sofa: _InitialView - begin: 7213 + begin: 7214 end: 8292 -[ Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used.] +[Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used.] Paragraph sofa: _InitialView - begin: 8292 + begin: 8293 end: 8951 -[ Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data.] +[Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data.] Paragraph sofa: _InitialView - begin: 8951 + begin: 8952 end: 9474 -[ Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor .] +[Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor .] Paragraph sofa: _InitialView - begin: 9474 + begin: 9475 end: 10389 -[ The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts.] +[The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts.] Paragraph sofa: _InitialView - begin: 10389 + begin: 10390 end: 10899 -[ Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec and GloVe ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3] +[Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec and GloVe ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3] Paragraph sofa: _InitialView - begin: 10899 + begin: 10900 end: 12368 -[ Out-of-domain FrameNet test data ] +[Out-of-domain FrameNet test data] Heading sofa: _InitialView - begin: 12368 - end: 12402 -[ This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set.] + begin: 12369 + end: 12401 +[This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set.] Paragraph sofa: _InitialView - begin: 12402 + begin: 12403 end: 12936 -[ There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av).] +[There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av).] Paragraph sofa: _InitialView - begin: 12936 + begin: 12937 end: 13583 -[ Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality.] +[Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality.] Paragraph sofa: _InitialView - begin: 13583 + begin: 13584 end: 14273 -[ YAGS: a new FrameNet test set based on user ] +[YAGS: a new FrameNet test set based on user] Heading sofa: _InitialView - begin: 14273 - end: 14318 -[ generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/).] + begin: 14274 + end: 14317 +[generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/).] Paragraph sofa: _InitialView - begin: 14318 + begin: 14319 end: 14804 -[ YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset.] +[YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset.] Paragraph sofa: _InitialView - begin: 14804 + begin: 14805 end: 15134 -[ Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html).] +[Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html).] Paragraph sofa: _InitialView - begin: 15134 + begin: 15135 end: 15864 -[ Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup.] +[Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup.] Paragraph sofa: _InitialView - begin: 15864 + begin: 15865 end: 16349 -[ YAGS statistics and properties Table 1 presents ] +[YAGS statistics and properties Table 1 presents] Heading sofa: _InitialView - begin: 16349 - end: 16398 -[ dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS.] + begin: 16350 + end: 16397 +[dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS.] Paragraph sofa: _InitialView - begin: 16398 + begin: 16399 end: 16942 -[ The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets.] +[The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets.] Paragraph sofa: _InitialView - begin: 16942 + begin: 16943 end: 17195 -[ In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4] +[In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4] Paragraph sofa: _InitialView - begin: 17195 + begin: 17196 end: 17341 -[ Domain generalization capabilities of open-source FrameNet SRL ] +[Domain generalization capabilities of open-source FrameNet SRL] Heading sofa: _InitialView - begin: 17341 - end: 17405 -[ To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations.] + begin: 17342 + end: 17404 +[To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations.] Paragraph sofa: _InitialView - begin: 17405 + begin: 17406 end: 17813 -[ Evaluation script The Semafor evaluation ] +[Evaluation script The Semafor evaluation] Heading sofa: _InitialView - begin: 17813 - end: 17855 -[ script provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames.] + begin: 17814 + end: 17854 +[script provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames.] Paragraph sofa: _InitialView - begin: 17855 + begin: 17856 end: 18089 -[ The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles.] +[The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles.] Paragraph sofa: _InitialView - begin: 18089 + begin: 18090 end: 19024 -[ Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets.] +[Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets.] Paragraph sofa: _InitialView - begin: 19024 + begin: 19025 end: 19735 -[ When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2] +[When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2] Paragraph sofa: _InitialView - begin: 19735 + begin: 19736 end: 20068 -[ This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems.] +[This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems.] Paragraph sofa: _InitialView - begin: 20068 + begin: 20069 end: 20229 -[ Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below.] +[Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below.] Paragraph sofa: _InitialView - begin: 20229 + begin: 20230 end: 20469 -[ 2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data.] +[2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data.] Paragraph sofa: _InitialView - begin: 20469 + begin: 20470 end: 20974 -[ To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets.] +[To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets.] Paragraph sofa: _InitialView - begin: 20974 + begin: 20975 end: 21413 -[ The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets.] +[The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets.] Paragraph sofa: _InitialView - begin: 21413 + begin: 21414 end: 21857 -[ To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av.] +[To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av.] Paragraph sofa: _InitialView - begin: 21857 + begin: 21858 end: 22835 -[ In the next section, we study new methods to improve out-of-domain frame identification. 5] +[In the next section, we study new methods to improve out-of-domain frame identification. 5] Paragraph sofa: _InitialView - begin: 22835 + begin: 22836 end: 22926 -[ Frame identification with distributed word representations ] +[Frame identification with distributed word representations] Heading sofa: _InitialView - begin: 22926 - end: 22986 -[ Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance.] + begin: 22927 + end: 22985 +[Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance.] Paragraph sofa: _InitialView - begin: 22986 + begin: 22987 end: 24179 -[ Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training.] +[Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training.] Paragraph sofa: _InitialView - begin: 24179 + begin: 24180 end: 24823 -[ From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage.] +[From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage.] Paragraph sofa: _InitialView - begin: 24823 + begin: 24824 end: 26362 -[ Lexicon-based filtering In the testing stage, ] +[Lexicon-based filtering In the testing stage,] Heading sofa: _InitialView - begin: 26362 - end: 26409 -[ the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ).] + begin: 26363 + end: 26408 +[the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ).] Paragraph sofa: _InitialView - begin: 26409 + begin: 26410 end: 26553 -[ Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly.] +[Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly.] Paragraph sofa: _InitialView - begin: 26553 + begin: 26554 end: 26970 -[ Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense.] +[Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense.] Paragraph sofa: _InitialView - begin: 26970 + begin: 26971 end: 28783 -[ 3In our implementation, we use the LightFM package with the WARP option for hybrid matrix factorization.] +[3In our implementation, we use the LightFM package with the WARP option for hybrid matrix factorization.] Paragraph sofa: _InitialView - begin: 28783 + begin: 28784 end: 28890 -[ 4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon.] +[4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon.] Paragraph sofa: _InitialView - begin: 28890 + begin: 28891 end: 29157 -[ 5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/] +[5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/] Paragraph sofa: _InitialView - begin: 29157 + begin: 29158 end: 29249 -[ P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW.] +[P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW.] Paragraph sofa: _InitialView - begin: 29249 + begin: 29250 end: 30090 -[ Out-of-domain performance We also investi ] +[Out-of-domain performance We also investi] Heading sofa: _InitialView - begin: 30090 - end: 30133 -[ gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs.] + begin: 30091 + end: 30132 +[gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs.] Paragraph sofa: _InitialView - begin: 30133 + begin: 30134 end: 30558 -[ Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW.] +[Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW.] Paragraph sofa: _InitialView - begin: 30558 + begin: 30559 end: 30960 -[ The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence.] +[The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence.] Paragraph sofa: _InitialView - begin: 30960 + begin: 30961 end: 31615 -[ The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable.] +[The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable.] Paragraph sofa: _InitialView - begin: 31615 + begin: 31616 end: 32440 -[ A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].”] +[A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].”] Paragraph sofa: _InitialView - begin: 32440 + begin: 32441 end: 32866 -[ This problem is also relevant in generic WSD and benefits from the same solutions, for instance adapting embeddings to a particular domain and efficient use of embeddings .] +[This problem is also relevant in generic WSD and benefits from the same solutions, for instance adapting embeddings to a particular domain and efficient use of embeddings .] Paragraph sofa: _InitialView - begin: 32866 + begin: 32867 end: 33044 -[ Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015).] +[Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015).] Paragraph sofa: _InitialView - begin: 33044 + begin: 33045 end: 33521 -[ To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6] +[To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6] Paragraph sofa: _InitialView - begin: 33521 + begin: 33522 end: 33928 -[ Discussion and outlook ] +[Discussion and outlook] Heading sofa: _InitialView - begin: 33928 - end: 33952 -[ Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task.] + begin: 33929 + end: 33951 +[Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task.] Paragraph sofa: _InitialView - begin: 33952 + begin: 33953 end: 34319 -[ In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied.] +[In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied.] Paragraph sofa: _InitialView - begin: 34319 + begin: 34320 end: 34730 -[ We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work.] +[We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work.] Paragraph sofa: _InitialView - begin: 34730 + begin: 34731 end: 35370 -[ While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain.] +[While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain.] Paragraph sofa: _InitialView - begin: 35370 + begin: 35371 end: 35911 -[ A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7] +[A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7] Paragraph sofa: _InitialView - begin: 35911 + begin: 35912 end: 36406 -[ Conclusion ] +[Conclusion] Heading sofa: _InitialView - begin: 36406 - end: 36418 -[ Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets.] + begin: 36407 + end: 36417 +[Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets.] Paragraph sofa: _InitialView - begin: 36418 + begin: 36419 end: 36653 -[ To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes.] +[To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes.] Paragraph sofa: _InitialView - begin: 36653 + begin: 36654 end: 37542 -[ 6www.ukp.tu-darmstadt.de/ood-fn-srl ] +[6www.ukp.tu-darmstadt.de/ood-fn-srl] Heading sofa: _InitialView - begin: 37542 - end: 37579 -[ Acknowledgements ] + begin: 37543 + end: 37578 +[Acknowledgements] Heading sofa: _InitialView - begin: 37579 - end: 37597 -[ This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper.] + begin: 37580 + end: 37596 +[This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper.] Paragraph sofa: _InitialView - begin: 37597 + begin: 37598 end: 38111 -------- View _InitialView end ---------------------------------- diff --git a/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized-nocitations.dump b/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized-nocitations.dump index 141227932e..beb8977afe 100644 --- a/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized-nocitations.dump +++ b/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized-nocitations.dump @@ -13,435 +13,435 @@ DocumentMetaData CAS-Text: Out-of-domain FrameNet Semantic Role Labeling Silvana Hartmann Ilia Kuznetsov Teresa Martin Iryna Gurevych Research Training Group AIPHES Ubiquitous Knowledge Processing (UKP) Lab 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1 - Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications. The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015). In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL 1www.ukp.tu-darmstadt.de/ood-fn-srl on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing. Based on FrameNet , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates . Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains. In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain , despite using a simpler model, and improves on the out-of-domain performance of Semafor. The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2 Related work The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification. Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009). Representation learning has been successfully used to improve on the CoNLL shared task results . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks. Domain dependence of FrameNet SRL The FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains . However, the standard data split used to evaluate modern systems ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data. There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations. Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used. Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data. Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor . The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts. Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec and GloVe ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3 Out-of-domain FrameNet test data This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set. There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av). Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality. YAGS: a new FrameNet test set based on user generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/). YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset. Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html). Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup. YAGS statistics and properties Table 1 presents dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS. The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets. In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4 Domain generalization capabilities of open-source FrameNet SRL To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations. Evaluation script The Semafor evaluation script provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames. The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles. Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets. When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2 This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems. Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below. 2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data. To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets. The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets. To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av. In the next section, we study new methods to improve out-of-domain frame identification. 5 Frame identification with distributed word representations Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance. Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training. From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage. Lexicon-based filtering In the testing stage, the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ). Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly. Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense. 3In our implementation, we use the LightFM package with the WARP option for hybrid matrix factorization. 4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon. 5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/ P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW. Out-of-domain performance We also investi gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs. Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW. The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence. The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable. A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].” This problem is also relevant in generic WSD and benefits from the same solutions, for instance adapting embeddings to a particular domain and efficient use of embeddings . Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015). To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6 Discussion and outlook Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task. In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied. We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work. While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain. A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7 Conclusion Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets. To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes. 6www.ukp.tu-darmstadt.de/ood-fn-srl Acknowledgements This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper. Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics. -[ Out-of-domain FrameNet Semantic Role Labeling] +[Out-of-domain FrameNet Semantic Role Labeling] Heading sofa: _InitialView - begin: 0 + begin: 1 end: 46 -[ Silvana Hartmann Ilia Kuznetsov Teresa Martin Iryna Gurevych Research Training Group AIPHES Ubiquitous Knowledge Processing (UKP) Lab 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1] +[Silvana Hartmann Ilia Kuznetsov Teresa Martin Iryna Gurevych Research Training Group AIPHES Ubiquitous Knowledge Processing (UKP) Lab 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1] Paragraph sofa: _InitialView - begin: 46 + begin: 47 end: 1146 -[ - ] +[-] Heading sofa: _InitialView - begin: 1146 - end: 1149 -[ Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications.] + begin: 1147 + end: 1148 +[Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications.] Paragraph sofa: _InitialView - begin: 1149 + begin: 1150 end: 1464 -[ The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015).] +[The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015).] Paragraph sofa: _InitialView - begin: 1464 + begin: 1465 end: 1680 -[ In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL] +[In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL] Paragraph sofa: _InitialView - begin: 1680 + begin: 1681 end: 1776 -[ 1www.ukp.tu-darmstadt.de/ood-fn-srl ] +[1www.ukp.tu-darmstadt.de/ood-fn-srl] Heading sofa: _InitialView - begin: 1776 - end: 1813 -[ on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing.] + begin: 1777 + end: 1812 +[on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing.] Paragraph sofa: _InitialView - begin: 1813 + begin: 1814 end: 1945 -[ Based on FrameNet , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the] +[Based on FrameNet , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the] Paragraph sofa: _InitialView - begin: 1945 + begin: 1946 end: 2531 -[ FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates .] +[FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates .] Paragraph sofa: _InitialView - begin: 2531 + begin: 2532 end: 3215 -[ Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains.] +[Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains.] Paragraph sofa: _InitialView - begin: 3215 + begin: 3216 end: 3428 -[ In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain , despite using a simpler model, and improves on the out-of-domain performance of Semafor.] +[In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain , despite using a simpler model, and improves on the out-of-domain performance of Semafor.] Paragraph sofa: _InitialView - begin: 3428 + begin: 3429 end: 4336 -[ The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2] +[The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2] Paragraph sofa: _InitialView - begin: 4336 + begin: 4337 end: 4905 -[ Related work ] +[Related work] Heading sofa: _InitialView - begin: 4905 - end: 4919 -[ The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification.] + begin: 4906 + end: 4918 +[The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification.] Paragraph sofa: _InitialView - begin: 4919 + begin: 4920 end: 5283 -[ Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009).] +[Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009).] Paragraph sofa: _InitialView - begin: 5283 + begin: 5284 end: 6129 -[ Representation learning has been successfully used to improve on the CoNLL shared task results . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks.] +[Representation learning has been successfully used to improve on the CoNLL shared task results . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks.] Paragraph sofa: _InitialView - begin: 6129 + begin: 6130 end: 6540 -[ Domain dependence of FrameNet SRL The ] +[Domain dependence of FrameNet SRL The] Heading sofa: _InitialView - begin: 6540 - end: 6579 -[ FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains . However, the standard data split used to evaluate modern systems ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data.] + begin: 6541 + end: 6578 +[FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains . However, the standard data split used to evaluate modern systems ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data.] Paragraph sofa: _InitialView - begin: 6579 + begin: 6580 end: 7213 -[ There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations.] +[There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations.] Paragraph sofa: _InitialView - begin: 7213 + begin: 7214 end: 8292 -[ Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used.] +[Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used.] Paragraph sofa: _InitialView - begin: 8292 + begin: 8293 end: 8951 -[ Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data.] +[Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data.] Paragraph sofa: _InitialView - begin: 8951 + begin: 8952 end: 9474 -[ Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor .] +[Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor .] Paragraph sofa: _InitialView - begin: 9474 + begin: 9475 end: 10389 -[ The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts.] +[The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts.] Paragraph sofa: _InitialView - begin: 10389 + begin: 10390 end: 10899 -[ Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec and GloVe ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3] +[Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec and GloVe ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3] Paragraph sofa: _InitialView - begin: 10899 + begin: 10900 end: 12368 -[ Out-of-domain FrameNet test data ] +[Out-of-domain FrameNet test data] Heading sofa: _InitialView - begin: 12368 - end: 12402 -[ This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set.] + begin: 12369 + end: 12401 +[This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set.] Paragraph sofa: _InitialView - begin: 12402 + begin: 12403 end: 12936 -[ There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av).] +[There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av).] Paragraph sofa: _InitialView - begin: 12936 + begin: 12937 end: 13583 -[ Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality.] +[Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality.] Paragraph sofa: _InitialView - begin: 13583 + begin: 13584 end: 14273 -[ YAGS: a new FrameNet test set based on user ] +[YAGS: a new FrameNet test set based on user] Heading sofa: _InitialView - begin: 14273 - end: 14318 -[ generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/).] + begin: 14274 + end: 14317 +[generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/).] Paragraph sofa: _InitialView - begin: 14318 + begin: 14319 end: 14804 -[ YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset.] +[YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset.] Paragraph sofa: _InitialView - begin: 14804 + begin: 14805 end: 15134 -[ Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html).] +[Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html).] Paragraph sofa: _InitialView - begin: 15134 + begin: 15135 end: 15864 -[ Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup.] +[Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup.] Paragraph sofa: _InitialView - begin: 15864 + begin: 15865 end: 16349 -[ YAGS statistics and properties Table 1 presents ] +[YAGS statistics and properties Table 1 presents] Heading sofa: _InitialView - begin: 16349 - end: 16398 -[ dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS.] + begin: 16350 + end: 16397 +[dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS.] Paragraph sofa: _InitialView - begin: 16398 + begin: 16399 end: 16942 -[ The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets.] +[The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets.] Paragraph sofa: _InitialView - begin: 16942 + begin: 16943 end: 17195 -[ In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4] +[In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4] Paragraph sofa: _InitialView - begin: 17195 + begin: 17196 end: 17341 -[ Domain generalization capabilities of open-source FrameNet SRL ] +[Domain generalization capabilities of open-source FrameNet SRL] Heading sofa: _InitialView - begin: 17341 - end: 17405 -[ To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations.] + begin: 17342 + end: 17404 +[To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations.] Paragraph sofa: _InitialView - begin: 17405 + begin: 17406 end: 17813 -[ Evaluation script The Semafor evaluation ] +[Evaluation script The Semafor evaluation] Heading sofa: _InitialView - begin: 17813 - end: 17855 -[ script provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames.] + begin: 17814 + end: 17854 +[script provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames.] Paragraph sofa: _InitialView - begin: 17855 + begin: 17856 end: 18089 -[ The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles.] +[The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles.] Paragraph sofa: _InitialView - begin: 18089 + begin: 18090 end: 19024 -[ Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets.] +[Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets.] Paragraph sofa: _InitialView - begin: 19024 + begin: 19025 end: 19735 -[ When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2] +[When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2] Paragraph sofa: _InitialView - begin: 19735 + begin: 19736 end: 20068 -[ This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems.] +[This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems.] Paragraph sofa: _InitialView - begin: 20068 + begin: 20069 end: 20229 -[ Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below.] +[Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below.] Paragraph sofa: _InitialView - begin: 20229 + begin: 20230 end: 20469 -[ 2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data.] +[2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data.] Paragraph sofa: _InitialView - begin: 20469 + begin: 20470 end: 20974 -[ To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets.] +[To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets.] Paragraph sofa: _InitialView - begin: 20974 + begin: 20975 end: 21413 -[ The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets.] +[The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets.] Paragraph sofa: _InitialView - begin: 21413 + begin: 21414 end: 21857 -[ To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av.] +[To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av.] Paragraph sofa: _InitialView - begin: 21857 + begin: 21858 end: 22835 -[ In the next section, we study new methods to improve out-of-domain frame identification. 5] +[In the next section, we study new methods to improve out-of-domain frame identification. 5] Paragraph sofa: _InitialView - begin: 22835 + begin: 22836 end: 22926 -[ Frame identification with distributed word representations ] +[Frame identification with distributed word representations] Heading sofa: _InitialView - begin: 22926 - end: 22986 -[ Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance.] + begin: 22927 + end: 22985 +[Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance.] Paragraph sofa: _InitialView - begin: 22986 + begin: 22987 end: 24179 -[ Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training.] +[Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training.] Paragraph sofa: _InitialView - begin: 24179 + begin: 24180 end: 24823 -[ From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage.] +[From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage.] Paragraph sofa: _InitialView - begin: 24823 + begin: 24824 end: 26362 -[ Lexicon-based filtering In the testing stage, ] +[Lexicon-based filtering In the testing stage,] Heading sofa: _InitialView - begin: 26362 - end: 26409 -[ the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ).] + begin: 26363 + end: 26408 +[the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ).] Paragraph sofa: _InitialView - begin: 26409 + begin: 26410 end: 26553 -[ Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly.] +[Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly.] Paragraph sofa: _InitialView - begin: 26553 + begin: 26554 end: 26970 -[ Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense.] +[Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense.] Paragraph sofa: _InitialView - begin: 26970 + begin: 26971 end: 28783 -[ 3In our implementation, we use the LightFM package with the WARP option for hybrid matrix factorization.] +[3In our implementation, we use the LightFM package with the WARP option for hybrid matrix factorization.] Paragraph sofa: _InitialView - begin: 28783 + begin: 28784 end: 28890 -[ 4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon.] +[4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon.] Paragraph sofa: _InitialView - begin: 28890 + begin: 28891 end: 29157 -[ 5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/] +[5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/] Paragraph sofa: _InitialView - begin: 29157 + begin: 29158 end: 29249 -[ P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW.] +[P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW.] Paragraph sofa: _InitialView - begin: 29249 + begin: 29250 end: 30090 -[ Out-of-domain performance We also investi ] +[Out-of-domain performance We also investi] Heading sofa: _InitialView - begin: 30090 - end: 30133 -[ gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs.] + begin: 30091 + end: 30132 +[gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs.] Paragraph sofa: _InitialView - begin: 30133 + begin: 30134 end: 30558 -[ Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW.] +[Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW.] Paragraph sofa: _InitialView - begin: 30558 + begin: 30559 end: 30960 -[ The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence.] +[The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence.] Paragraph sofa: _InitialView - begin: 30960 + begin: 30961 end: 31615 -[ The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable.] +[The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable.] Paragraph sofa: _InitialView - begin: 31615 + begin: 31616 end: 32440 -[ A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].”] +[A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].”] Paragraph sofa: _InitialView - begin: 32440 + begin: 32441 end: 32866 -[ This problem is also relevant in generic WSD and benefits from the same solutions, for instance adapting embeddings to a particular domain and efficient use of embeddings .] +[This problem is also relevant in generic WSD and benefits from the same solutions, for instance adapting embeddings to a particular domain and efficient use of embeddings .] Paragraph sofa: _InitialView - begin: 32866 + begin: 32867 end: 33044 -[ Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015).] +[Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015).] Paragraph sofa: _InitialView - begin: 33044 + begin: 33045 end: 33521 -[ To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6] +[To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6] Paragraph sofa: _InitialView - begin: 33521 + begin: 33522 end: 33928 -[ Discussion and outlook ] +[Discussion and outlook] Heading sofa: _InitialView - begin: 33928 - end: 33952 -[ Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task.] + begin: 33929 + end: 33951 +[Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task.] Paragraph sofa: _InitialView - begin: 33952 + begin: 33953 end: 34319 -[ In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied.] +[In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied.] Paragraph sofa: _InitialView - begin: 34319 + begin: 34320 end: 34730 -[ We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work.] +[We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work.] Paragraph sofa: _InitialView - begin: 34730 + begin: 34731 end: 35370 -[ While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain.] +[While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain.] Paragraph sofa: _InitialView - begin: 35370 + begin: 35371 end: 35911 -[ A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7] +[A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7] Paragraph sofa: _InitialView - begin: 35911 + begin: 35912 end: 36406 -[ Conclusion ] +[Conclusion] Heading sofa: _InitialView - begin: 36406 - end: 36418 -[ Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets.] + begin: 36407 + end: 36417 +[Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets.] Paragraph sofa: _InitialView - begin: 36418 + begin: 36419 end: 36653 -[ To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes.] +[To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes.] Paragraph sofa: _InitialView - begin: 36653 + begin: 36654 end: 37542 -[ 6www.ukp.tu-darmstadt.de/ood-fn-srl ] +[6www.ukp.tu-darmstadt.de/ood-fn-srl] Heading sofa: _InitialView - begin: 37542 - end: 37579 -[ Acknowledgements ] + begin: 37543 + end: 37578 +[Acknowledgements] Heading sofa: _InitialView - begin: 37579 - end: 37597 -[ This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper.] + begin: 37580 + end: 37596 +[This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper.] Paragraph sofa: _InitialView - begin: 37597 + begin: 37598 end: 38111 -[ Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics.] +[Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics.] Paragraph sofa: _InitialView - begin: 38111 + begin: 38113 end: 49363 -------- View _InitialView end ---------------------------------- diff --git a/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized.dump b/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized.dump index 7d399a636a..7139d37db2 100644 --- a/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized.dump +++ b/dkpro-core-io-cermine-gpl/src/test/resources/reference/test-normalized.dump @@ -13,435 +13,435 @@ DocumentMetaData CAS-Text: Out-of-domain FrameNet Semantic Role Labeling Silvana Hartmann 0 Ilia Kuznetsov 0 Teresa Martin 0 Iryna Gurevych 0 Research Training Group AIPHES 0 Ubiquitous Knowledge Processing (UKP) Lab 0 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1 - Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications. The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015). In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL 1www.ukp.tu-darmstadt.de/ood-fn-srl on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing. Based on FrameNet (Fillmore et al., 2003) , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL (Carreras and Ma`rquez, 2005) , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates (Berant et al., 2014) . Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains. In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor (Das et al., 2014; Kshirsagar et al., 2015) to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain (Hermann et al., 2014) , despite using a simpler model, and improves on the out-of-domain performance of Semafor. The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2 Related work The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification. Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing (Daume´III, 2007; Blitzer et al., 2006) . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation (Surdeanu et al., 2008; Hajicˇ et al., 2009) . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009). Representation learning has been successfully used to improve on the CoNLL shared task results (Huang and Yates, 2010; FitzGerald et al., 2015; Yang et al., 2015) . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks. Domain dependence of FrameNet SRL The FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains (Ruppenhofer et al., 2010) . However, the standard data split used to evaluate modern systems (Das and Smith, 2011) ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data. There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations. Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system (Erk and Pado´, 2006) for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task (Baker et al., 2007) . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used. Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set (Baker et al., 2007) . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data. Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor (Das et al., 2014) . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor (Das et al., 2014) . The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts. Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec (Mikolov et al., 2013) and GloVe (Pennington et al., 2014) ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm (Weston et al., 2011) to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3 Out-of-domain FrameNet test data This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set. There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC (Passonneau et al., 2012) . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av). Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set (Johannsen et al., 2015) , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality. YAGS: a new FrameNet test set based on user generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/). YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset. Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 (Yimam et al., 2014) . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html). Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup. YAGS statistics and properties Table 1 presents dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS. The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets. In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4 Domain generalization capabilities of open-source FrameNet SRL To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor (Das et al., 2014) with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations. Evaluation script The Semafor evaluation script (Das et al., 2014) provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames. The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles. Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets. When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2 This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems. Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below. 2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data. To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets. The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets. To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av. In the next section, we study new methods to improve out-of-domain frame identification. 5 Frame identification with distributed word representations Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance. Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training. From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage. Lexicon-based filtering In the testing stage, the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ). Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly. Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense. 3In our implementation, we use the LightFM package (Kula, 2015) with the WARP option for hybrid matrix factorization. 4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon. 5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/ P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW. Out-of-domain performance We also investi gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs. Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW. The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence. The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable. A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].” This problem is also relevant in generic WSD (Agirre et al., 2010) and benefits from the same solutions, for instance adapting embeddings to a particular domain (Taghipour and Ng, 2015) and efficient use of embeddings (Iacobacci et al., 2016) . Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015). To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6 Discussion and outlook Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task. In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied. We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work. While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec (Melamud et al., 2016) and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain. A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7 Conclusion Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets. To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes. 6www.ukp.tu-darmstadt.de/ood-fn-srl Acknowledgements This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper. Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics. -[ Out-of-domain FrameNet Semantic Role Labeling] +[Out-of-domain FrameNet Semantic Role Labeling] Heading sofa: _InitialView - begin: 0 + begin: 1 end: 46 -[ Silvana Hartmann 0 Ilia Kuznetsov 0 Teresa Martin 0 Iryna Gurevych 0 Research Training Group AIPHES 0 Ubiquitous Knowledge Processing (UKP) Lab 0 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1] +[Silvana Hartmann 0 Ilia Kuznetsov 0 Teresa Martin 0 Iryna Gurevych 0 Research Training Group AIPHES 0 Ubiquitous Knowledge Processing (UKP) Lab 0 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1] Paragraph sofa: _InitialView - begin: 46 + begin: 47 end: 1158 -[ - ] +[-] Heading sofa: _InitialView - begin: 1158 - end: 1161 -[ Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications.] + begin: 1159 + end: 1160 +[Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from a different distribution than the training data. This prohibits their large-scale use in language technology applications.] Paragraph sofa: _InitialView - begin: 1161 + begin: 1162 end: 1476 -[ The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015).] +[The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015).] Paragraph sofa: _InitialView - begin: 1476 + begin: 1477 end: 1692 -[ In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL] +[In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL] Paragraph sofa: _InitialView - begin: 1692 + begin: 1693 end: 1788 -[ 1www.ukp.tu-darmstadt.de/ood-fn-srl ] +[1www.ukp.tu-darmstadt.de/ood-fn-srl] Heading sofa: _InitialView - begin: 1788 - end: 1825 -[ on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing.] + begin: 1789 + end: 1824 +[on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing.] Paragraph sofa: _InitialView - begin: 1825 + begin: 1826 end: 1957 -[ Based on FrameNet (Fillmore et al., 2003) , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the] +[Based on FrameNet (Fillmore et al., 2003) , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, often a verb, and its participants, typically syntactic arguments or adjuncts of the predicate. The predicate is assigned a frame label, essentially a word sense label, that defines the situation and determines the semantic roles of the participants. The following sentence from FrameNet provides an example of the Grinding frame and its roles: [The mill]Grinding cause malt]P atient [to grist]Result. grindsGrinding [the] Paragraph sofa: _InitialView - begin: 1957 + begin: 1958 end: 2567 -[ FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL (Carreras and Ma`rquez, 2005) , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates (Berant et al., 2014) .] +[FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed by the frame. The frameId step reduces the hundreds of role labels in FrameNet to a manageable set of up to 30 roles. Thus, FrameNet SRL differs from PropBank SRL (Carreras and Ma`rquez, 2005) , that only uses a small set of 26 syntactically motivated role labels and puts less weight on the predicate sense. The advantage of FrameNet SRL is that it results in a more fine-grained and rich interpretation of the input sentences which is crucial for many applications, e.g. reasoning in online debates (Berant et al., 2014) .] Paragraph sofa: _InitialView - begin: 2567 + begin: 2568 end: 3303 -[ Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains.] +[Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains.] Paragraph sofa: _InitialView - begin: 3303 + begin: 3304 end: 3516 -[ In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor (Das et al., 2014; Kshirsagar et al., 2015) to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain (Hermann et al., 2014) , despite using a simpler model, and improves on the out-of-domain performance of Semafor.] +[In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and exemplifies an out-of-domain application use case. We use YAGS, along with other out-of-domain test sets, to perform a detailed analysis of the domain dependence of FrameNet SRL using Semafor (Das et al., 2014; Kshirsagar et al., 2015) to identify which of the stages of FrameNet SRL, frameId or roleId, is particularly sensitive to domain shifts. Our results confirm that the major bottleneck in FrameNet SRL is the frame identification step. Motivated by that, we develop a simple, yet efficient frame identification method based on distributed word representations that promise better domain generalization. Our system’s performance matches the state-of-the-art in-domain (Hermann et al., 2014) , despite using a simpler model, and improves on the out-of-domain performance of Semafor.] Paragraph sofa: _InitialView - begin: 3516 + begin: 3517 end: 4491 -[ The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2] +[The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new frame identification method based on distributed word representations that enhances out-of-domain performance of frame identification. To enable our study, we created YAGS, a new, substantially-sized benchmark dataset for the out-of-domain testing of FrameNet SRL; we publish the annotations for the YAGS benchmark set and our frame identification system for research purposes. 2] Paragraph sofa: _InitialView - begin: 4491 + begin: 4492 end: 5060 -[ Related work ] +[Related work] Heading sofa: _InitialView - begin: 5060 - end: 5074 -[ The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification.] + begin: 5061 + end: 5073 +[The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation methods for NLP. This section briefly introduces some of the relevant approaches in these areas, and then summarizes the state-of-the-art in FrameNet frame identification.] Paragraph sofa: _InitialView - begin: 5074 + begin: 5075 end: 5438 -[ Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing (Daume´III, 2007; Blitzer et al., 2006) . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation (Surdeanu et al., 2008; Hajicˇ et al., 2009) . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009).] +[Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model performance on the test data originating from a different distribution than the training data (Søgaard, 2013). For NLP, domain adaptation has been studied for various tasks such as POS-tagging and syntactic parsing (Daume´III, 2007; Blitzer et al., 2006) . For the complex task of SRL, it is strongly associated with PropBank, because the corresponding CoNLL shared tasks promote out-of-domain evaluation (Surdeanu et al., 2008; Hajicˇ et al., 2009) . In the shared tasks, in-domain newspaper text from the WSJ Corpus is contrasted to out-of-domain data from fiction texts in the Brown Corpus. Most of the participants in the shared tasks do not consider domain adaptation and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009).] Paragraph sofa: _InitialView - begin: 5438 + begin: 5439 end: 6369 -[ Representation learning has been successfully used to improve on the CoNLL shared task results (Huang and Yates, 2010; FitzGerald et al., 2015; Yang et al., 2015) . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks.] +[Representation learning has been successfully used to improve on the CoNLL shared task results (Huang and Yates, 2010; FitzGerald et al., 2015; Yang et al., 2015) . Yang et al. (2015) report the smallest performance difference (5.5 points in F1) between in-domain and out-of-domain test data, leading to the best results to date on the CoNLL 2009 out-of-domain test. Their system learns common representations for in-domain and out-of-domain data based on deep belief networks.] Paragraph sofa: _InitialView - begin: 6369 + begin: 6370 end: 6848 -[ Domain dependence of FrameNet SRL The ] +[Domain dependence of FrameNet SRL The] Heading sofa: _InitialView - begin: 6848 - end: 6887 -[ FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains (Ruppenhofer et al., 2010) . However, the standard data split used to evaluate modern systems (Das and Smith, 2011) ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data.] + begin: 6849 + end: 6886 +[FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains (Ruppenhofer et al., 2010) . However, the standard data split used to evaluate modern systems (Das and Smith, 2011) ensures the presence of all domains in the training as well as test data and cannot be used to assess the systems’ ability to generalize. Moreover, all the texts in the FrameNet fulltext corpus, based on newspaper and literary texts, are post-edited and linguistically well-formed. The FrameNet test setup thus cannot provide information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data.] Paragraph sofa: _InitialView - begin: 6887 + begin: 6888 end: 7570 -[ There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations.] +[There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear Threats Initiative (NTI) data as an out-of-domain test set. They observe low domain generalization abilities of their supervised system, but find that using dependency parsers instead of constituency parsers is beneficial in the out-of-domain scenario. Croce et al. (2010) use a similar in-domain/out-ofdomain split to evaluate their approach to opendomain FrameNet SRL. They integrate a distributional model into their SRL system to generalize lexicalized features to previously unseen arguments and thus create an SRL system with a smaller performance gap between in-domain and out-ofdomain test data (only 4.5 percentage points F1). Note that they only evaluate the role labeling step. It is not transparent how their results would transfer to the current state-of-the-art SRL systems that already integrate methods to improve generalization, for instance using distributed representations.] Paragraph sofa: _InitialView - begin: 7570 + begin: 7571 end: 8649 -[ Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system (Erk and Pado´, 2006) for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task (Baker et al., 2007) . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used.] +[Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system (Erk and Pado´, 2006) for frame identification on several test sets across domains, i.e. the PropBank and NTI parts of the FrameNet fulltext corpus and the fictional texts from the SemEval-2007 shared task (Baker et al., 2007) . Having observed that the majority of errors results from coverage gaps in FrameNet, they suggest to focus on developing frame identification systems that generalize well to new domains. Our observations support their findings and show that the problem still persists even when modern SRL methods and the extended FrameNet 1.5 lexicon are used.] Paragraph sofa: _InitialView - begin: 8649 + begin: 8650 end: 9351 -[ Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set (Baker et al., 2007) . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data.] +[Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) on the new test set is similar to its performance on the SemEval-2007 newswire test set (Baker et al., 2007) . For full SRL, there are large differences: F1 reaches only 25.96% on the Twitter set compared to the 46.5% reported by Das et al. (2010) on the indomain set. These results show that there is ample room for improvement for SRL on Twitter data.] Paragraph sofa: _InitialView - begin: 9351 + begin: 9352 end: 9895 -[ Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor (Das et al., 2014) . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor (Das et al., 2014) .] +[Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the feature space for FrameNet SRL with FrameNet example sentences; FitzGerald et al. (2015) and Hermann et al. (2014) adopt deep learning methods, including learning representations that may generalize better to unseen data, to present stateof-the-art results for FrameNet SRL. All of the former only use the already introduced split of the FrameNet fulltext corpus for testing, as does the long-time state-of-the-art system Semafor (Das et al., 2014) . Out-of-domain evaluation is lacking, as are datasets that enable this kind of evaluation. Frame identification Current state of the art in frame identification is the approach by Hermann et al. (2014), further referred to as Hermann-14, followed by the previous state-of-the art model Semafor (Das et al., 2014) .] Paragraph sofa: _InitialView - begin: 9895 + begin: 9896 end: 10848 -[ The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts.] +[The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label propagation-based approach to take unknown predicates into account. Semafor is not specifically designed for out-of-domain use: the WordNet coverage is limited, and the quality of syntactic parsing might drop when the system is applied to out-ofdomain data, especially in case of non-standard user-generated texts.] Paragraph sofa: _InitialView - begin: 10848 + begin: 10849 end: 11358 -[ Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec (Mikolov et al., 2013) and GloVe (Pennington et al., 2014) ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm (Weston et al., 2011) to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3] +[Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec (Mikolov et al., 2013) and GloVe (Pennington et al., 2014) ) are beneficial for many NLP tasks: word representations are calculated on a large unlabeled corpus, and then used as input for high-level tasks for which training data is scarce, such as syntactic parsing, word sense disambiguation, and SRL. In the syntax-augmented representations of Hermann-14, a region of the input vector, a container, is reserved for each syntactic path that can connect predicates to their arguments. This container is populated with a corresponding argument word representation, if the argument on this path is found in the training data. Hermann-14 uses the WSABIE algorithm (Weston et al., 2011) to map input and frame representations to a common latent space. WSABIE uses WARP loss and gradient-based updates to minimize the distance between the latent representations of the predicate target and the correct frame, while maximizing the distance to all the other irrelevant frames. During testing, cosine similarity is used to find the closest frame given the input. One advantage of this approach is that similar frames are positioned close to each other in the latent space which allows information to be shared between similar predicates and similar frames. This system is the current state-ofthe-art for in-domain frame identification, but has not been applied in an out-of-domain setting. 3] Paragraph sofa: _InitialView - begin: 11358 + begin: 11359 end: 12898 -[ Out-of-domain FrameNet test data ] +[Out-of-domain FrameNet test data] Heading sofa: _InitialView - begin: 12898 - end: 12932 -[ This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set.] + begin: 12899 + end: 12931 +[This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically evaluated on das-test, the test set first introduced by Das and Smith (2011). It is a held-out set randomly sampled from the FrameNet 1.5 fulltext corpus. While the FrameNet fulltext corpus contains data from various sources, we consider das-test an in-domain test set: all data sources of the test set are also represented in the training set.] Paragraph sofa: _InitialView - begin: 12932 + begin: 12933 end: 13466 -[ There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC (Passonneau et al., 2012) . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av).] +[There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical sample of roughly 100 lemmas from ANC (Passonneau et al., 2012) . The Twitter-based dataset from Søgaard et al. (2015), henceforth TW, has some very distinctive properties: it does not provide a gold standard, but annotations by three annotators. This leads to a high variance in role annotations: the annotator TW3 annotated only 82% of the number of roles annotated by TW1, see Table 1. Like Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av).] Paragraph sofa: _InitialView - begin: 13466 + begin: 13467 end: 14139 -[ Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set (Johannsen et al., 2015) , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality.] +[Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of substantial size, but it constitutes a lexical sample and therefore a slightly artificial evaluation setup. There is another Twitter-based test set (Johannsen et al., 2015) , which we do not use in our experiments, because it was created semi-automatically and is therefore of lower quality. We conclude that existing out-of-domain test sets for FrameNet SRL are insufficient, in particular for increasingly important domains like user-generated text, because available datasets are either small or of low quality.] Paragraph sofa: _InitialView - begin: 14139 + begin: 14140 end: 14854 -[ YAGS: a new FrameNet test set based on user ] +[YAGS: a new FrameNet test set based on user] Heading sofa: _InitialView - begin: 14854 - end: 14899 -[ generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/).] + begin: 14855 + end: 14898 +[generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), a community-driven question-and-answer forum. The corpus is based on a random sample of 55 questions and their answers from the test split of the YA Manner Questions dataset used by Surdeanu et al. (2011) and published as part of the Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/).] Paragraph sofa: _InitialView - begin: 14899 + begin: 14900 end: 15385 -[ YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset.] +[YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated question-answer data, such as typos (mortal instead of mortar). We publish the annotations as stand-off annotations to the original dataset.] Paragraph sofa: _InitialView - begin: 15385 + begin: 15386 end: 15715 -[ Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 (Yimam et al., 2014) . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html).] +[Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 (Yimam et al., 2014) . Up to five predicates per sentence were pre-selected automatically based on lemma and POS, preferring verbal predicates to other POS, which leads to a larger proportion of verbs in YAGS. The annotation task was to identify the correct frame label for each predicate, if any, and then to identify the role spans as arguments and adjuncts of the frame, and to label them with the appropriate role. For reference, annotators accessed the FrameNet 1.5 definitions and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html).] Paragraph sofa: _InitialView - begin: 15715 + begin: 15716 end: 16466 -[ Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup.] +[Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good result for such a difficult task on user-generated text. Average pairwise F1 agreement for frame labels is high at 0.96, higher than the 0.84 reported by Søgaard et al. (2015) for the TW sets. Our high frame agreement is a result of annotator experience and our elaborate annotation setup.] Paragraph sofa: _InitialView - begin: 16466 + begin: 16467 end: 16951 -[ YAGS statistics and properties Table 1 presents ] +[YAGS statistics and properties Table 1 presents] Heading sofa: _InitialView - begin: 16951 - end: 17000 -[ dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS.] + begin: 16952 + end: 16999 +[dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles than TW, approximating the size of das-test. The proportion of core roles, roles that are obligatory for a frame and thus typically more frequent in datasets than non-core roles, in the out-of-domain test sets (TW, YAGS, MASC) is slightly smaller data s f a n v compared to das-test. This goes along with a larger variance of roles in YAGS.] Paragraph sofa: _InitialView - begin: 17000 + begin: 17001 end: 17544 -[ The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets.] +[The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets.] Paragraph sofa: _InitialView - begin: 17544 + begin: 17545 end: 17797 -[ In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4] +[In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4] Paragraph sofa: _InitialView - begin: 17797 + begin: 17798 end: 17943 -[ Domain generalization capabilities of open-source FrameNet SRL ] +[Domain generalization capabilities of open-source FrameNet SRL] Heading sofa: _InitialView - begin: 17943 - end: 18007 -[ To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor (Das et al., 2014) with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations.] + begin: 17944 + end: 18006 +[To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor (Das et al., 2014) with the enhanced role labeler from Kshirsagar et al. (2015), both trained on the in-domain das-train set, on the four test sets das-test, YAGS, TW, and MASC. The systems receive text annotated with predicate spans as input, which has become the standard in recent evaluations.] Paragraph sofa: _InitialView - begin: 18007 + begin: 18008 end: 18434 -[ Evaluation script The Semafor evaluation ] +[Evaluation script The Semafor evaluation] Heading sofa: _InitialView - begin: 18434 - end: 18476 -[ script (Das et al., 2014) provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames.] + begin: 18435 + end: 18475 +[script (Das et al., 2014) provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames.] Paragraph sofa: _InitialView - begin: 18476 + begin: 18477 end: 18729 -[ The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles.] +[The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. This is useful when comparing different SRL systems on the same test set, but not sufficient when 1) comparing role labeling performance on different test sets with a different ratio of frame labels to role labels (resulting from different annotation strategies), and 2) analyzing the contribution of frameId and roleId to full SRL performance across test sets. data das-test YAGS MASC TW-av frameId auto gold We therefore evaluate the output of the script to retain the original counts for role labels and compute scores on the role labeling proper (roleId). Moreover, there are two evaluation settings for frameId: exact frame match and partial frame match. We use the exact match setting that does not credit related frames and roles.] Paragraph sofa: _InitialView - begin: 18729 + begin: 18730 end: 19664 -[ Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets.] +[Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of annotators. According to column SRL/auto, we observe best Semafor performance for full SRL on dastest, results for the other test sets are at least 16 percentage points F1 lower. This is mostly due to the worse frameId performance of Semafor on the new test sets, as shown in column frameId: frameId performance is at least 19 percentage points lower. This negatively affects roleId for the out-of-domain test sets (see column roleId/auto). RoleId/auto scores are also low on das-test, but higher than for the other sets.] Paragraph sofa: _InitialView - begin: 19664 + begin: 19665 end: 20375 -[ When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2] +[When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation vanishes. Only MASC scores are still two points lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2] Paragraph sofa: _InitialView - begin: 20375 + begin: 20376 end: 20708 -[ This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems.] +[This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems.] Paragraph sofa: _InitialView - begin: 20708 + begin: 20709 end: 20869 -[ Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below.] +[Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below.] Paragraph sofa: _InitialView - begin: 20869 + begin: 20870 end: 21109 -[ 2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data.] +[2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test YAGS MASC TW1 TW2 TW3 Analysis In our study, it became clear that domain dependence is crucial to the frame identification step in SRL. The lower scores for the out-ofdomain test sets can be a result of different domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data.] Paragraph sofa: _InitialView - begin: 21109 + begin: 21110 end: 21614 -[ To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets.] +[To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous predicates, which boosts the overall performance. The occurrence of fewer monosemous predicates is expected for the lexical sample dataset MASC, but might indicate a domain preference for polysemous predicates in the YAGS and TW datasets.] Paragraph sofa: _InitialView - begin: 21614 + begin: 21615 end: 22053 -[ The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets.] +[The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId performance for MASC compared to the other test sets, and the slightly higher performance of TW-av and YAGS. Not all errors can be explained by insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets.] Paragraph sofa: _InitialView - begin: 22053 + begin: 22054 end: 22497 -[ To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av.] +[To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, predicates that do not occur in the training set. For MASC, the majority of the errors, 68%, are based on unseen predicates, while the number ranges between 37% and 43% for the other test sets, i.e. 37% for TW, 39% for das-test and 43% for YAGS. This shows that training data coverage is a bigger issue for MASC than for the other test sets. The proportions of in-train errors for YAGS and TW-av are similar to das-test. Together with the fact that overall proportion of errors is still much higher for the user-generated test sets YAGS and TW-av, this further supports our hypothesis of domain effects for YAGS and TW-av. Manual analysis furthermore shows that there are differences in frequently confused frames between the in-domain das-test and out-of-domain YAGS and TW-av.] Paragraph sofa: _InitialView - begin: 22497 + begin: 22498 end: 23475 -[ In the next section, we study new methods to improve out-of-domain frame identification. 5] +[In the next section, we study new methods to improve out-of-domain frame identification. 5] Paragraph sofa: _InitialView - begin: 23475 + begin: 23476 end: 23566 -[ Frame identification with distributed word representations ] +[Frame identification with distributed word representations] Heading sofa: _InitialView - begin: 23566 - end: 23626 -[ Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance.] + begin: 23567 + end: 23625 +[Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame identification method and compare it to the state of the art in both in-domain and out-of-domain settings. Our system SimpleFrameId We developed a straightforward approach to frame identification based on distributed word representations, and were surprised to find that this simple model achieves results comparable to the state-of-theart system, Hermann-14. Our initial attempts to replicate Hermann-14, which is not publicly available, revealed that the container-based input feature space is very sparse: there exist many syntactic paths that can connect a predicate to its arguments, but a predicate instance rarely has more than five arguments in the sentence. So by design the input representation bears no information in most of its path containers. Moreover, Hermann-14 makes heavy use of automatically created dependency parses, which might decline in quality when applied to a new domain. We demonstrate that our simple system achieves competitive in-domain and out-of-domain performance.] Paragraph sofa: _InitialView - begin: 23626 + begin: 23627 end: 24819 -[ Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training.] +[Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. From the machine learning perspective, the lexicon and the vector space are external resources. The lexicon contains associations between predicates and frames, and we further denote the set of frames available for a predicate as L(p). The vector space provides a pre-defined dense vector representation vsm(w) for each word w. In our case vsm is a simple word lookup function, since we do not modify our word representations during training.] Paragraph sofa: _InitialView - begin: 24819 + begin: 24820 end: 25463 -[ From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage.] +[From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW considers the dependency parse of the sentence and only includes direct dependents of the predicate, C = dep(p, S). As for the predicate, the plain embedding from the source vector space model is used, xp = vsm(p). A simple concatenation of xc and xp serves as input to the disambiguation classifier D, which outputs weights D(xc, xp, f ) for each frame known to the system f ∈ L. Note that the classifier itself is agnostic to the predicate’s part of speech and exact lemma and only relies on the word representations from the vsm. We experiment with two different classification methods: one is a twolayer neural network DNN , the other one is DW SB, which follows the line of Hermann-14 and learns representations for frames and predicates in the same latent space using the WSABIE algorithm.3 Hyperparameters are tuned on the development sets das-dev and YAGS-dev (sampled from YAGS); we test on the remaining 2,093 instances in YAGS-test. DataBaseline LexiconBaseline Semafor* Hermann-14* (best) WSB+SentBOW WSB+DepBOW NN+SentBOW NN+DepBOW total tering is performed. We find that our frame identification system performs surprisingly well in this setting, and we encourage the no-lexicon performance to be additionally reported in the future, since it better reflects the frame identification quality and smoothens the effect of lexicon coverage.] Paragraph sofa: _InitialView - begin: 25463 + begin: 25464 end: 27002 -[ Lexicon-based filtering In the testing stage, ] +[Lexicon-based filtering In the testing stage,] Heading sofa: _InitialView - begin: 27002 - end: 27049 -[ the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ).] + begin: 27003 + end: 27048 +[the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ).] Paragraph sofa: _InitialView - begin: 27049 + begin: 27050 end: 27193 -[ Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly.] +[Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, f ← argmaxf∈L(p)D(xc, xp, f ). If the predicate is unknown to the lexicon, p ∈/ L, the overall bestscoring frame is chosen. If the target has only one entry in the lexicon, it’s declared unambiguous and the frame is assigned directly.] Paragraph sofa: _InitialView - begin: 27193 + begin: 27194 end: 27610 -[ Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense.] +[Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet applied to new domains. Neither the predicate list 1.5 distribution. For the external vector space nor the frame associations are guaranteed to be model vsm we use dependency-based word emcomplete, and hence the total results are highly de- beddings from Levy and Goldberg (2014). termined by the lexicon coverage.4 To take this into account, we also perform evaluation in the In-domain performance We report the perforno-lexicon setting, where frames are assigned mance of our system in the in-domain setting directly by the classifier and no lexicon-based fil- to compare to the state-of-the-art results from Hermann-14.5 We train our system on das-train and test it on das-test using the full FrameNet lexicon. When available, we report the no-lexicon scores as well. As Table 4 shows, our system outBaselines We employ two majority baseline models for comparison. The DataBaseline assigns frames based on how often a frame is evoked by the given predicate. This corresponds to the most frequent sense baseline in word sense disambiguation (WSD). The frames available for predicates are obtained by scanning the training data. The LexiconBaseline calculates overall frame counts first (i.e. how often a frame appears in the training data in general), and, given the predicate, selects the overall most frequent frame among the ones available for this predicate. We expect this baseline to better handle the cases when limited data is available for a given predicate sense.] Paragraph sofa: _InitialView - begin: 27610 + begin: 27611 end: 29423 -[ 3In our implementation, we use the LightFM package (Kula, 2015) with the WARP option for hybrid matrix factorization.] +[3In our implementation, we use the LightFM package (Kula, 2015) with the WARP option for hybrid matrix factorization.] Paragraph sofa: _InitialView - begin: 29423 + begin: 29424 end: 29543 -[ 4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon.] +[4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon.] Paragraph sofa: _InitialView - begin: 29543 + begin: 29544 end: 29810 -[ 5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/] +[5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/] Paragraph sofa: _InitialView - begin: 29810 + begin: 29811 end: 29902 -[ P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW.] +[P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor performs Semafor and performs on par with the results reported for Hermann-14. One interesting observation is that our systems perform almost as well in the no-lexicon setting as the DataBaseline, which has access to the lexicon, in the total setting. To our surprise, the WSABIEbased frame identification did not yield a consistent improvement in-domain, compared to the simple NN-based approach. We also observe that in many cases the SentBOW representation performs on par with the DepBOW, while requiring significantly less data preprocessing: SentBOW only uses tokenization, whereas DepBow relies on lemmatization, POS-tagging, and dependency parsing. We attribute this effect to the fact that SentBOW provides more context information than the sparse, dependency-filteredDepBOW.] Paragraph sofa: _InitialView - begin: 29902 + begin: 29903 end: 30743 -[ Out-of-domain performance We also investi ] +[Out-of-domain performance We also investi] Heading sofa: _InitialView - begin: 30743 - end: 30786 -[ gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs.] + begin: 30744 + end: 30785 +[gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our systems outperform Semafor for all datasets. The YAGS dataset is the only dataset on which we do not strongly outperform Semafor. We attribute this to the complexity of the YAGS dataset that contains a high proportion of verbs.] Paragraph sofa: _InitialView - begin: 30786 + begin: 30787 end: 31211 -[ Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW.] +[Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding scores for in-domain data are not available. Error analysis To further investigate the performance of our system in the out-of-domain setup we analyse statistics on the errors made by the system variant NN+SentBOW.] Paragraph sofa: _InitialView - begin: 31211 + begin: 31212 end: 31613 -[ The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence.] +[The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown before, the quality of predictions for unknown predicates is generally lower. The second case is when the predicate is listed in lexicon (so it is not unknown), but the correct frame is not associated with this predicate. We further refer to this class of errors as unlinked. For unlinked predicates, the system is restricted to the set of frames provided by the lexicon, and by design has no means to select the right frame for a given predicate occurrence.] Paragraph sofa: _InitialView - begin: 31613 + begin: 31614 end: 32268 -[ The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable.] +[The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also renders many instances intractable for the system, if the lexicon coverage is incomplete. As Table 6 shows, unknown and unlinked predicates are almost non-present in the in-domain case, but are a major source of errors in the out-of-domain case and even might be responsible for the majority of errors occurring due to domain shift (see MASC). It is important to point out that there is still no guarantee that these would be classified correctly once the missing linking information is available in the lexicon. However, if the correct frame is not listed among the frames available for the predicate, the misclassification is inevitable.] Paragraph sofa: _InitialView - begin: 32268 + begin: 32269 end: 33093 -[ A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].”] +[A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For example, the predicate window was assigned the frame Connecting architecture instead of the correct frame Time period of action in the following sentence: “No effect of anesthetic protocol on IOP during a 12 minute measurement [window].”] Paragraph sofa: _InitialView - begin: 33093 + begin: 33094 end: 33519 -[ This problem is also relevant in generic WSD (Agirre et al., 2010) and benefits from the same solutions, for instance adapting embeddings to a particular domain (Taghipour and Ng, 2015) and efficient use of embeddings (Iacobacci et al., 2016) .] +[This problem is also relevant in generic WSD (Agirre et al., 2010) and benefits from the same solutions, for instance adapting embeddings to a particular domain (Taghipour and Ng, 2015) and efficient use of embeddings (Iacobacci et al., 2016) .] Paragraph sofa: _InitialView - begin: 33519 + begin: 33520 end: 33769 -[ Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015).] +[Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and Identicality for the predicate different). This could be addressed by incorporating subcategorization information and document context into the disamdataset unk biguation model, which has been proposed in recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015).] Paragraph sofa: _InitialView - begin: 33769 + begin: 33770 end: 34246 -[ To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6] +[To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which indicates that a) our distributed representations provide enough information to classify also noisy usergenerated text, and b) frameId errors cannot be attributed to preprocessing problems at large scale. 6] Paragraph sofa: _InitialView - begin: 34246 + begin: 34247 end: 34653 -[ Discussion and outlook ] +[Discussion and outlook] Heading sofa: _InitialView - begin: 34653 - end: 34677 -[ Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task.] + begin: 34654 + end: 34676 +[Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance drop for roleId once correct frames are available. The number of available roles given the correct frame is lower, on average 10, which reduces the complexity of the roleId task.] Paragraph sofa: _InitialView - begin: 34677 + begin: 34678 end: 35044 -[ In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied.] +[In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, and outperforms the best available open-source system in out-of-domain accuracy. We also observe that our system performs well in the newly introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied.] Paragraph sofa: _InitialView - begin: 35044 + begin: 35045 end: 35455 -[ We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work.] +[We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to errors for a standard classifier trained on in-domain data. One could optimize a frameId system to work in the no-lexicon setting which does not rely on the lexicon knowledge at all. However, in this setting the classification results are currently lower. Manually or automatically increasing both predicate and predicate-frame association coverage of the FrameNet lexicon could help, and we suggest investigating this line of research in future work.] Paragraph sofa: _InitialView - begin: 35455 + begin: 35456 end: 36095 -[ While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec (Melamud et al., 2016) and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain.] +[While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is large room for improvement. Some further benefits could be gained from combining the WSABIE and NN-based classification, using advanced context representations, e.g. context2vec (Melamud et al., 2016) and incorporating syntactic information into the model. The out-of-domain performance could be further improved by adapting word representations to a new domain.] Paragraph sofa: _InitialView - begin: 36095 + begin: 36096 end: 36659 -[ A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7] +[A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, we expect Hermann-14 to perform worse due to its heavy reliance on syntactic information, which might decline in quality when moved to a new domain; on the other hand, the WSABIE-based classification might smoothen this effect. We make our dataset publicly available to enable comparison to related work.6 7] Paragraph sofa: _InitialView - begin: 36659 + begin: 36660 end: 37154 -[ Conclusion ] +[Conclusion] Heading sofa: _InitialView - begin: 37154 - end: 37166 -[ Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets.] + begin: 37155 + end: 37165 +[Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets.] Paragraph sofa: _InitialView - begin: 37166 + begin: 37167 end: 37401 -[ To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes.] +[To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a prerequisite, we introduce YAGS, a new, substantially sized test set in the domain of user-generated questionand-answer text. We find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step; we 2) explore a promising way to improve out-of-domain frame identification, i.e. using distributed word representations. Our simple frame identification system based on distributed word representations achieves higher scores for out-of-domain frame identification than previous systems and approaches state-of-the-art results indomain. To support reproducibility of our results, we publish the YAGS test set annotations and our frame identification system for research purposes.] Paragraph sofa: _InitialView - begin: 37401 + begin: 37402 end: 38290 -[ 6www.ukp.tu-darmstadt.de/ood-fn-srl ] +[6www.ukp.tu-darmstadt.de/ood-fn-srl] Heading sofa: _InitialView - begin: 38290 - end: 38327 -[ Acknowledgements ] + begin: 38291 + end: 38326 +[Acknowledgements] Heading sofa: _InitialView - begin: 38327 - end: 38345 -[ This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper.] + begin: 38328 + end: 38344 +[This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of Information form Heterogeneous Sources” (AIPHES, GRK 1994/1). We thank Orin Hargraves and our annotators for their excellent work on the annotation study, Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper.] Paragraph sofa: _InitialView - begin: 38345 + begin: 38346 end: 38859 -[ Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics.] +[Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics.] Paragraph sofa: _InitialView - begin: 38859 + begin: 38861 end: 50111 -------- View _InitialView end ---------------------------------- diff --git a/dkpro-core-io-cermine-gpl/src/test/resources/reference/test.dump b/dkpro-core-io-cermine-gpl/src/test/resources/reference/test.dump index 601ee4821d..9ad5d4d9df 100644 --- a/dkpro-core-io-cermine-gpl/src/test/resources/reference/test.dump +++ b/dkpro-core-io-cermine-gpl/src/test/resources/reference/test.dump @@ -766,23 +766,22 @@ Dr. Richard Eckart de Castilho for support regarding WebAnno, as well as Dr. Judith Eckle-Kohler and the anonymous reviewers for their comments on earlier versions of this paper. Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics. -[ Out-of-domain FrameNet Semantic Role Labeling] +[Out-of-domain FrameNet Semantic Role Labeling] Heading sofa: _InitialView - begin: 0 + begin: 1 end: 46 -[ Silvana Hartmann 0 Ilia Kuznetsov 0 Teresa Martin 0 Iryna Gurevych 0 Research Training Group AIPHES 0 Ubiquitous Knowledge Processing (UKP) Lab 0 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1] +[Silvana Hartmann 0 Ilia Kuznetsov 0 Teresa Martin 0 Iryna Gurevych 0 Research Training Group AIPHES 0 Ubiquitous Knowledge Processing (UKP) Lab 0 0 Department of Computer Science, Technische Universita ̈t Darmstadt 2017 1 471 482 Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. We publish our system and test data for research purposes.1] Paragraph sofa: _InitialView - begin: 46 + begin: 47 end: 1158 -[ - -] +[-] Heading sofa: _InitialView - begin: 1158 - end: 1161 -[ Domain dependence is a major problem for + begin: 1159 + end: 1160 +[Domain dependence is a major problem for supervised NLP tasks such as FrameNet semantic role labeling (SRL): systems generally exhibit a strong performance drop when applied to test data from @@ -791,37 +790,36 @@ prohibits their large-scale use in language technology applications.] Paragraph sofa: _InitialView - begin: 1161 + begin: 1162 end: 1476 -[ The same problems are expected for FrameNet +[The same problems are expected for FrameNet SRL, but due to a lack of datasets, state-of-theart FrameNet SRL is only evaluated on a single in-domain test set, see e.g. Das et al. (2014) and FitzGerald et al. (2015).] Paragraph sofa: _InitialView - begin: 1476 + begin: 1477 end: 1692 -[ In this work, we present the first comprehensive +[In this work, we present the first comprehensive study of the domain dependence of FrameNet SRL] Paragraph sofa: _InitialView - begin: 1692 + begin: 1693 end: 1788 -[ 1www.ukp.tu-darmstadt.de/ood-fn-srl -] +[1www.ukp.tu-darmstadt.de/ood-fn-srl] Heading sofa: _InitialView - begin: 1788 - end: 1825 -[ on a range of benchmark datasets. This is crucial as + begin: 1789 + end: 1824 +[on a range of benchmark datasets. This is crucial as the demand for semantic textual analysis of largescale web data keeps growing.] Paragraph sofa: _InitialView - begin: 1825 + begin: 1826 end: 1957 -[ Based on FrameNet (Fillmore et al., 2003) , +[Based on FrameNet (Fillmore et al., 2003) , FrameNet SRL extracts frame-semantic structures on the sentence level that describe a specific situation centered around a semantic predicate, @@ -838,9 +836,9 @@ grindsGrinding [the] Paragraph sofa: _InitialView - begin: 1957 + begin: 1958 end: 2567 -[ FrameNet SRL consists of two steps, frame +[FrameNet SRL consists of two steps, frame identification (frameId), assigning a frame to the current predicate, and role labeling (roleId), identifying the participants and assigning them role labels licensed @@ -857,18 +855,18 @@ crucial for many applications, e.g. reasoning in online debates (Berant et al., 2014) .] Paragraph sofa: _InitialView - begin: 2567 + begin: 2568 end: 3303 -[ Domain dependence is a well-studied topic for +[Domain dependence is a well-studied topic for PropBank SRL. However, to the best of our knowledge, there exists no analysis of the performance of modern FrameNet SRL systems when applied to data from new domains.] Paragraph sofa: _InitialView - begin: 3303 + begin: 3304 end: 3516 -[ In this work, we address this problem as +[In this work, we address this problem as follows: we introduce a new benchmark dataset YAGS (Yahoo! Answers Gold Standard), which is based on user-generated questions and answers and @@ -890,9 +888,9 @@ despite using a simpler model, and improves on the out-of-domain performance of Semafor.] Paragraph sofa: _InitialView - begin: 3516 + begin: 3517 end: 4491 -[ The contributions of the present work are +[The contributions of the present work are twofold: 1) we perform the first comprehensive study of the domain generalization capabilities of opensource FrameNet SRL, and 2) we propose a new @@ -907,15 +905,14 @@ system for research purposes. 2] Paragraph sofa: _InitialView - begin: 4491 + begin: 4492 end: 5060 -[ Related work -] +[Related work] Heading sofa: _InitialView - begin: 5060 - end: 5074 -[ The domain dependence of FrameNet SRL + begin: 5061 + end: 5073 +[The domain dependence of FrameNet SRL systems has been only studied sparsely, however, there exists a large body of work on out-of-domain PropBank SRL, as well as on general domain adaptation @@ -925,9 +922,9 @@ then summarizes the state-of-the-art in FrameNet frame identification.] Paragraph sofa: _InitialView - begin: 5074 + begin: 5075 end: 5438 -[ Domain adaptation in NLP Low +[Domain adaptation in NLP Low out-ofdomain performance is a problem common to many supervised machine learning tasks. The goal of domain adaptation is to improve model @@ -949,9 +946,9 @@ and report systematically lower scores for the out-of-domain data (Hajicˇ et al., 2009).] Paragraph sofa: _InitialView - begin: 5438 + begin: 5439 end: 6369 -[ Representation learning has been successfully +[Representation learning has been successfully used to improve on the CoNLL shared task results (Huang and Yates, 2010; FitzGerald et al., 2015; Yang et al., 2015) . Yang et al. (2015) @@ -963,15 +960,14 @@ learns common representations for in-domain and out-of-domain data based on deep belief networks.] Paragraph sofa: _InitialView - begin: 6369 + begin: 6370 end: 6848 -[ Domain dependence of FrameNet SRL The -] +[Domain dependence of FrameNet SRL The] Heading sofa: _InitialView - begin: 6848 - end: 6887 -[ FrameNet 1.5 fulltext corpus, used as a standard + begin: 6849 + end: 6886 +[FrameNet 1.5 fulltext corpus, used as a standard dataset for training and evaluating FrameNet SRL systems, contains texts from several domains (Ruppenhofer et al., 2010) . However, the standard data @@ -987,9 +983,9 @@ information on SRL performance on less edited out-ofdomain data, e.g. user-generated web data.] Paragraph sofa: _InitialView - begin: 6887 + begin: 6888 end: 7570 -[ There are few studies related to the +[There are few studies related to the out-ofdomain generalization of FrameNet SRL. Johansson and Nugues (2008) evaluate the impact of different parsers on FrameNet SRL using the Nuclear @@ -1013,9 +1009,9 @@ already integrate methods to improve generalization, for instance using distributed representations.] Paragraph sofa: _InitialView - begin: 7570 + begin: 7571 end: 8649 -[ Palmer and Sporleder (2010) analyze the +[Palmer and Sporleder (2010) analyze the FrameNet 1.3 training data coverage and the performance of the Shalmaneser SRL system (Erk and Pado´, 2006) for frame identification on @@ -1032,9 +1028,9 @@ even when modern SRL methods and the extended FrameNet 1.5 lexicon are used.] Paragraph sofa: _InitialView - begin: 8649 + begin: 8650 end: 9351 -[ Søgaard et al. (2015) annotate 236 tweets with +[Søgaard et al. (2015) annotate 236 tweets with FrameNet labels to apply SRL to knowledge extraction from Twitter. They report that the frameId performance of Semafor 2.1 (Das et al., 2010) @@ -1047,9 +1043,9 @@ indomain set. These results show that there is ample room for improvement for SRL on Twitter data.] Paragraph sofa: _InitialView - begin: 9351 + begin: 9352 end: 9895 -[ Recent FrameNet SRL systems are not +[Recent FrameNet SRL systems are not evaluated in the context of their domain dependence: Kshirsagar et al. (2015) use the domain adaptation approach from Daume´III (2007) to augment the @@ -1071,9 +1067,9 @@ followed by the previous state-of-the art model Semafor (Das et al., 2014) .] Paragraph sofa: _InitialView - begin: 9895 + begin: 9896 end: 10848 -[ The frame identification system of Semafor +[The frame identification system of Semafor relies on an elaborate feature set based on syntactic and lexical features, using the WordNet hierarchy as a source of lexical information, and a label @@ -1086,9 +1082,9 @@ out-ofdomain data, especially in case of non-standard user-generated texts.] Paragraph sofa: _InitialView - begin: 10848 + begin: 10849 end: 11358 -[ Hermann-14 uses distributed word +[Hermann-14 uses distributed word representations augmented by syntactic information. Generalpurpose distributed word representations (such as word2vec (Mikolov et al., 2013) and GloVe @@ -1122,15 +1118,14 @@ not been applied in an out-of-domain setting. 3] Paragraph sofa: _InitialView - begin: 11358 + begin: 11359 end: 12898 -[ Out-of-domain FrameNet test data -] +[Out-of-domain FrameNet test data] Heading sofa: _InitialView - begin: 12898 - end: 12932 -[ This section describes available in-domain and + begin: 12899 + end: 12931 +[This section describes available in-domain and outof-domain FrameNet test sets and the creation of YAGS, a new out-of-domain FrameNet test set. FrameNet test sets FrameNet SRL is typically @@ -1143,9 +1138,9 @@ in-domain test set: all data sources of the test set are also represented in the training set.] Paragraph sofa: _InitialView - begin: 12932 + begin: 12933 end: 13466 -[ There are two additional datasets from other +[There are two additional datasets from other domains that we use in our study on domain generalization: The MASC word sense sentences corpus contains FrameNet annotations for a lexical @@ -1161,9 +1156,9 @@ Søgaard et al. (2015), we report SRL results as averages over the three annotations (TW-av).] Paragraph sofa: _InitialView - begin: 13466 + begin: 13467 end: 14139 -[ Table 1 shows statistics on these datasets. For +[Table 1 shows statistics on these datasets. For TW, it displays the statistics for each annotator. The TW datasets are fairly small, containing only around 1,000 frame labels. The MASC dataset is of @@ -1179,15 +1174,14 @@ important domains like user-generated text, because available datasets are either small or of low quality.] Paragraph sofa: _InitialView - begin: 14139 + begin: 14140 end: 14854 -[ YAGS: a new FrameNet test set based on user -] +[YAGS: a new FrameNet test set based on user] Heading sofa: _InitialView - begin: 14854 - end: 14899 -[ generated text To address the need for new + begin: 14855 + end: 14898 +[generated text To address the need for new outof-domain test datasets, we created YAGS, a new FrameNet-annotated evaluation dataset based on question-answer data from Yahoo! Answers (YA), @@ -1200,9 +1194,9 @@ Yahoo! Webscope program (https://webscope. sandbox.yahoo.com/).] Paragraph sofa: _InitialView - begin: 14899 + begin: 14900 end: 15385 -[ YAGS contains 1,415 sentences, 3,091 frame +[YAGS contains 1,415 sentences, 3,091 frame annotations, and 6,081 role annotations. Figure 1 shows a sentence from YAGS that demonstrates some non-standard properties of the user-generated @@ -1211,9 +1205,9 @@ of mortar). We publish the annotations as stand-off annotations to the original dataset.] Paragraph sofa: _InitialView - begin: 15385 + begin: 15386 end: 15715 -[ Annotation study Each document was +[Annotation study Each document was annotated by a two linguistically trained annotators provided with detailed guidelines and then curated by an experienced expert, all using WebAnno 2.0.0 @@ -1231,9 +1225,9 @@ and examples with the FrameNet Explorer tool (www.clres.com/FNExplorer.html).] Paragraph sofa: _InitialView - begin: 15715 + begin: 15716 end: 16466 -[ Inter-rater agreement for frame labels is +[Inter-rater agreement for frame labels is Krippendorff’s α=0.76; agreement for role labels given matching spans is α=0.62, and Krippendorff’s α unitizing agreement for role spans is 0.7 – a good @@ -1245,15 +1239,14 @@ frame agreement is a result of annotator experience and our elaborate annotation setup.] Paragraph sofa: _InitialView - begin: 16466 + begin: 16467 end: 16951 -[ YAGS statistics and properties Table 1 presents -] +[YAGS statistics and properties Table 1 presents] Heading sofa: _InitialView - begin: 16951 - end: 17000 -[ dataset statistics for YAGS and the other test sets. + begin: 16952 + end: 16999 +[dataset statistics for YAGS and the other test sets. Due to the predicate selection, YAGS contains a larger proportion of verbal predicates than the other sets, and has three times more frames and roles @@ -1272,32 +1265,31 @@ compared to das-test. This goes along with a larger variance of roles in YAGS.] Paragraph sofa: _InitialView - begin: 17000 + begin: 17001 end: 17544 -[ The user-generated aspect of YAGS manifests in +[The user-generated aspect of YAGS manifests in spelling errors, and in the lack of punctuation and structure of the texts. The language is informal, but there are only few emoticons or other special words such as the hashtags typically found in tweets.] Paragraph sofa: _InitialView - begin: 17544 + begin: 17545 end: 17797 -[ In the next section, we use the test sets from +[In the next section, we use the test sets from Table 1 to analyze the domain generalization capabilities of an open-source FrameNet SRL system. 4] Paragraph sofa: _InitialView - begin: 17797 + begin: 17798 end: 17943 -[ Domain generalization capabilities of open-source FrameNet SRL -] +[Domain generalization capabilities of open-source FrameNet SRL] Heading sofa: _InitialView - begin: 17943 - end: 18007 -[ To analyze the domain generalization capabilities + begin: 17944 + end: 18006 +[To analyze the domain generalization capabilities of contemporary open-source SRL, we ran the frame identification from Semafor (Das et al., 2014) with the enhanced role labeler from @@ -1308,24 +1300,23 @@ annotated with predicate spans as input, which has become the standard in recent evaluations.] Paragraph sofa: _InitialView - begin: 18007 + begin: 18008 end: 18434 -[ Evaluation script The Semafor evaluation -] +[Evaluation script The Semafor evaluation] Heading sofa: _InitialView - begin: 18434 - end: 18476 -[ script (Das et al., 2014) provides precision P, recall + begin: 18435 + end: 18475 +[script (Das et al., 2014) provides precision P, recall R, and F1 scores for full SRL (SRL), and accuracy A for frame identification (frameId). Full SRL evaluation can be performed with and without using gold frames instead of predicted (auto) frames.] Paragraph sofa: _InitialView - begin: 18476 + begin: 18477 end: 18729 -[ The script does not provide results on the +[The script does not provide results on the role labeling (argument identification and labeling, roleId) alone: the scoring mechanism for SRL/gold also considers the by default correct gold frames. @@ -1353,9 +1344,9 @@ the exact match setting that does not credit related frames and roles.] Paragraph sofa: _InitialView - begin: 18729 + begin: 18730 end: 19664 -[ Results Table 2 presents scores for exact match +[Results Table 2 presents scores for exact match frameId and for SRL and roleId with automatic frames (auto) and with gold frames (gold). For TW, the results are averaged over the number of @@ -1372,9 +1363,9 @@ scores are also low on das-test, but higher than for the other sets.] Paragraph sofa: _InitialView - begin: 19664 + begin: 19665 end: 20375 -[ When using gold frame labels, roleId and SRL +[When using gold frame labels, roleId and SRL performance improve for all test sets. As shown in columns roleId/gold and SRL/gold, the difference between in-domain and out-of-domain evaluation @@ -1383,26 +1374,26 @@ lower for full SRL than those for das-test. TW-av scores even surpass the in-domain scores.2] Paragraph sofa: _InitialView - begin: 20375 + begin: 20376 end: 20708 -[ This shows how much FrameNet role labels are +[This shows how much FrameNet role labels are dependent on correct frame labels. Thus, it is crucial to improve the out-of-domain performance of frameId systems.] Paragraph sofa: _InitialView - begin: 20708 + begin: 20709 end: 20869 -[ Domain dependence appears to be less of a +[Domain dependence appears to be less of a problem for the role labeling step. The MASC dataset is the most difficult for both frameId and roleId. This is mostly a consequence of the lower training data coverage of MASC, as discussed below.] Paragraph sofa: _InitialView - begin: 20869 + begin: 20870 end: 21109 -[ 2Our TW-av results are not comparable to those from +[2Our TW-av results are not comparable to those from Søgaard et al. (2015) because their test setup includes predicate target identification and uses different evaluation metrics. das-test @@ -1419,9 +1410,9 @@ domainspecific predicate-frame distributions, or a lack of coverage of the domain in the training data.] Paragraph sofa: _InitialView - begin: 21109 + begin: 21110 end: 21614 -[ To get a better understanding of these +[To get a better understanding of these phenomena, we compared detailed statistics of the different test sets, cf. Table 3. Das-test has the largest predicate coverage and contains a lot of monosemous @@ -1432,9 +1423,9 @@ might indicate a domain preference for polysemous predicates in the YAGS and TW datasets.] Paragraph sofa: _InitialView - begin: 21614 + begin: 21615 end: 22053 -[ The percentage of unseen predicates (lemmas ∈/ +[The percentage of unseen predicates (lemmas ∈/ das-train) is slightly higher for the user-generated test sets than for das-test, and much higher for MASC. This is mirrored in the lower frameId @@ -1445,9 +1436,9 @@ insufficient training data coverage, which indicates that domain effects occur for the out-of-domain sets.] Paragraph sofa: _InitialView - begin: 22053 + begin: 22054 end: 22497 -[ To support this assumption, we performed a +[To support this assumption, we performed a detailed error analysis on the misclassified instances for all test sets. We compute the proportion of wrongly classified instances with unseen predicates, @@ -1469,22 +1460,21 @@ confused frames between the in-domain das-test and out-of-domain YAGS and TW-av.] Paragraph sofa: _InitialView - begin: 22497 + begin: 22498 end: 23475 -[ In the next section, we study new methods to +[In the next section, we study new methods to improve out-of-domain frame identification. 5] Paragraph sofa: _InitialView - begin: 23475 + begin: 23476 end: 23566 -[ Frame identification with distributed word representations -] +[Frame identification with distributed word representations] Heading sofa: _InitialView - begin: 23566 - end: 23626 -[ Given a predicate and a set of frames associated + begin: 23567 + end: 23625 +[Given a predicate and a set of frames associated with this predicate, a frame identification system has to choose the correct frame based on the context. In this section we introduce our frame @@ -1511,9 +1501,9 @@ simple system achieves competitive in-domain and out-of-domain performance.] Paragraph sofa: _InitialView - begin: 23626 + begin: 23627 end: 24819 -[ Our system, called SimpleFrameId, is specified +[Our system, called SimpleFrameId, is specified as follows: given the lexicon L, the vector space vsm and the training data, our goal is to predict the frame f given the sentence S and the predicate p. @@ -1528,9 +1518,9 @@ simple word lookup function, since we do not modify our word representations during training.] Paragraph sofa: _InitialView - begin: 24819 + begin: 24820 end: 25463 -[ From the sentence we extract the context +[From the sentence we extract the context representation, xc = Pw∈C|Cv|sm(w) . We experiment with two kinds of contexts: SentBOW includes all the words in the sentence, i.e. C = S, DepBOW @@ -1570,22 +1560,21 @@ since it better reflects the frame identification quality and smoothens the effect of lexicon coverage.] Paragraph sofa: _InitialView - begin: 25463 + begin: 25464 end: 27002 -[ Lexicon-based filtering In the testing stage, -] +[Lexicon-based filtering In the testing stage,] Heading sofa: _InitialView - begin: 27002 - end: 27049 -[ the classifier outputs weights for all the frames + begin: 27003 + end: 27048 +[the classifier outputs weights for all the frames available in the lexicon, and the best-scoring frame is selected, f ← argmaxf∈LD(xc, xp, f ).] Paragraph sofa: _InitialView - begin: 27049 + begin: 27050 end: 27193 -[ Since the lexicon specifies available frames for +[Since the lexicon specifies available frames for each lexical unit (i.e. lemma and POS), additional filtering can be performed, which limits the search only to the available frames, @@ -1596,9 +1585,9 @@ entry in the lexicon, it’s declared unambiguous and the frame is assigned directly.] Paragraph sofa: _InitialView - begin: 27193 + begin: 27194 end: 27610 -[ Despite being common, this setup has several +[Despite being common, this setup has several flaws that can obscure the differences between sys- Experiments In our experiments, we generate tems in the testing stage. As we showed in Section the lexicon L in the same way as in Hermann-14, 4, the FrameNet lexicon has coverage issues when by scanning the “frames” folder of the FrameNet @@ -1629,31 +1618,31 @@ baseline to better handle the cases when limited data is available for a given predicate sense.] Paragraph sofa: _InitialView - begin: 27610 + begin: 27611 end: 29423 -[ 3In our implementation, we use the LightFM package +[3In our implementation, we use the LightFM package (Kula, 2015) with the WARP option for hybrid matrix factorization.] Paragraph sofa: _InitialView - begin: 29423 + begin: 29424 end: 29543 -[ 4A justification for this can also be found in Hermann +[4A justification for this can also be found in Hermann et al. (2014): the difference in Hermann-14 accuracy when switching from the Semafor lexicon to the full lexicon is comparable to the difference between Semafor and Hermann-14 when evaluated on the same lexicon.] Paragraph sofa: _InitialView - begin: 29543 + begin: 29544 end: 29810 -[ 5Based on the errata version of Hermann et al. +[5Based on the errata version of Hermann et al. (2014) in http://www.aclweb.org/anthology/P/] Paragraph sofa: _InitialView - begin: 29810 + begin: 29811 end: 29902 -[ P14/P14-1136v2.pdf +[P14/P14-1136v2.pdf DataBaseline LexiconBaseline Semafor @@ -1676,15 +1665,14 @@ provides more context information than the sparse, dependency-filteredDepBOW.] Paragraph sofa: _InitialView - begin: 29902 + begin: 29903 end: 30743 -[ Out-of-domain performance We also investi -] +[Out-of-domain performance We also investi] Heading sofa: _InitialView - begin: 30743 - end: 30786 -[ gate how well the systems perform in the + begin: 30744 + end: 30785 +[gate how well the systems perform in the out-ofdomain setting. Table 5 summarizes the results. Each of the systems was trained on das-train and tested on a variety of test sets. As we can see, our @@ -1695,9 +1683,9 @@ this to the complexity of the YAGS dataset that contains a high proportion of verbs.] Paragraph sofa: _InitialView - begin: 30786 + begin: 30787 end: 31211 -[ Overall out-of-domain performance stays behind +[Overall out-of-domain performance stays behind the F1-agreement observed for the human annotators for TW and YAGS, which shows that there is a large margin for improvement. Corresponding @@ -1708,9 +1696,9 @@ analyse statistics on the errors made by the system variant NN+SentBOW.] Paragraph sofa: _InitialView - begin: 31211 + begin: 31212 end: 31613 -[ The system’s wrong predictions are affected by +[The system’s wrong predictions are affected by the lexicon in two ways. First, if the predicate is not listed in the lexicon (unknown), the system has to choose among all frames. As we have shown @@ -1725,9 +1713,9 @@ the lexicon, and by design has no means to select the right frame for a given predicate occurrence.] Paragraph sofa: _InitialView - begin: 31613 + begin: 31614 end: 32268 -[ The unlinked-predicate issue points to a +[The unlinked-predicate issue points to a major design flaw in the standard frameId architecture. Although choosing among frames defined in the lexicon provides a quality boost, it also @@ -1746,9 +1734,9 @@ among the frames available for the predicate, the misclassification is inevitable.] Paragraph sofa: _InitialView - begin: 32268 + begin: 32269 end: 33093 -[ A more detailed analysis of the errors made +[A more detailed analysis of the errors made by the system shows that the majority of false predictions for known and linked predicates are due to the domain differences in word usage. For @@ -1760,18 +1748,18 @@ following sentence: 12 minute measurement [window].”] Paragraph sofa: _InitialView - begin: 33093 + begin: 33094 end: 33519 -[ This problem is also relevant in generic WSD +[This problem is also relevant in generic WSD (Agirre et al., 2010) and benefits from the same solutions, for instance adapting embeddings to a particular domain (Taghipour and Ng, 2015) and efficient use of embeddings (Iacobacci et al., 2016) .] Paragraph sofa: _InitialView - begin: 33519 + begin: 33520 end: 33769 -[ Another major source of errors are subtle +[Another major source of errors are subtle syntactic and semantic differences between frames which are hard to resolve on the sentence level (e.g. distinguishing between Similarity and @@ -1785,9 +1773,9 @@ recent work in FrameNet SRL, see e.g. Hermann et al. (2014) and Roth and Lapata (2015).] Paragraph sofa: _InitialView - begin: 33769 + begin: 33770 end: 34246 -[ To further explore the impact of user-generated +[To further explore the impact of user-generated text, we applied word-processor spelling correction to YAGS and tested our systems on the corrected set. The results do not change significantly, which @@ -1798,15 +1786,14 @@ attributed to preprocessing problems at large scale. 6] Paragraph sofa: _InitialView - begin: 34246 + begin: 34247 end: 34653 -[ Discussion and outlook -] +[Discussion and outlook] Heading sofa: _InitialView - begin: 34653 - end: 34677 -[ Our analysis in Section 4 shows that domain + begin: 34654 + end: 34676 +[Our analysis in Section 4 shows that domain adaptation is mainly required for the frameId step of FrameNet SRL. Unlike in PropBank SRL, in FrameNet SRL there is no significant performance @@ -1816,9 +1803,9 @@ frame is lower, on average 10, which reduces the complexity of the roleId task.] Paragraph sofa: _InitialView - begin: 34677 + begin: 34678 end: 35044 -[ In Section 5 we introduced a simple, yet +[In Section 5 we introduced a simple, yet efficient frame identification method and evaluated it on in-domain and out-of-domain data. The method achieves competitive in-domain results, @@ -1829,9 +1816,9 @@ introduced no-lexicon evaluation setting, where no lexicon-based filtering is applied.] Paragraph sofa: _InitialView - begin: 35044 + begin: 35045 end: 35455 -[ We identified a major issue in the standard +[We identified a major issue in the standard frameId architecture: shifting to a new domain might render the predicate-frame associations in the FrameNet lexicon incomplete, which leads to @@ -1846,9 +1833,9 @@ the FrameNet lexicon could help, and we suggest investigating this line of research in future work.] Paragraph sofa: _InitialView - begin: 35455 + begin: 35456 end: 36095 -[ While our method achieves state-of-the-art +[While our method achieves state-of-the-art results on out-of-domain data, overall results are still significantly lower than the human performance observed for YAGS and TW, which shows that there is @@ -1862,9 +1849,9 @@ be further improved by adapting word representations to a new domain.] Paragraph sofa: _InitialView - begin: 36095 + begin: 36096 end: 36659 -[ A direct comparison to the Hermann-14 +[A direct comparison to the Hermann-14 system in the out-of-domain setup would shed some more light on the properties of the task affecting the out-of-domain performance. On the one hand, @@ -1878,24 +1865,23 @@ related work.6 7] Paragraph sofa: _InitialView - begin: 36659 + begin: 36660 end: 37154 -[ Conclusion -] +[Conclusion] Heading sofa: _InitialView - begin: 37154 - end: 37166 -[ Domain dependence is a well-known issue for + begin: 37155 + end: 37165 +[Domain dependence is a well-known issue for supervised NLP tasks such as FrameNet SRL. To the best of our knowledge, there is no recent study of the domain dependence of FrameNet SRL, also prohibited by a lack of appropriate datasets.] Paragraph sofa: _InitialView - begin: 37166 + begin: 37167 end: 37401 -[ To address this problem, we 1) present the first +[To address this problem, we 1) present the first comprehensive study of the domain generalization performance of the open-source Semafor system on several diverse benchmark sets. As a @@ -1915,21 +1901,19 @@ we publish the YAGS test set annotations and our frame identification system for research purposes.] Paragraph sofa: _InitialView - begin: 37401 + begin: 37402 end: 38290 -[ 6www.ukp.tu-darmstadt.de/ood-fn-srl -] +[6www.ukp.tu-darmstadt.de/ood-fn-srl] Heading sofa: _InitialView - begin: 38290 - end: 38327 -[ Acknowledgements -] + begin: 38291 + end: 38326 +[Acknowledgements] Heading sofa: _InitialView - begin: 38327 - end: 38345 -[ This work was supported by FAZIT-Stiftung and by + begin: 38328 + end: 38344 +[This work was supported by FAZIT-Stiftung and by the German Research Foundation (DFG) through grant GU 798/18-1 (QAEduInf) and the research training group “Adaptive Preparation of @@ -1942,12 +1926,12 @@ and the anonymous reviewers for their comments on earlier versions of this paper.] Paragraph sofa: _InitialView - begin: 38345 + begin: 38346 end: 38859 -[ Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics.] +[Eneko Agirre , Oier Lo´pez de Lacalle, Christiane Fellbaum, Shu-Kai Hsieh , Maurizio Tesconi, Monica Monachini, Piek Vossen, and Roxanne Segers . 2010 . SemEval-2010 Task 17 : All-Words Word Sense Disambiguation on a Specific Domain . InProceedings of the 5th International Workshop on Semantic Evaluation , pages 75 - 80 . Association for Computational Linguistics. Collin Baker , Michael Ellsworth , and Katrin Erk . 2007 . SemEval-2007 Task 19 : Frame Semantic Structure Extraction . In Proceedings of the Fourth International Workshop on Semantic Evaluations (SemEval2007) , pages 99 - 104 , Prague, Czech Republic, June. Association for Computational Linguistics. Jonathan Berant , Vivek Srikumar, Pei-Chun Chen , Abby Vander Linden, Brittany Harding, Brad Huang, Peter Clark , and Christopher D. Manning . 2014 . Modeling Biological Processes for Reading Comprehension . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1499 - 1510 , Doha, Qatar. Association for Computational Linguistics. John Blitzer , Ryan McDonald , and Fernando Pereira . 2006 . Domain adaptation with structural correspondence learning . In Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing , pages 120 - 128 , Sydney, Australia, July. Association for Computational Linguistics. Xavier Carreras and Llu´ıs Ma`rquez. 2005 . Introduction to the CoNLL-2005 shared task: Semantic role labeling . In Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005) , pages 152 - 164 , Ann Arbor, Michigan, June. Association for Computational Linguistics. Danilo Croce , Cristina Giannone, Paolo Annesi, and Roberto Basili . 2010 . Towards open-domain semantic role labeling . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 237 - 246 , Uppsala, Sweden, July. Association for Computational Linguistics. Dipanjan Das and Noah A. Smith . 2011 . SemiSupervised Frame-Semantic Parsing for Unknown Predicates . In Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies , pages 1435 - 1444 , Portland, Oregon, USA. Dipanjan Das , Desai Chen , Andre´ F. T. Martins , Nathan Schneider , and Noah A. Smith . 2014 . Frame-semantic parsing . Computational Linguistics , 40 ( 1 ): 9 - 56 . Hal Daume ´III. 2007 . Frustratingly easy domain adaptation . In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics , pages 256 - 263 , Prague, Czech Republic, June. Association for Computational Linguistics. Katrin Erk and Sebastian Pado´. 2006 . SHALMANESER - A Toolchain For Shallow Semantic Parsing . In Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC 2006 ), volume 6 , pages 527 - 532 , Genoa, Italy. ELRA. Charles J. Fillmore , Christopher R. Johnson , and Miriam R.L. Petruck . 2003 . Background to FrameNet. International journal of lexicography , 16 ( 3 ): 235 - 250 . Nicholas FitzGerald , Oscar Ta¨ckstro¨m, Kuzman Ganchev, and Dipanjan Das . 2015 . Semantic role labeling with neural network factors . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 960 - 970 , Lisbon, Portugal, September. Association for Computational Linguistics. Jan Hajicˇ , Massimiliano Ciaramita, Richard Johansson, Daisuke Kawahara, Maria Anto`nia Mart´ı, Llu´ıs Ma`rquez, Adam Meyers, Joakim Nivre, Sebastian Pado´, Jan Sˇ teˇpa´nek, Pavel Stranˇa´k, Mihai Surdeanu, Nianwen Xue, and Yi Zhang . 2009 . The conll2009 shared task: Syntactic and semantic dependencies in multiple languages . In Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL 2009 ): Shared Task, pages 1 - 18 , Boulder, Colorado, June. Association for Computational Linguistics. Karl Moritz Hermann , Dipanjan Das , Jason Weston , and Kuzman Ganchev . 2014 . Semantic frame identification with distributed word representations . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1448 - 1458 , Baltimore, Maryland, June. Association for Computational Linguistics. Fei Huang and Alexander Yates . 2010 . Open-domain semantic role labeling by modeling word spans . In Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics , pages 968 - 978 , Uppsala, Sweden, July. Association for Computational Linguistics. Ignacio Iacobacci , Mohammad Taher Pilehvar, and Roberto Navigli . 2016 . Embeddings for Word Sense Disambiguation: An Evaluation Study . In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 897 - 907 , Berlin, Germany, August. Association for Computational Linguistics. Anders Johannsen , He´ctor Mart´ınez Alonso, and Anders Søgaard . 2015 . Any-language frame-semantic parsing . In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 2062 - 2066 , Lisbon, Portugal, September. Association for Computational Linguistics. Richard Johansson and Pierre Nugues . 2008 . The effect of syntactic representation on semantic role labeling . In Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008 ), pages 393 - 400 , Manchester, UK , August . Coling 2008 Organizing Committee . Meghana Kshirsagar , Sam Thomson, Nathan Schneider, Jaime Carbonell, Noah A. Smith , and Chris Dyer . 2015 . Frame-semantic role labeling with heterogeneous annotations . In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers) , pages 218 - 224 , Beijing, China, July. Association for Computational Linguistics. Maciej Kula . 2015 . Metadata embeddings for user and item cold-start recommendations . In Toine Bogers and Marijn Koolen , editors, Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems co-located with 9th ACM Conference on Recommender Systems (RecSys 2015 ), volume 1448 of CEUR Workshop Proceedings , pages 14 - 21 , Vienna, Austria, September. CEUR-WS.org. Omer Levy and Yoav Goldberg . 2014 . Dependencybased word embeddings . In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics , ACL 2014 , June 22-27, 2014 , Baltimore, MD , USA, Volume 2 : Short Papers , pages 302 - 308 . The Association for Computer Linguistics. Oren Melamud , Jacob Goldberger , and Ido Dagan . 2016 . context2vec: Learning generic context embedding with bidirectional LSTM . In Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning , CoNLL 2016 , Berlin, Germany, August 11-12 , 2016 , pages 51 - 61 . Tomas Mikolov , Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean . 2013 . Distributed Representations of Words and Phrases and Their Compositionality . In Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS '13) , pages 3111 - 3119 , Lake Tahoe , Nevada, USA. Alexis Palmer and Caroline Sporleder . 2010 . Evaluating FrameNet-style semantic parsing: the role of coverage gaps in FrameNet . In Proceedings of the 23rd International Conference on Computational Linguistics: Posters , pages 928 - 936 , Beijing, China, August . Rebecca J. Passonneau , Collin F. Baker , Christiane Fellbaum, and Nancy Ide . 2012 . The MASC Word Sense Corpus . In Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12) , pages 3025 - 3030 , Istanbul, Turkey. Jeffrey Pennington , Richard Socher, and Christopher Manning . 2014 . Glove: Global vectors for word representation . In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 1532 - 1543 , Doha, Qatar, October. Association for Computational Linguistics. Michael Roth and Mirella Lapata . 2015 . Contextaware frame-semantic role labeling . Transactions of the Association for Computational Linguistics , 3 : 449 - 460 . Josef Ruppenhofer , Michael Ellsworth, Miriam R. L. Petruck , Christopher R. Johnson , and Jan Scheffczyk . 2010 . FrameNet II: Extended Theory and Practice . Technical report , ICSI, University of California, Berkeley. Anders Søgaard , Barbara Plank, and H e´ctor Mart´ınez Alonso. 2015 . Using Frame Semantics for Knowledge Extraction from Twitter . In Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence , pages 2447 - 2452 , Austin, Texas, USA. Anders Søgaard . 2013 . Semi-supervised learning and domain adaptation in natural language processing . Synthesis Lectures on Human Language Technologies , 6 ( 2 ): 1 - 103 . Mihai Surdeanu , Richard Johansson, Adam Meyers, Llu´ıs Ma`rquez, and Joakim Nivre . 2008 . The conll 2008 shared task on joint parsing of syntactic and semantic dependencies . In CoNLL 2008: Proceedings of the Twelfth Conference on Computational Natural Language Learning , pages 159 - 177 , Manchester, England, August. Coling 2008 Organizing Committee . Mihai Surdeanu , Massimiliano Ciaramita, and Hugo Zaragoza . 2011 . Learning to rank answers to nonfactoid questions from web collections . Computational Linguistics , 37 ( 2 ): 351 - 383 . Kaveh Taghipour and Hwee Tou Ng. 2015 . SemiSupervised Word Sense Disambiguation Using Word Embeddings in General and Specific Domains . In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 314 - 323 , Denver, Colorado, May-June. Association for Computational Linguistics . Jason Weston , Samy Bengio, and Nicolas Usunier . 2011 . WSABIE: Scaling Up to Large Vocabulary Image Annotation . In Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence - Volume Volume Three , IJCAI'11 , pages 2764 - 2770 , Barcelona, Catalonia, Spain. AAAI Press. Haitong Yang , Tao Zhuang , and Chengqing Zong . 2015 . Domain adaptation for syntactic and semantic dependency parsing using deep belief networks . Transactions of the Association for Computational Linguistics , 3 : 271 - 282 . Seid Muhie Yimam , Richard Eckart de Castilho, Iryna Gurevych, and Chris Biemann . 2014 . Automatic Annotation Suggestions and Custom Annotation Layers in WebAnno . In Kalina Bontcheva and Zhu Jingbo, editors, Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics. System Demonstrations , pages 91 - 96 , Stroudsburg, PA 18360 , USA. Association for Computational Linguistics.] Paragraph sofa: _InitialView - begin: 38859 + begin: 38861 end: 50111 -------- View _InitialView end ---------------------------------- diff --git a/dkpro-core-io-combination-asl/pom.xml b/dkpro-core-io-combination-asl/pom.xml index 47abf3bfc7..e6e238e719 100644 --- a/dkpro-core-io-combination-asl/pom.xml +++ b/dkpro-core-io-combination-asl/pom.xml @@ -1,53 +1,75 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.io.combination-asl - DKPro Core ASL - IO - Combination - jar - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimafit-core - - - commons-io - commons-io - - - junit - junit - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl - test - - + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-io-combination-asl + DKPro Core ASL - IO - Combination + https://dkpro.github.io/dkpro-core/ + jar + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + commons-io + commons-io + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + junit + junit + test + + + org.dkpro.core + dkpro-core-io-text-asl + test + + + + + + eu.openminted.share.annotations + omtd-share-annotations-maven-plugin + + + + **/*.xml + + + + + \ No newline at end of file diff --git a/dkpro-core-io-combination-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/combination/CombinationReader.java b/dkpro-core-io-combination-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/combination/CombinationReader.java deleted file mode 100644 index a805e984f8..0000000000 --- a/dkpro-core-io-combination-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/combination/CombinationReader.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.combination; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.io.output.FileWriterWithEncoding; -import org.apache.uima.UIMAException; -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.component.CasCollectionReader_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.factory.CollectionReaderFactory; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.xml.sax.SAXException; - -/** - * Combines multiple readers into a single reader. - */ -@ResourceMetaData(name="Combining Meta-Reader") -public class CombinationReader - extends CasCollectionReader_ImplBase -{ - - public static final String PARAM_READERS = "readers"; - @ConfigurationParameter(name = PARAM_READERS, mandatory = true) - private String[] readerFiles; - - private int readerIdx = 0; - private CollectionReader currentReader = null; - - private List readers; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - readers = new ArrayList<>(); - - for (String readerFile : readerFiles) { - try { - readers.add(CollectionReaderFactory.createReaderFromPath(readerFile)); - } catch (UIMAException e) { - throw new ResourceInitializationException(e); - } catch (IOException e) { - throw new ResourceInitializationException(e); - } - } - } - - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - currentReader.getNext(aCAS); - } - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - try { - currentReader = getReader(); - - boolean hasNext = currentReader.hasNext(); - if (hasNext) { - return true; - } - currentReader = moreReadersToReadFrom(); - return continueIfCurrentReaderIsNotNull(); - - } - catch (Exception e) { - throw new IOException(e); - } - } - - private boolean continueIfCurrentReaderIsNotNull() throws Exception - { - if (currentReader == null){ - return false; - } - return currentReader.hasNext(); - } - - private CollectionReader moreReadersToReadFrom() - throws Exception - { - if (readerIdx + 1 < readers.size()) { - // close the empty-read reader - currentReader.close(); - - readerIdx++; - return readers.get(readerIdx); - } - return null; - } - - private CollectionReader getReader() - throws UIMAException, IOException - { - return readers.get(readerIdx); - } - - @Override - public Progress[] getProgress() - { - return currentReader.getProgress(); - } - - public static File descriptionToFile(CollectionReaderDescription desc) - throws IOException, SAXException - { - File tempFile = File.createTempFile("combReader", "desc"); - FileWriterWithEncoding writer = new FileWriterWithEncoding(tempFile, "UTF-8"); - desc.toXML(writer); - - return tempFile; - } -} \ No newline at end of file diff --git a/dkpro-core-io-combination-asl/src/main/java/org/dkpro/core/io/combination/CombinationReader.java b/dkpro-core-io-combination-asl/src/main/java/org/dkpro/core/io/combination/CombinationReader.java new file mode 100644 index 0000000000..8d00a9839a --- /dev/null +++ b/dkpro-core-io-combination-asl/src/main/java/org/dkpro/core/io/combination/CombinationReader.java @@ -0,0 +1,153 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.combination; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.io.output.FileWriterWithEncoding; +import org.apache.uima.UIMAException; +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.component.CasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.xml.sax.SAXException; + +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Combines multiple readers into a single reader. + */ +@Component(value = OperationType.READER) +@ResourceMetaData(name = "Combining Meta-Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +public class CombinationReader + extends CasCollectionReader_ImplBase +{ + /** + * Locations of UIMA reader description files. + */ + public static final String PARAM_READERS = "readers"; + @ConfigurationParameter(name = PARAM_READERS, mandatory = true) + private String[] readerFiles; + + private int readerIdx = 0; + private CollectionReader currentReader = null; + + private List readers; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + readers = new ArrayList<>(); + + for (String readerFile : readerFiles) { + try { + readers.add(CollectionReaderFactory.createReaderFromPath(readerFile)); + } catch (UIMAException e) { + throw new ResourceInitializationException(e); + } catch (IOException e) { + throw new ResourceInitializationException(e); + } + } + } + + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + currentReader.getNext(aCAS); + } + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + try { + currentReader = getReader(); + + boolean hasNext = currentReader.hasNext(); + if (hasNext) { + return true; + } + currentReader = moreReadersToReadFrom(); + return continueIfCurrentReaderIsNotNull(); + + } + catch (Exception e) { + throw new IOException(e); + } + } + + private boolean continueIfCurrentReaderIsNotNull() throws Exception + { + if (currentReader == null) { + return false; + } + return currentReader.hasNext(); + } + + private CollectionReader moreReadersToReadFrom() + throws Exception + { + if (readerIdx + 1 < readers.size()) { + // close the empty-read reader + currentReader.close(); + + readerIdx++; + return readers.get(readerIdx); + } + return null; + } + + private CollectionReader getReader() + throws UIMAException, IOException + { + return readers.get(readerIdx); + } + + @Override + public Progress[] getProgress() + { + return currentReader.getProgress(); + } + + public static File descriptionToFile(CollectionReaderDescription desc) + throws IOException, SAXException + { + File tempFile = File.createTempFile("combReader", "desc"); + FileWriterWithEncoding writer = new FileWriterWithEncoding(tempFile, "UTF-8"); + desc.toXML(writer); + + return tempFile; + } +} diff --git a/dkpro-core-io-combination-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/combination/CombinationReaderTest.java b/dkpro-core-io-combination-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/combination/CombinationReaderTest.java deleted file mode 100644 index c7c0e8280f..0000000000 --- a/dkpro-core-io-combination-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/combination/CombinationReaderTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.combination; - -import static org.junit.Assert.*; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.factory.CollectionReaderFactory; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; - -public class CombinationReaderTest { - - @Test - public void combinationReaderTest() - throws Exception - { - List readerFiles = new ArrayList<>(); - readerFiles.add( - CombinationReader.descriptionToFile(CollectionReaderFactory.createReaderDescription( - TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts/a/*.txt")) - ); - readerFiles.add( - CombinationReader.descriptionToFile(CollectionReaderFactory.createReaderDescription( - TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts/b/*.txt")) - ); - - CollectionReaderDescription combinationReader = CollectionReaderFactory.createReaderDescription( - CombinationReader.class, - CombinationReader.PARAM_READERS, readerFiles.toArray() - ); - - int i=0; - for (JCas jcas : new JCasIterable(combinationReader)) { - i++; - System.out.println(jcas.getDocumentText()); - } - assertEquals(4, i); - } -} diff --git a/dkpro-core-io-combination-asl/src/test/java/org/dkpro/core/io/combination/CombinationReaderTest.java b/dkpro-core-io-combination-asl/src/test/java/org/dkpro/core/io/combination/CombinationReaderTest.java new file mode 100644 index 0000000000..73274796d6 --- /dev/null +++ b/dkpro-core-io-combination-asl/src/test/java/org/dkpro/core/io/combination/CombinationReaderTest.java @@ -0,0 +1,65 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.combination; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.combination.CombinationReader; +import org.dkpro.core.io.text.TextReader; +import org.junit.Test; + +public class CombinationReaderTest { + + @Test + public void combinationReaderTest() + throws Exception + { + List readerFiles = new ArrayList<>(); + readerFiles.add( + CombinationReader.descriptionToFile(CollectionReaderFactory.createReaderDescription( + TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts/a/*.txt")) + ); + readerFiles.add( + CombinationReader.descriptionToFile(CollectionReaderFactory.createReaderDescription( + TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts/b/*.txt")) + ); + + CollectionReaderDescription combinationReader = createReaderDescription( + CombinationReader.class, + CombinationReader.PARAM_READERS, readerFiles.toArray() + ); + + int i = 0; + for (JCas jcas : new JCasIterable(combinationReader)) { + i++; + System.out.println(jcas.getDocumentText()); + } + assertEquals(4, i); + } +} diff --git a/dkpro-core-io-conll-asl/pom.xml b/dkpro-core-io-conll-asl/pom.xml index 4650b41be9..94cd28d1bc 100644 --- a/dkpro-core-io-conll-asl/pom.xml +++ b/dkpro-core-io-conll-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.conll-asl + dkpro-core-io-conll-asl jar DKPro Core ASL - IO - CoNLL + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,52 +45,56 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl it.unimi.dsi fastutil - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.penntree-asl + org.dkpro.core + dkpro-core-io-penntree-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-ner-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.semantics-asl + org.dkpro.core + dkpro-core-api-semantics-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.coref-asl + org.dkpro.core + dkpro-core-api-coref-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -97,8 +102,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.assertj + assertj-core + test + + + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Reader.java b/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Reader.java deleted file mode 100644 index 7f84d4c20f..0000000000 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Reader.java +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.IobDecoder; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - *

Reads by default the CoNLL 2002 named entity format.

- * - *

The reader is also compatible with the CoNLL-based GermEval 2014 named entity format, - * in which the columns are separated by a tab, and there is an extra column for embedded named entities, - * besides the token number being put in the first column (see below). - * For that, additional parameters are provided, by which one can determine the column separator, - * whether there is an additional first column for token numbers, and whether embedded - * named entities should be read. - * (Note: Currently, the reader only reads the outer named entities, not the embedded ones.

- * - *

- * The following snippet shows an example of the TSV format 
- * # http://de.wikipedia.org/wiki/Manfred_Korfmann [2009-10-17]
- * 1  Aufgrund          O           O
- * 2  seiner            O           O
- * 3  Initiative        O           O
- * 4  fand              O           O
- * 5  2001/2002         O           O
- * 6  in                O           O
- * 7  Stuttgart         B-LOC       O
- * 8  ,                 O           O
- * 9  Braunschweig      B-LOC       O
- * 10 und               O           O
- * 11 Bonn              B-LOC       O
- * 12 eine              O           O
- * 13 große             O           O
- * 14 und               O           O
- * 15 publizistisch     O           O
- * 16 vielbeachtete     O           O
- * 17 Troia-Ausstellung B-LOCpart   O
- * 18 statt             O           O
- * 19 ,                 O           O
- * 20 „                 O           O
- * 21 Troia             B-OTH       B-LOC
- * 22 -                 I-OTH       O
- * 23 Traum             I-OTH       O
- * 24 und               I-OTH       O
- * 25 Wirklichkeit      I-OTH       O
- * 26 “                 O           O
- * 27 .                 O           O
- * 
- * - *
    - *
  1. WORD_NUMBER - token number
  2. - *
  3. FORM - token
  4. - *
  5. NER1 - outer named entity (BIO encoded)
  6. - *
  7. NER2 - embedded named entity (BIO encoded)
  8. - *
- - * The sentence is encoded as one token per line, with information provided in tab-separated columns. - * The first column contains either a #, which signals the source the sentence is cited from and the date it was retrieved, - * or the token number within the sentence. The second column contains the token. - * Name spans are encoded in the BIO-scheme. Outer spans are encoded in the third column, - * embedded spans in the fourth column. - * - * @see CoNLL 2002 shared task - * @see GermEval 2014 NER task - */ -@ResourceMetaData(name="CoNLL 2002 Reader") -@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2002, MimeTypes.TEXT_X_GERMEVAL_2014}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"}) -public class Conll2002Reader - extends JCasResourceCollectionReader_ImplBase -{ - - /** - * - * Column Separators - * - */ - public enum ColumnSeparators - { - SPACE("space", " "), - TAB("tab", "\t"), - INVALID("", ""); - - private String name; - private String value; - - private ColumnSeparators(String aName, String aValue) - { - name = aName; - value = aValue; - } - - public String getName() - { - return name; - } - - private String getValue() - { - return value; - } - - private static ColumnSeparators getInstance(String Name) { - for (ColumnSeparators cs : ColumnSeparators.values()) { - if (Name.equals(cs.getName())) { - return cs; - } - } - return INVALID; - } - } - - /** - * Column separator - */ - - ColumnSeparators columnSeparator; - - /** - * Column positions - */ - private int FORM = 0; - private int IOB = 1; - - /** - * Column separator parameter. Acceptable input values come from {@link ColumnSeparators}.
- * Example usage: if you want to define 'tab' as the column separator the following value should be input for - * this parameter {@code Conll2002Reader.ColumnSeparators.TAB.getName()} - */ - public static final String PARAM_COLUMN_SEPARATOR = "columnSeparator"; - @ConfigurationParameter(name = PARAM_COLUMN_SEPARATOR, mandatory = false, defaultValue = "space") - private String columnSeparatorName; - - /** - * Token number flag. When true, the first column contains the token number - * inside the sentence (as in GermEval 2014 format) - */ - public static final String PARAM_HAS_TOKEN_NUMBER = "hasTokenNumber"; - @ConfigurationParameter(name = PARAM_HAS_TOKEN_NUMBER, mandatory = false, defaultValue = "false") - private boolean hasTokenNumber; - - /** - * Indicates that there is a header line before the sentence - */ - public static final String PARAM_HAS_HEADER = "hasHeader"; - @ConfigurationParameter(name = PARAM_HAS_HEADER, mandatory = false, defaultValue = "false") - private boolean hasHeader; - - /** - * Character encoding of the input data. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String sourceEncoding; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spamming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Read named entity information. - * - * Default: {@code true} - */ - public static final String PARAM_READ_NAMED_ENTITY = ComponentParameters.PARAM_READ_NAMED_ENTITY; - @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") - private boolean namedEntityEnabled; - - /** - * Has embedded named entity extra column. - * - * Default: {@code false} - */ - public static final String PARAM_HAS_EMBEDDED_NAMED_ENTITY = "hasEmbeddedNamedEntity"; - @ConfigurationParameter(name = PARAM_HAS_EMBEDDED_NAMED_ENTITY, mandatory = false, defaultValue = "false") - private boolean hasEmbeddedNamedEntity; - - /** - * Location of the mapping file for named entity tags to UIMA types. - */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) - private String namedEntityMappingLocation; - - private MappingProvider namedEntityMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - namedEntityMappingProvider = new MappingProvider(); - namedEntityMappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/there/is/no/mapping/yet"); - namedEntityMappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName()); - namedEntityMappingProvider.setOverride(MappingProvider.LOCATION, namedEntityMappingLocation); - namedEntityMappingProvider.setOverride(MappingProvider.LANGUAGE, getLanguage()); - - // Configure column positions. First column may be used for token number - FORM = hasTokenNumber?1:0; - IOB = hasTokenNumber?2:1; - - // Configure column separator - columnSeparator = ColumnSeparators.getInstance(columnSeparatorName); - - if (columnSeparator == ColumnSeparators.INVALID) { - Object[] params = {columnSeparatorName, PARAM_COLUMN_SEPARATOR}; - throw new ResourceInitializationException( - ResourceInitializationException.RESOURCE_DATA_NOT_VALID, params); - } - - } - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - try{ - if (namedEntityEnabled) { - namedEntityMappingProvider.configure(aJCas.getCas()); - } - } - catch(AnalysisEngineProcessException e){ - throw new IOException(e); - } - - Resource res = nextFile(); - initCas(aJCas, res); - BufferedReader reader = null; - try { - reader = new BufferedReader(new InputStreamReader( - CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), - sourceEncoding)); - convert(aJCas, reader); - } - finally { - closeQuietly(reader); - } - } - - private void convert(JCas aJCas, BufferedReader aReader) - throws IOException - { - JCasBuilder doc = new JCasBuilder(aJCas); - - Type namedEntityType = JCasUtil.getType(aJCas, NamedEntity.class); - Feature namedEntityValue = namedEntityType.getFeatureByBaseName("value"); - IobDecoder decoder = new IobDecoder(aJCas.getCas(), namedEntityValue, namedEntityMappingProvider); - decoder.setInternTags(internTags); - - List words; - while ((words = readSentence(aReader)) != null) { - if (words.isEmpty()) { - continue; - } - - int sentenceBegin = doc.getPosition(); - int sentenceEnd = sentenceBegin; - - List tokens = new ArrayList(); - String[] namedEntityTags = new String[words.size()]; - - // Tokens, POS - int i = 0; - Iterator wordIterator = words.iterator(); - while (wordIterator.hasNext()) { - String[] word = wordIterator.next(); - - // Read token - Token token = doc.add(word[FORM], Token.class); - sentenceEnd = token.getEnd(); - if (wordIterator.hasNext()) { - doc.add(" "); - } - - tokens.add(token); - namedEntityTags[i] = word[IOB]; - i++; - } - - if (namedEntityEnabled) { - decoder.decode(tokens, namedEntityTags); - } - - // Sentence - Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); - sentence.addToIndexes(); - - // Once sentence per line. - doc.add("\n"); - } - - doc.close(); - } - - /** - * Read a single sentence. - */ - private List readSentence(BufferedReader aReader) - throws IOException - { - List words = new ArrayList(); - String line; - boolean beginSentence = true; - - while ((line = aReader.readLine()) != null) { - if (StringUtils.isBlank(line)) { - beginSentence = true; - break; // End of sentence - } - - if (hasHeader && beginSentence) { - // Ignore header line - beginSentence = false; - continue; - } - - String[] fields = line.split(columnSeparator.getValue()); - - if (!hasEmbeddedNamedEntity && fields.length != 2 + FORM) { - throw new IOException(String.format( - "Invalid file format. Line needs to have %d %s-separated fields: [%s]", 2 + FORM, - columnSeparator.getName(), line)); - } - else if (hasEmbeddedNamedEntity && fields.length != 3 + FORM) { - throw new IOException(String.format( - "Invalid file format. Line needs to have %d %s-separated fields: [%s]", 3 + FORM, - columnSeparator.getName(), line)); - } - words.add(fields); - } - - if (line == null && words.isEmpty()) { - return null; - } - else { - return words; - } - } -} diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReader.java b/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReader.java deleted file mode 100644 index 1db8fd3168..0000000000 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReader.java +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; -import it.unimi.dsi.fastutil.ints.Int2ObjectMap; -import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; - -/** - * Reads a file in the CoNLL-U format. - * - * @see CoNLL-U Format - */ -@ResourceMetaData(name="CoNLL-U Reader") -@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_U}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class ConllUReader - extends JCasResourceCollectionReader_ImplBase -{ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String sourceEncoding; - - public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; - @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") - private boolean readPos; - - public static final String PARAM_READ_CPOS = ComponentParameters.PARAM_READ_CPOS; - @ConfigurationParameter(name = PARAM_READ_CPOS, mandatory = true, defaultValue = "true") - private boolean readCPos; - - public static final String PARAM_USE_CPOS_AS_POS = "useCPosAsPos"; - @ConfigurationParameter(name = PARAM_USE_CPOS_AS_POS, mandatory = true, defaultValue = "false") - private boolean useCPosAsPos; - - /** - * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. This can be useful if a custom model is - * specified which does not have such meta data, or it can be used in readers. - */ - public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; - @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) - protected String posTagset; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - public static final String PARAM_READ_MORPH = ComponentParameters.PARAM_READ_MORPH; - @ConfigurationParameter(name = PARAM_READ_MORPH, mandatory = true, defaultValue = "true") - private boolean readMorph; - - public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; - @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") - private boolean readLemma; - - public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; - @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") - private boolean readDependency; - - private static final String UNUSED = "_"; - - private static final int ID = 0; - private static final int FORM = 1; - private static final int LEMMA = 2; - private static final int CPOSTAG = 3; - private static final int POSTAG = 4; - private static final int FEATS = 5; - private static final int HEAD = 6; - private static final int DEPREL = 7; - private static final int DEPS = 8; - private static final int MISC = 9; - - private MappingProvider posMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); - } - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - Resource res = nextFile(); - initCas(aJCas, res); - BufferedReader reader = null; - try { - reader = new BufferedReader(new InputStreamReader( - CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), - sourceEncoding)); - convert(aJCas, reader); - } - finally { - closeQuietly(reader); - } - } - - public void convert(JCas aJCas, BufferedReader aReader) - throws IOException - { - if (readPos) { - try{ - posMappingProvider.configure(aJCas.getCas()); - } - catch(AnalysisEngineProcessException e){ - throw new IOException(e); - } - } - - JCasBuilder doc = new JCasBuilder(aJCas); - - List words; - while ((words = readSentence(aReader)) != null) { - if (words.isEmpty()) { - // Ignore empty sentences. This can happen when there are multiple end-of-sentence - // markers following each other. - continue; - } - - int sentenceBegin = doc.getPosition(); - int sentenceEnd = sentenceBegin; - - int surfaceBegin = -1; - int surfaceEnd = -1; - String surfaceString = null; - - // Tokens, Lemma, POS - Int2ObjectMap tokens = new Int2ObjectOpenHashMap<>(); - Iterator wordIterator = words.iterator(); - while (wordIterator.hasNext()) { - String[] word = wordIterator.next(); - if (word[ID].contains("-")) { - String[] fragments = word[ID].split("-"); - surfaceBegin = Integer.valueOf(fragments[0]); - surfaceEnd = Integer.valueOf(fragments[1]); - surfaceString = word[FORM]; - continue; - } - - // Read token - int tokenIdx = Integer.valueOf(word[ID]); - Token token = doc.add(word[FORM], Token.class); - tokens.put(tokenIdx, token); - if (!StringUtils.contains(word[MISC], "SpaceAfter=No") && wordIterator.hasNext()) { - doc.add(" "); - } - - // Read lemma - if (!UNUSED.equals(word[LEMMA]) && readLemma) { - Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(word[LEMMA]); - lemma.addToIndexes(); - token.setLemma(lemma); - } - - // Read part-of-speech tag - POS pos = null; - String tag = useCPosAsPos ? word[CPOSTAG] : word[POSTAG]; - if (!UNUSED.equals(tag) && readPos) { - Type posTag = posMappingProvider.getTagType(tag); - pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), - token.getEnd()); - pos.setPosValue(tag.intern()); - } - - // Read coarse part-of-speech tag - if (!UNUSED.equals(word[CPOSTAG]) && readCPos && pos != null) { - pos.setCoarseValue(word[CPOSTAG].intern()); - } - - if (pos != null) { - pos.addToIndexes(); - token.setPos(pos); - } - - // Read morphological features - if (!UNUSED.equals(word[FEATS]) && readMorph) { - MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, - token.getBegin(), token.getEnd()); - morphtag.setValue(word[FEATS]); - morphtag.addToIndexes(); - token.setMorph(morphtag); - - // Try parsing out individual feature values. Since the DKPro Core - // MorphologicalFeatures type is based on the definition from the UD project, - // we can do this rather straightforwardly. - Type morphType = morphtag.getType(); - String[] items = word[FEATS].split("\\|"); - for (String item : items) { - String[] keyValue = item.split("="); - StringBuilder key = new StringBuilder(keyValue[0]); - key.setCharAt(0, Character.toLowerCase(key.charAt(0))); - String value = keyValue[1]; - - Feature feat = morphType.getFeatureByBaseName(key.toString()); - if (feat != null) { - morphtag.setStringValue(feat, value); - } - } - } - - // Read surface form - if (tokenIdx == surfaceEnd) { - int begin = tokens.get(surfaceBegin).getBegin(); - int end = tokens.get(surfaceEnd).getEnd(); - SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end); - surfaceForm.setValue(surfaceString); - surfaceForm.addToIndexes(); - surfaceBegin = -1; - surfaceEnd = -1; - surfaceString = null; - } - - sentenceEnd = token.getEnd(); - } - - // Dependencies - if (readDependency) { - for (String[] word : words) { - if (!UNUSED.equals(word[DEPREL])) { - int depId = Integer.valueOf(word[ID]); - int govId = Integer.valueOf(word[HEAD]); - - // Model the root as a loop onto itself - makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC, - tokens, word); - } - - if (!UNUSED.equals(word[DEPS])) { - // list items separated by vertical bar - String[] items = word[DEPS].split("\\|"); - for (String item : items) { - String[] sItem = item.split(":"); - - int depId = Integer.valueOf(word[ID]); - int govId = Integer.valueOf(sItem[0]); - - makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED, - tokens, word); - } - } - } - } - - // Sentence - Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); - sentence.addToIndexes(); - - // Once sentence per line. - doc.add("\n"); - } - - doc.close(); - } - - private Dependency makeDependency(JCas aJCas, int govId, int depId, String label, String flavor, - Int2ObjectMap tokens, String[] word) - { - Dependency rel; - - if (govId == 0) { - rel = new ROOT(aJCas); - rel.setGovernor(tokens.get(depId)); - rel.setDependent(tokens.get(depId)); - } - else { - rel = new Dependency(aJCas); - rel.setGovernor(tokens.get(govId)); - rel.setDependent(tokens.get(depId)); - } - - rel.setDependencyType(label); - rel.setFlavor(flavor); - rel.setBegin(rel.getDependent().getBegin()); - rel.setEnd(rel.getDependent().getEnd()); - rel.addToIndexes(); - - return rel; - } - - /** - * Read a single sentence. - */ - private static List readSentence(BufferedReader aReader) - throws IOException - { - List words = new ArrayList(); - String line; - while ((line = aReader.readLine()) != null) { - if (StringUtils.isBlank(line)) { - break; // End of sentence - } - if (line.startsWith("#")) { - // Comment line - continue; - } - String[] fields = line.split("\t"); - if (fields.length != 10) { - throw new IOException( - "Invalid file format. Line needs to have 10 tab-separated fields, but it has " - + fields.length + ": [" + line + "]"); - } - words.add(fields); - } - - if (line == null && words.isEmpty()) { - return null; - } - else { - return words; - } - } -} diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUWriter.java b/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUWriter.java deleted file mode 100644 index 8898d5dfff..0000000000 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUWriter.java +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; - -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import it.unimi.dsi.fastutil.ints.Int2ObjectMap; -import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; - -/** - * Writes a file in the CoNLL-U format. - * - * @see CoNLL-U Format - */ -@ResourceMetaData(name="CoNLL-U Writer") -@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_U}) -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class ConllUWriter - extends JCasFileWriter_ImplBase -{ - private static final String UNUSED = "_"; - private static final int UNUSED_INT = -1; - - /** - * Character encoding of the output data. - */ - public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String targetEncoding; - - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") - private String filenameSuffix; - - public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; - @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") - private boolean writePos; - - public static final String PARAM_WRITE_CPOS = ComponentParameters.PARAM_WRITE_CPOS; - @ConfigurationParameter(name = PARAM_WRITE_CPOS, mandatory = true, defaultValue = "true") - private boolean writeCPos; - - public static final String PARAM_WRITE_MORPH = ComponentParameters.PARAM_WRITE_MORPH; - @ConfigurationParameter(name = PARAM_WRITE_MORPH, mandatory = true, defaultValue = "true") - private boolean writeMorph; - - public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; - @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") - private boolean writeLemma; - - public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; - @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") - private boolean writeDependency; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - PrintWriter out = null; - try { - out = new PrintWriter(new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), - targetEncoding)); - convert(aJCas, out); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - finally { - closeQuietly(out); - } - } - - private void convert(JCas aJCas, PrintWriter aOut) - { - Map> surfaceIdx = indexCovered(aJCas, SurfaceForm.class, - Token.class); - Int2ObjectMap surfaceBeginIdx = new Int2ObjectOpenHashMap<>(); - for (SurfaceForm sf : select(aJCas, SurfaceForm.class)) { - surfaceBeginIdx.put(sf.getBegin(), sf); - } - - - for (Sentence sentence : select(aJCas, Sentence.class)) { - HashMap ctokens = new LinkedHashMap(); - - // Tokens - List tokens = selectCovered(Token.class, sentence); - - for (int i = 0; i < tokens.size(); i++) { - Row row = new Row(); - row.id = i+1; - row.token = tokens.get(i); - row.noSpaceAfter = (i+1 < tokens.size()) && row.token.getEnd() == tokens.get(i+1).getBegin(); - ctokens.put(row.token, row); - } - - // Dependencies - for (Dependency rel : selectCovered(Dependency.class, sentence)) { - if (StringUtils.isBlank(rel.getFlavor()) || DependencyFlavor.BASIC.equals(rel.getFlavor())) { - ctokens.get(rel.getDependent()).deprel = rel; - } - else { - ctokens.get(rel.getDependent()).deps.add(rel); - } - } - - // Write sentence in CONLL-U format - for (Row row : ctokens.values()) { - String lemma = UNUSED; - if (writeLemma && (row.token.getLemma() != null)) { - lemma = row.token.getLemma().getValue(); - } - - String pos = UNUSED; - if (writePos && (row.token.getPos() != null)) { - POS posAnno = row.token.getPos(); - pos = posAnno.getPosValue(); - } - - String cpos = UNUSED; - if (writeCPos && (row.token.getPos() != null) - && row.token.getPos().getCoarseValue() != null) { - POS posAnno = row.token.getPos(); - cpos = posAnno.getCoarseValue(); - } - - int headId = UNUSED_INT; - String deprel = UNUSED; - String deps = UNUSED; - if (writeDependency) { - if ((row.deprel != null)) { - deprel = row.deprel.getDependencyType(); - headId = ctokens.get(row.deprel.getGovernor()).id; - if (headId == row.id) { - // ROOT dependencies may be modeled as a loop, ignore these. - headId = 0; - } - } - - StringBuilder depsBuf = new StringBuilder(); - for (Dependency d : row.deps) { - if (depsBuf.length() > 0) { - depsBuf.append('|'); - } - // Resolve self-looping root to 0-indexed root - int govId = ctokens.get(d.getGovernor()).id; - if (govId == row.id) { - govId = 0; - } - depsBuf.append(govId); - depsBuf.append(':'); - depsBuf.append(d.getDependencyType()); - } - if (depsBuf.length() > 0) { - deps = depsBuf.toString(); - } - } - - String head = UNUSED; - if (headId != UNUSED_INT) { - head = Integer.toString(headId); - } - - String feats = UNUSED; - if (writeMorph && (row.token.getMorph() != null)) { - feats = row.token.getMorph().getValue(); - } - - String misc = UNUSED; - if (row.noSpaceAfter) { - misc = "SpaceAfter=No"; - } - - SurfaceForm sf = surfaceBeginIdx.get(row.token.getBegin()); - if (sf != null) { - @SuppressWarnings({ "unchecked", "rawtypes" }) - List covered = (List) surfaceIdx.get(sf); - int id1 = ctokens.get(covered.get(0)).id; - int id2 = ctokens.get(covered.get(covered.size()-1)).id; - aOut.printf("%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id1, id2, - sf.getValue(), UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, - UNUSED); - } - - aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, - row.token.getCoveredText(), lemma, cpos, pos, feats, head, deprel, deps, - misc); - } - - aOut.println(); - } - } - - private static final class Row - { - int id; - Token token; - boolean noSpaceAfter; - Dependency deprel; - List deps = new ArrayList<>(); - } -} diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/package-info.java b/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/package-info.java deleted file mode 100644 index d038c3d0cb..0000000000 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Provides classes for the conversion of conll file formats. - * Especially to and from Brat response formats - */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Reader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2000Reader.java similarity index 80% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Reader.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2000Reader.java index d552312fab..112765acf8 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Reader.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2000Reader.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createChunkMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedReader; import java.io.IOException; @@ -39,26 +41,27 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.IobDecoder; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.io.IobDecoder; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reads the CoNLL 2000 chunking format. * * @see CoNLL 2000 shared task */ -@ResourceMetaData(name="CoNLL 2000 Reader") +@ResourceMetaData(name = "CoNLL 2000 Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2000}) @TypeCapability( outputs = { @@ -67,7 +70,7 @@ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk"}) public class Conll2000Reader - extends JCasResourceCollectionReader_ImplBase + extends ConllReader_ImplBase { private static final int FORM = 0; private static final int POSTAG = 1; @@ -77,23 +80,12 @@ public class Conll2000Reader * Character encoding of the input data. */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spamming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Write part-of-speech information. - * - * Default: {@code true} + * Read part-of-speech information. */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") @@ -108,18 +100,25 @@ public class Conll2000Reader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** - * Write chunk information. - * - * Default: {@code true} + * Read chunk information. */ public static final String PARAM_READ_CHUNK = ComponentParameters.PARAM_READ_CHUNK; @ConfigurationParameter(name = PARAM_READ_CHUNK, mandatory = true, defaultValue = "true") @@ -138,10 +137,11 @@ public class Conll2000Reader * Load the chunk tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_CHUNK_MAPPING_LOCATION = ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; + public static final String PARAM_CHUNK_MAPPING_LOCATION = + ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false) protected String chunkMappingLocation; - + private MappingProvider posMappingProvider; private MappingProvider chunkMappingProvider; @@ -151,11 +151,11 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); - chunkMappingProvider = MappingProviderFactory.createChunkMappingProvider(chunkMappingLocation, - chunkTagset, getLanguage()); + chunkMappingProvider = createChunkMappingProvider(this, chunkMappingLocation, chunkTagset, + getLanguage()); } @Override @@ -196,7 +196,6 @@ private void convert(JCas aJCas, BufferedReader aReader) Type chunkType = JCasUtil.getType(aJCas, Chunk.class); Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue"); IobDecoder decoder = new IobDecoder(aJCas.getCas(), chunkValue, chunkMappingProvider); - decoder.setInternTags(internTags); List words; while ((words = readSentence(aReader)) != null) { @@ -214,22 +213,24 @@ private void convert(JCas aJCas, BufferedReader aReader) int i = 0; for (String[] word : words) { // Read token - Token token = doc.add(word[FORM], Token.class); + Token token = doc.add(trim(word[FORM]), Token.class); sentenceEnd = token.getEnd(); doc.add(" "); if (posEnabled) { - Type posTag = posMappingProvider.getTagType(word[POSTAG]); + String posTagValue = cleanTag(word[POSTAG]); + + Type posTag = posMappingProvider.getTagType(posTagValue); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - pos.setPosValue(word[POSTAG].intern()); + pos.setPosValue(posTagValue); POSUtils.assignCoarseValue(pos); pos.addToIndexes(); token.setPos(pos); } tokens.add(token); - chunkTags[i] = word[IOB]; + chunkTags[i] = trim(word[IOB]); i++; } diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Writer.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2000Writer.java similarity index 78% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Writer.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2000Writer.java index d39dd8444e..3cd827e381 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2000Writer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; @@ -36,22 +36,24 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.IobEncoder; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.IobEncoder; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writes the CoNLL 2000 chunking format. * * @see CoNLL 2000 shared task */ -@ResourceMetaData(name="CoNLL 2000 Writer") +@ResourceMetaData(name = "CoNLL 2000 Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2000}) @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -66,21 +68,40 @@ public class Conll2000Writer * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") private String filenameSuffix; + /** + * Write part-of-speech information. + */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePos; + /** + * Write chunking information. + */ public static final String PARAM_WRITE_CHUNK = ComponentParameters.PARAM_WRITE_CHUNK; @ConfigurationParameter(name = PARAM_WRITE_CHUNK, mandatory = true, defaultValue = "true") private boolean writeChunk; - + + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; + @Override public void process(JCas aJCas) throws AnalysisEngineProcessException @@ -122,6 +143,11 @@ private void convert(JCas aJCas, PrintWriter aOut) // Write sentence in CONLL 2006 format for (Row row : ctokens.values()) { + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + String pos = UNUSED; if (writePos && (row.token.getPos() != null)) { POS posAnno = row.token.getPos(); @@ -133,7 +159,7 @@ private void convert(JCas aJCas, PrintWriter aOut) chunk = row.chunk; } - aOut.printf("%s %s %s\n", row.token.getCoveredText(), pos, chunk); + aOut.printf("%s %s %s\n", form, pos, chunk); } aOut.println(); diff --git a/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2002Reader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2002Reader.java new file mode 100644 index 0000000000..19740786b0 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2002Reader.java @@ -0,0 +1,390 @@ +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static org.apache.commons.io.IOUtils.closeQuietly; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.IobDecoder; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + *

Reads by default the CoNLL 2002 named entity format.

+ * + *

The reader is also compatible with the CoNLL-based GermEval 2014 named entity format, + * in which the columns are separated by a tab, and there is an extra column for embedded named + * entities, besides the token number being put in the first column (see below). + * For that, additional parameters are provided, by which one can determine the column separator, + * whether there is an additional first column for token numbers, and whether embedded + * named entities should be read. + * (Note: Currently, the reader only reads the outer named entities, not the embedded ones.

+ * + *

+ * The following snippet shows an example of the TSV format 
+ * # http://de.wikipedia.org/wiki/Manfred_Korfmann [2009-10-17]
+ * 1  Aufgrund          O           O
+ * 2  seiner            O           O
+ * 3  Initiative        O           O
+ * 4  fand              O           O
+ * 5  2001/2002         O           O
+ * 6  in                O           O
+ * 7  Stuttgart         B-LOC       O
+ * 8  ,                 O           O
+ * 9  Braunschweig      B-LOC       O
+ * 10 und               O           O
+ * 11 Bonn              B-LOC       O
+ * 12 eine              O           O
+ * 13 große             O           O
+ * 14 und               O           O
+ * 15 publizistisch     O           O
+ * 16 vielbeachtete     O           O
+ * 17 Troia-Ausstellung B-LOCpart   O
+ * 18 statt             O           O
+ * 19 ,                 O           O
+ * 20 „                 O           O
+ * 21 Troia             B-OTH       B-LOC
+ * 22 -                 I-OTH       O
+ * 23 Traum             I-OTH       O
+ * 24 und               I-OTH       O
+ * 25 Wirklichkeit      I-OTH       O
+ * 26 “                 O           O
+ * 27 .                 O           O
+ * 
+ * + *
    + *
  1. WORD_NUMBER - token number
  2. + *
  3. FORM - token
  4. + *
  5. NER1 - outer named entity (BIO encoded)
  6. + *
  7. NER2 - embedded named entity (BIO encoded)
  8. + *
+ + * The sentence is encoded as one token per line, with information provided in tab-separated + * columns. The first column contains either a #, which signals the source the sentence is cited + * from and the date it was retrieved, or the token number within the sentence. The second column + * contains the token. Name spans are encoded in the BIO-scheme. Outer spans are encoded in the + * third column, embedded spans in the fourth column. + * + * @see CoNLL 2002 shared task + * @see GermEval 2014 NER task + */ +@ResourceMetaData(name = "CoNLL 2002 Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2002, MimeTypes.TEXT_X_GERMEVAL_2014}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"}) +public class Conll2002Reader + extends ConllReader_ImplBase +{ + /** + * Column Separators + */ + public enum ColumnSeparators + { + SPACE("space", " "), + TAB("tab", "\t"), + INVALID("", ""); + + private String name; + private String value; + + private ColumnSeparators(String aName, String aValue) + { + name = aName; + value = aValue; + } + + public String getName() + { + return name; + } + + private String getValue() + { + return value; + } + + private static ColumnSeparators getInstance(String Name) { + for (ColumnSeparators cs : ColumnSeparators.values()) { + if (Name.equals(cs.getName())) { + return cs; + } + } + return INVALID; + } + } + + /** + * Column separator + */ + + ColumnSeparators columnSeparator; + + /** + * Column positions + */ + private int FORM = 0; + private int IOB = 1; + + /** + * Column separator parameter. Acceptable input values come from {@link ColumnSeparators}. + *
+ * Example usage: if you want to define 'tab' as the column separator the following value + * should be input for this parameter {@code Conll2002Reader.ColumnSeparators.TAB.getName()} + */ + public static final String PARAM_COLUMN_SEPARATOR = "columnSeparator"; + @ConfigurationParameter(name = PARAM_COLUMN_SEPARATOR, mandatory = false, defaultValue = "space") + private String columnSeparatorName; + + /** + * Token number flag. When true, the first column contains the token number + * inside the sentence (as in GermEval 2014 format) + */ + public static final String PARAM_HAS_TOKEN_NUMBER = "hasTokenNumber"; + @ConfigurationParameter(name = PARAM_HAS_TOKEN_NUMBER, mandatory = false, defaultValue = "false") + private boolean hasTokenNumber; + + /** + * Indicates that there is a header line before the sentence + */ + public static final String PARAM_HAS_HEADER = "hasHeader"; + @ConfigurationParameter(name = PARAM_HAS_HEADER, mandatory = false, defaultValue = "false") + private boolean hasHeader; + + /** + * Character encoding of the input data. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + /** + * Read named entity information. + */ + public static final String PARAM_READ_NAMED_ENTITY = + ComponentParameters.PARAM_READ_NAMED_ENTITY; + @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") + private boolean namedEntityEnabled; + + /** + * Has embedded named entity extra column. + */ + public static final String PARAM_HAS_EMBEDDED_NAMED_ENTITY = "hasEmbeddedNamedEntity"; + @ConfigurationParameter(name = PARAM_HAS_EMBEDDED_NAMED_ENTITY, mandatory = false, defaultValue = "false") + private boolean hasEmbeddedNamedEntity; + + /** + * Location of the mapping file for named entity tags to UIMA types. + */ + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = + ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) + private String namedEntityMappingLocation; + + private MappingProvider namedEntityMappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + namedEntityMappingProvider = new MappingProvider(); + namedEntityMappingProvider.setDefault(MappingProvider.LOCATION, + "classpath:/there/is/no/mapping/yet"); + namedEntityMappingProvider.setDefault(MappingProvider.BASE_TYPE, + NamedEntity.class.getName()); + namedEntityMappingProvider.setOverride(MappingProvider.LOCATION, + namedEntityMappingLocation); + namedEntityMappingProvider.setOverride(MappingProvider.LANGUAGE, getLanguage()); + + // Configure column positions. First column may be used for token number + FORM = hasTokenNumber ? 1 : 0; + IOB = hasTokenNumber ? 2 : 1; + + // Configure column separator + columnSeparator = ColumnSeparators.getInstance(columnSeparatorName); + + if (columnSeparator == ColumnSeparators.INVALID) { + Object[] params = {columnSeparatorName, PARAM_COLUMN_SEPARATOR}; + throw new ResourceInitializationException( + ResourceInitializationException.RESOURCE_DATA_NOT_VALID, params); + } + + } + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + try { + if (namedEntityEnabled) { + namedEntityMappingProvider.configure(aJCas.getCas()); + } + } + catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } + + Resource res = nextFile(); + initCas(aJCas, res); + BufferedReader reader = null; + try { + reader = new BufferedReader(new InputStreamReader( + CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), + sourceEncoding)); + convert(aJCas, reader); + } + finally { + closeQuietly(reader); + } + } + + private void convert(JCas aJCas, BufferedReader aReader) + throws IOException + { + JCasBuilder doc = new JCasBuilder(aJCas); + + Type namedEntityType = JCasUtil.getType(aJCas, NamedEntity.class); + Feature namedEntityValue = namedEntityType.getFeatureByBaseName("value"); + IobDecoder decoder = new IobDecoder(aJCas.getCas(), namedEntityValue, + namedEntityMappingProvider); + + List words; + while ((words = readSentence(aReader)) != null) { + if (words.isEmpty()) { + continue; + } + + int sentenceBegin = doc.getPosition(); + int sentenceEnd = sentenceBegin; + + List tokens = new ArrayList(); + String[] namedEntityTags = new String[words.size()]; + + // Tokens, POS + int i = 0; + Iterator wordIterator = words.iterator(); + while (wordIterator.hasNext()) { + String[] word = wordIterator.next(); + + // Read token + Token token = doc.add(trim(word[FORM]), Token.class); + sentenceEnd = token.getEnd(); + if (wordIterator.hasNext()) { + doc.add(" "); + } + + tokens.add(token); + namedEntityTags[i] = cleanTag(word[IOB]); + i++; + } + + if (namedEntityEnabled) { + decoder.decode(tokens, namedEntityTags); + } + + // Sentence + Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); + sentence.addToIndexes(); + + // Once sentence per line. + doc.add("\n"); + } + + doc.close(); + } + + /** + * Read a single sentence. + */ + private List readSentence(BufferedReader aReader) + throws IOException + { + List words = new ArrayList(); + String line; + boolean beginSentence = true; + + while ((line = aReader.readLine()) != null) { + if (StringUtils.isBlank(line)) { + beginSentence = true; + break; // End of sentence + } + + if (hasHeader && beginSentence) { + // Ignore header line + beginSentence = false; + continue; + } + + String[] fields = line.split(columnSeparator.getValue()); + + if (!hasEmbeddedNamedEntity && fields.length != 2 + FORM) { + throw new IOException(String.format( + "Invalid file format. Line needs to have %d %s-separated fields: [%s]", + 2 + FORM, columnSeparator.getName(), line)); + } + else if (hasEmbeddedNamedEntity && fields.length != 3 + FORM) { + throw new IOException(String.format( + "Invalid file format. Line needs to have %d %s-separated fields: [%s]", + 3 + FORM, columnSeparator.getName(), line)); + } + words.add(fields); + } + + if (line == null && words.isEmpty()) { + return null; + } + else { + return words; + } + } +} diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Writer.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2002Writer.java similarity index 76% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Writer.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2002Writer.java index 16e3ccd290..f993028137 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2002Writer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; @@ -36,21 +36,23 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.IobEncoder; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.IobEncoder; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writes the CoNLL 2002 named entity format. * * @see CoNLL 2002 shared task */ -@ResourceMetaData(name="CoNLL 2002 Writer") +@ResourceMetaData(name = "CoNLL 2002 Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2002}) @TypeCapability( inputs = { @@ -67,16 +69,33 @@ public class Conll2002Writer * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") private String filenameSuffix; - public static final String PARAM_WRITE_NAMED_ENTITY = ComponentParameters.PARAM_WRITE_NAMED_ENTITY; + /** + * Write named entity information. + */ + public static final String PARAM_WRITE_NAMED_ENTITY = + ComponentParameters.PARAM_WRITE_NAMED_ENTITY; @ConfigurationParameter(name = PARAM_WRITE_NAMED_ENTITY, mandatory = true, defaultValue = "true") private boolean writeNamedEntity; + + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; @Override public void process(JCas aJCas) @@ -119,12 +138,17 @@ private void convert(JCas aJCas, PrintWriter aOut) // Write sentence in CONLL 2006 format for (Row row : ctokens.values()) { + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + String namedEntity = UNUSED; if (writeNamedEntity && (row.ne != null)) { namedEntity = row.ne; } - aOut.printf("%s %s\n", row.token.getCoveredText(), namedEntity); + aOut.printf("%s %s\n", form, namedEntity); } aOut.println(); diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003Reader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2003Reader.java similarity index 80% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003Reader.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2003Reader.java index 978706b2e3..9e167ca566 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003Reader.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2003Reader.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createChunkMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedReader; import java.io.IOException; @@ -39,27 +41,28 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.IobDecoder; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.io.IobDecoder; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reads the CoNLL 2003 format. * * @see CoNLL 2003 shared task */ -@ResourceMetaData(name="CoNLL 2003 Reader") +@ResourceMetaData(name = "CoNLL 2003 Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2003}) @TypeCapability( outputs = { @@ -69,7 +72,7 @@ "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk", "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) public class Conll2003Reader - extends JCasResourceCollectionReader_ImplBase + extends ConllReader_ImplBase { private static final int FORM = 0; private static final int POSTAG = 1; @@ -80,23 +83,12 @@ public class Conll2003Reader * Character encoding of the input data. */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spamming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Write part-of-speech information. - * - * Default: {@code true} + * Read part-of-speech information. */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") @@ -111,18 +103,25 @@ public class Conll2003Reader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** - * Write chunk information. - * - * Default: {@code true} + * Read chunk information. */ public static final String PARAM_READ_CHUNK = ComponentParameters.PARAM_READ_CHUNK; @ConfigurationParameter(name = PARAM_READ_CHUNK, mandatory = true, defaultValue = "true") @@ -141,23 +140,24 @@ public class Conll2003Reader * Load the chunk tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_CHUNK_MAPPING_LOCATION = ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; + public static final String PARAM_CHUNK_MAPPING_LOCATION = + ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false) protected String chunkMappingLocation; /** * Read named entity information. - * - * Default: {@code true} */ - public static final String PARAM_READ_NAMED_ENTITY = ComponentParameters.PARAM_READ_NAMED_ENTITY; + public static final String PARAM_READ_NAMED_ENTITY = + ComponentParameters.PARAM_READ_NAMED_ENTITY; @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") private boolean namedEntityEnabled; /** * Location of the mapping file for named entity tags to UIMA types. */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = + ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) private String namedEntityMappingLocation; @@ -171,16 +171,19 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); - chunkMappingProvider = MappingProviderFactory.createChunkMappingProvider(chunkMappingLocation, - chunkTagset, getLanguage()); + chunkMappingProvider = createChunkMappingProvider(this, chunkMappingLocation, chunkTagset, + getLanguage()); namedEntityMappingProvider = new MappingProvider(); - namedEntityMappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/there/is/no/mapping/yet"); - namedEntityMappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName()); - namedEntityMappingProvider.setOverride(MappingProvider.LOCATION, namedEntityMappingLocation); + namedEntityMappingProvider.setDefault(MappingProvider.LOCATION, + "classpath:/there/is/no/mapping/yet"); + namedEntityMappingProvider.setDefault(MappingProvider.BASE_TYPE, + NamedEntity.class.getName()); + namedEntityMappingProvider.setOverride(MappingProvider.LOCATION, + namedEntityMappingLocation); namedEntityMappingProvider.setOverride(MappingProvider.LANGUAGE, getLanguage()); } @@ -225,12 +228,11 @@ private void convert(JCas aJCas, BufferedReader aReader) Type chunkType = JCasUtil.getType(aJCas, Chunk.class); Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue"); IobDecoder chunkDecoder = new IobDecoder(aJCas.getCas(), chunkValue, chunkMappingProvider); - chunkDecoder.setInternTags(internTags); Type namedEntityType = JCasUtil.getType(aJCas, NamedEntity.class); Feature namedEntityValue = namedEntityType.getFeatureByBaseName("value"); - IobDecoder neDecoder = new IobDecoder(aJCas.getCas(), namedEntityValue, namedEntityMappingProvider); - neDecoder.setInternTags(internTags); + IobDecoder neDecoder = new IobDecoder(aJCas.getCas(), namedEntityValue, + namedEntityMappingProvider); List words; while ((words = readSentence(aReader)) != null) { @@ -249,23 +251,25 @@ private void convert(JCas aJCas, BufferedReader aReader) int i = 0; for (String[] word : words) { // Read token - Token token = doc.add(word[FORM], Token.class); + Token token = doc.add(trim(word[FORM]), Token.class); sentenceEnd = token.getEnd(); doc.add(" "); if (posEnabled) { - Type posTag = posMappingProvider.getTagType(word[POSTAG]); + String posTagValue = cleanTag(word[POSTAG]); + + Type posTag = posMappingProvider.getTagType(posTagValue); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - pos.setPosValue(word[POSTAG].intern()); + pos.setPosValue(posTagValue); POSUtils.assignCoarseValue(pos); pos.addToIndexes(); token.setPos(pos); } tokens.add(token); - chunkTags[i] = word[CHUNK]; - namedEntityTags[i] = word[NAMED_ENTITY]; + chunkTags[i] = cleanTag(word[CHUNK]); + namedEntityTags[i] = cleanTag(word[NAMED_ENTITY]); i++; } diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003Writer.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2003Writer.java similarity index 79% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003Writer.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2003Writer.java index 8c8831ba21..b4734b9f78 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2003Writer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; @@ -36,23 +36,25 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.IobEncoder; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.IobEncoder; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writes the CoNLL 2003 format. * * @see CoNLL 2003 shared task */ -@ResourceMetaData(name="CoNLL 2003 Writer") +@ResourceMetaData(name = "CoNLL 2003 Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2003}) @TypeCapability( inputs = { @@ -70,25 +72,48 @@ public class Conll2003Writer * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") private String filenameSuffix; + /** + * Write part-of-speech information. + */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePos; + /** + * Write chunking information. + */ public static final String PARAM_WRITE_CHUNK = ComponentParameters.PARAM_WRITE_CHUNK; @ConfigurationParameter(name = PARAM_WRITE_CHUNK, mandatory = true, defaultValue = "true") private boolean writeChunk; - public static final String PARAM_WRITE_NAMED_ENTITY = ComponentParameters.PARAM_WRITE_NAMED_ENTITY; + /** + * Write named entity information. + */ + public static final String PARAM_WRITE_NAMED_ENTITY = + ComponentParameters.PARAM_WRITE_NAMED_ENTITY; @ConfigurationParameter(name = PARAM_WRITE_NAMED_ENTITY, mandatory = true, defaultValue = "true") private boolean writeNamedEntity; + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; + @Override public void process(JCas aJCas) throws AnalysisEngineProcessException @@ -137,6 +162,11 @@ private void convert(JCas aJCas, PrintWriter aOut) // Write sentence in CONLL 2006 format for (Row row : ctokens.values()) { + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + String pos = UNUSED; if (writePos && (row.token.getPos() != null)) { POS posAnno = row.token.getPos(); @@ -153,7 +183,7 @@ private void convert(JCas aJCas, PrintWriter aOut) namedEntity = row.ne; } - aOut.printf("%s %s %s %s\n", row.token.getCoveredText(), pos, chunk, namedEntity); + aOut.printf("%s %s %s %s\n", form, pos, chunk, namedEntity); } aOut.println(); diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006Reader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2006Reader.java similarity index 79% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006Reader.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2006Reader.java index 53eb664ed5..2f072c3e65 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006Reader.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2006Reader.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; +import static de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor.BASIC; import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedReader; import java.io.IOException; @@ -40,28 +42,28 @@ import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.DocumentationResource; /** - *

Reads a file in the CoNLL-2006 format (aka CoNLL-X).

+ * Reads files in the CoNLL-2006 format (aka CoNLL-X). * * @see CoNLL-X Shared Task: Multi-lingual Dependency Parsing */ -@ResourceMetaData(name="CoNLL 2006 Reader") +@ResourceMetaData(name = "CoNLL 2006 Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2006}) @TypeCapability( outputs = { @@ -73,16 +75,26 @@ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) public class Conll2006Reader - extends JCasResourceCollectionReader_ImplBase + extends ConllReader_ImplBase { + /** + * Character encoding of the input data. + */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; + /** + * Read fine-grained part-of-speech information. + */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPos; + /** + * Read coarse-grained part-of-speech information. + */ public static final String PARAM_READ_CPOS = ComponentParameters.PARAM_READ_CPOS; @ConfigurationParameter(name = PARAM_READ_CPOS, mandatory = true, defaultValue = "true") private boolean readCPos; @@ -104,22 +116,40 @@ public class Conll2006Reader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; + /** + * Read morphological features. + */ public static final String PARAM_READ_MORPH = ComponentParameters.PARAM_READ_MORPH; @ConfigurationParameter(name = PARAM_READ_MORPH, mandatory = true, defaultValue = "true") private boolean readMorph; + /** + * Read lemma information. + */ public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") private boolean readLemma; + /** + * Read syntactic dependency information. + */ public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") private boolean readDependency; @@ -145,8 +175,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); } @Override @@ -171,10 +201,10 @@ public void convert(JCas aJCas, BufferedReader aReader) throws IOException { if (readPos) { - try{ + try { posMappingProvider.configure(aJCas.getCas()); } - catch(AnalysisEngineProcessException e){ + catch (AnalysisEngineProcessException e) { throw new IOException(e); } } @@ -198,8 +228,8 @@ public void convert(JCas aJCas, BufferedReader aReader) while (wordIterator.hasNext()) { String[] word = wordIterator.next(); // Read token - Token token = doc.add(word[FORM], Token.class); - tokens.put(Integer.valueOf(word[ID]), token); + Token token = doc.add(trim(word[FORM]), Token.class); + tokens.put(Integer.valueOf(trim(word[ID])), token); if (wordIterator.hasNext()) { doc.add(" "); } @@ -207,24 +237,25 @@ public void convert(JCas aJCas, BufferedReader aReader) // Read lemma if (!UNUSED.equals(word[LEMMA]) && readLemma) { Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(word[LEMMA]); + lemma.setValue(trim(word[LEMMA])); lemma.addToIndexes(); token.setLemma(lemma); } // Read part-of-speech tag POS pos = null; - String tag = useCPosAsPos ? word[CPOSTAG] : word[POSTAG]; + String cPosTag = cleanTag(word[CPOSTAG]); + String tag = useCPosAsPos ? cPosTag : cleanTag(word[POSTAG]); if (!UNUSED.equals(tag) && readPos) { Type posTag = posMappingProvider.getTagType(tag); pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - pos.setPosValue(tag.intern()); + pos.setPosValue(tag); } // Read coarse part-of-speech tag - if (!UNUSED.equals(word[CPOSTAG]) && readCPos && pos != null) { - pos.setCoarseValue(word[CPOSTAG].intern()); + if (!UNUSED.equals(cPosTag) && readCPos && pos != null) { + pos.setCoarseValue(cPosTag); } if (pos != null) { @@ -233,10 +264,11 @@ public void convert(JCas aJCas, BufferedReader aReader) } // Read morphological features - if (!UNUSED.equals(word[FEATS]) && readMorph) { + String featsValue = cleanTag(word[FEATS]); + if (!UNUSED.equals(featsValue) && readMorph) { MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd()); - morphtag.setValue(word[FEATS]); + morphtag.setValue(featsValue); morphtag.addToIndexes(); token.setMorph(morphtag); } @@ -244,32 +276,33 @@ public void convert(JCas aJCas, BufferedReader aReader) sentenceEnd = token.getEnd(); } - // Dependencies + // Read dependencies if (readDependency) { for (String[] word : words) { - if (!UNUSED.equals(word[DEPREL])) { - int depId = Integer.valueOf(word[ID]); - int govId = Integer.valueOf(word[HEAD]); + String depRel = cleanTag(word[DEPREL]); + if (!UNUSED.equals(depRel)) { + int depId = Integer.valueOf(trim(word[ID])); + int govId = Integer.valueOf(trim(word[HEAD])); // Model the root as a loop onto itself if (govId == 0) { Dependency rel = new ROOT(aJCas); rel.setGovernor(tokens.get(depId)); rel.setDependent(tokens.get(depId)); - rel.setDependencyType(word[DEPREL]); + rel.setDependencyType(depRel); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); - rel.setFlavor(DependencyFlavor.BASIC); + rel.setFlavor(BASIC); rel.addToIndexes(); } else { Dependency rel = new Dependency(aJCas); rel.setGovernor(tokens.get(govId)); rel.setDependent(tokens.get(depId)); - rel.setDependencyType(word[DEPREL]); + rel.setDependencyType(depRel); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); - rel.setFlavor(DependencyFlavor.BASIC); + rel.setFlavor(BASIC); rel.addToIndexes(); } } diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006Writer.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2006Writer.java similarity index 77% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006Writer.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2006Writer.java index 0ebb340572..ddb7c0d910 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2006Writer.java @@ -15,9 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; -import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -34,25 +33,28 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writes a file in the CoNLL-2006 format (aka CoNLL-X). * * @see CoNLL-X Shared Task: Multi-lingual Dependency Parsing */ -@ResourceMetaData(name="CoNLL 2006 Writer") +@ResourceMetaData(name = "CoNLL 2006 Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2006}) -@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", +@TypeCapability(inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", @@ -69,49 +71,73 @@ public class Conll2006Writer * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") private String filenameSuffix; + /** + * Write fine-grained part-of-speech information. + */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePos; + /** + * Write coarse-grained part-of-speech information. + */ public static final String PARAM_WRITE_CPOS = ComponentParameters.PARAM_WRITE_CPOS; @ConfigurationParameter(name = PARAM_WRITE_CPOS, mandatory = true, defaultValue = "true") private boolean writeCPos; + /** + * Write morphological features. + */ public static final String PARAM_WRITE_MORPH = "writeMorph"; @ConfigurationParameter(name = PARAM_WRITE_MORPH, mandatory = true, defaultValue = "true") private boolean writeMorph; + /** + * Write lemma information. + */ public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") private boolean writeLemma; + /** + * Write syntactic dependency information. + */ public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") private boolean writeDependency; + + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { - PrintWriter out = null; - try { - out = new PrintWriter(new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), - targetEncoding)); + try (PrintWriter out = new PrintWriter( + new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), targetEncoding));) { + convert(aJCas, out); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } - finally { - closeQuietly(out); - } } private void convert(JCas aJCas, PrintWriter aOut) @@ -123,12 +149,13 @@ private void convert(JCas aJCas, PrintWriter aOut) List tokens = selectCovered(Token.class, sentence); // Check if we should try to include the FEATS in output - List morphology = selectCovered(MorphologicalFeatures.class, sentence); + List morphology = selectCovered(MorphologicalFeatures.class, + sentence); boolean useFeats = tokens.size() == morphology.size(); for (int i = 0; i < tokens.size(); i++) { Row row = new Row(); - row.id = i+1; + row.id = i + 1; row.token = tokens.get(i); if (useFeats) { row.feats = morphology.get(i); @@ -138,20 +165,31 @@ private void convert(JCas aJCas, PrintWriter aOut) // Dependencies List basicDeps = selectCovered(Dependency.class, sentence).stream() - .filter(dep -> dep.getFlavor() == null || DependencyFlavor.BASIC.equals(dep.getFlavor())) + .filter(dep -> dep.getFlavor() == null || + DependencyFlavor.BASIC.equals(dep.getFlavor())) .collect(Collectors.toList()); for (Dependency rel : basicDeps) { Row row = ctokens.get(rel.getDependent()); if (row.deprel != null) { + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + throw new IllegalStateException("Illegal basic dependency structure - token [" - + row.token.getCoveredText() + + form + "] is dependent of more than one dependency."); } row.deprel = rel; } - // Write sentence in CONLL 2006 format + // Write sentence for (Row row : ctokens.values()) { + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + String lemma = UNUSED; if (writeLemma && (row.token.getLemma() != null)) { lemma = row.token.getLemma().getValue(); @@ -195,7 +233,7 @@ private void convert(JCas aJCas, PrintWriter aOut) String pdeprel = UNUSED; aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, - row.token.getCoveredText(), lemma, cpos, pos, feats, head, deprel, phead, + form, lemma, cpos, pos, feats, head, deprel, phead, pdeprel); } diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008Reader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2008Reader.java similarity index 79% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008Reader.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2008Reader.java index e760d290c3..b15e9dc7ce 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008Reader.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2008Reader.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; +import static de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor.BASIC; import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedReader; import java.io.IOException; @@ -41,15 +43,14 @@ import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; @@ -57,8 +58,8 @@ import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reads a file in the CoNLL-2008 format. @@ -68,7 +69,8 @@ * @see The CoNLL-2008 Shared Task on * Joint Parsing of Syntactic and Semantic Dependencies */ -@ResourceMetaData(name="CoNLL 2008 Reader") +@ResourceMetaData(name = "CoNLL 2008 Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2008}) @TypeCapability( outputs = { @@ -82,12 +84,19 @@ "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg" }) public class Conll2008Reader - extends JCasResourceCollectionReader_ImplBase + extends ConllReader_ImplBase { + /** + * Character encoding of the input data. + */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; + /** + * Read part-of-speech information. + */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPos; @@ -101,23 +110,42 @@ public class Conll2008Reader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; + /** + * Read lemma information. + */ public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") private boolean readLemma; + /** + * Read syntactic dependency information. + */ public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") private boolean readDependency; - public static final String PARAM_READ_SEMANTIC_PREDICATE = "readSemanticPredicate"; + /** + * Read semantic predicate information. + */ + public static final String PARAM_READ_SEMANTIC_PREDICATE = + ComponentParameters.PARAM_READ_SEMANTIC_PREDICATE; @ConfigurationParameter(name = PARAM_READ_SEMANTIC_PREDICATE, mandatory = true, defaultValue = "true") private boolean readSemanticPredicate; @@ -144,8 +172,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); } @Override @@ -170,10 +198,10 @@ public void convert(JCas aJCas, BufferedReader aReader) throws IOException { if (readPos) { - try{ + try { posMappingProvider.configure(aJCas.getCas()); } - catch(AnalysisEngineProcessException e){ + catch (AnalysisEngineProcessException e) { throw new IOException(e); } } @@ -198,34 +226,37 @@ public void convert(JCas aJCas, BufferedReader aReader) while (wordIterator.hasNext()) { String[] word = wordIterator.next(); // Read token - Token token = doc.add(word[FORM], Token.class); - tokens.put(Integer.valueOf(word[ID]), token); + Token token = doc.add(trim(word[FORM]), Token.class); + tokens.put(Integer.valueOf(trim(word[ID])), token); if (wordIterator.hasNext()) { doc.add(" "); } // Read lemma - if (!UNUSED.equals(word[LEMMA]) && readLemma) { + String lemmaValue = trim(word[LEMMA]); + if (!UNUSED.equals(lemmaValue) && readLemma) { Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(word[LEMMA]); + lemma.setValue(lemmaValue); lemma.addToIndexes(); token.setLemma(lemma); } // Read part-of-speech tag - if (!UNUSED.equals(word[GPOS]) && readPos) { - Type posTag = posMappingProvider.getTagType(word[GPOS]); + String gPosValue = cleanTag(word[GPOS]); + if (!UNUSED.equals(gPosValue) && readPos) { + Type posTag = posMappingProvider.getTagType(gPosValue); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - pos.setPosValue(word[GPOS].intern()); + pos.setPosValue(gPosValue); POSUtils.assignCoarseValue(pos); pos.addToIndexes(); token.setPos(pos); } - if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) { + String predValue = trim(word[PRED]); + if (!UNUSED.equals(predValue) && readSemanticPredicate) { SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd()); - pred.setCategory(word[PRED]); + pred.setCategory(predValue); pred.addToIndexes(); preds.add(pred); } @@ -236,29 +267,30 @@ public void convert(JCas aJCas, BufferedReader aReader) // Dependencies if (readDependency) { for (String[] word : words) { + String depRel = cleanTag(word[DEPREL]); if (!UNUSED.equals(word[DEPREL])) { - int depId = Integer.valueOf(word[ID]); - int govId = Integer.valueOf(word[HEAD]); + int depId = Integer.valueOf(trim(word[ID])); + int govId = Integer.valueOf(trim(word[HEAD])); // Model the root as a loop onto itself if (govId == 0) { Dependency rel = new ROOT(aJCas); rel.setGovernor(tokens.get(depId)); rel.setDependent(tokens.get(depId)); - rel.setDependencyType(word[DEPREL]); + rel.setDependencyType(depRel); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); - rel.setFlavor(DependencyFlavor.BASIC); + rel.setFlavor(BASIC); rel.addToIndexes(); } else { Dependency rel = new Dependency(aJCas); rel.setGovernor(tokens.get(govId)); rel.setDependent(tokens.get(depId)); - rel.setDependencyType(word[DEPREL]); + rel.setDependencyType(depRel); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); - rel.setFlavor(DependencyFlavor.BASIC); + rel.setFlavor(BASIC); rel.addToIndexes(); } } @@ -271,13 +303,14 @@ public void convert(JCas aJCas, BufferedReader aReader) for (int p = 0; p < preds.size(); p++) { List args = new ArrayList<>(); for (String[] word : words) { - if (!UNUSED.equals(word[APRED+p])) { - Token token = tokens.get(Integer.valueOf(word[ID])); + String aPredValue = trim(word[APRED + p]); + if (!UNUSED.equals(aPredValue)) { + Token token = tokens.get(Integer.valueOf(trim(word[ID]))); SemArg arg = new SemArg(aJCas, token.getBegin(), token.getEnd()); arg.addToIndexes(); SemArgLink link = new SemArgLink(aJCas); - link.setRole(word[APRED+p]); + link.setRole(aPredValue); link.setTarget(arg); args.add(link); } diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008Writer.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2008Writer.java similarity index 82% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008Writer.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2008Writer.java index 7269c1d099..a04034c066 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2008Writer.java @@ -15,10 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.JCasUtil.*; +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.io.OutputStreamWriter; import java.io.PrintWriter; @@ -36,12 +38,12 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; @@ -49,6 +51,7 @@ import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writes a file in the CoNLL-2008 format. @@ -58,7 +61,8 @@ * @see The CoNLL-2008 Shared Task on * Joint Parsing of Syntactic and Semantic Dependencies */ -@ResourceMetaData(name="CoNLL 2008 Writer") +@ResourceMetaData(name = "CoNLL 2008 Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2008}) @TypeCapability( inputs = { @@ -81,32 +85,60 @@ public class Conll2008Writer * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") private String filenameSuffix; + /** + * Write part-of-speech information. + */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePos; + /** + * Write morphological features. + */ public static final String PARAM_WRITE_MORPH = "writeMorph"; @ConfigurationParameter(name = PARAM_WRITE_MORPH, mandatory = true, defaultValue = "true") private boolean writeMorph; + /** + * Write lemma information. + */ public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") private boolean writeLemma; + /** + * Write syntactic dependency infomation. + */ public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") private boolean writeDependency; + /** + * Write semantic predicate infomation. + */ public static final String PARAM_WRITE_SEMANTIC_PREDICATE = "writeSemanticPredicate"; @ConfigurationParameter(name = PARAM_WRITE_SEMANTIC_PREDICATE, mandatory = true, defaultValue = "true") private boolean writeSemanticPredicate; + + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; @Override public void process(JCas aJCas) @@ -128,8 +160,8 @@ public void process(JCas aJCas) private void convert(JCas aJCas, PrintWriter aOut) { - Map> predIdx = indexCovered(aJCas, Token.class, SemPred.class); - Map> argIdx = indexCovered(aJCas, SemArg.class, Token.class); + Map> predIdx = indexCovered(aJCas, Token.class, SemPred.class); + Map> argIdx = indexCovered(aJCas, SemArg.class, Token.class); for (Sentence sentence : select(aJCas, Sentence.class)) { HashMap ctokens = new LinkedHashMap(); @@ -145,7 +177,7 @@ private void convert(JCas aJCas, PrintWriter aOut) for (int i = 0; i < tokens.size(); i++) { Row row = new Row(); - row.id = i+1; + row.id = i + 1; row.token = tokens.get(i); row.args = new SemArgLink[preds.size()]; if (useFeats) { @@ -163,13 +195,19 @@ private void convert(JCas aJCas, PrintWriter aOut) // Dependencies List basicDeps = selectCovered(Dependency.class, sentence).stream() - .filter(dep -> dep.getFlavor() == null || DependencyFlavor.BASIC.equals(dep.getFlavor())) + .filter(dep -> dep.getFlavor() == null || + DependencyFlavor.BASIC.equals(dep.getFlavor())) .collect(Collectors.toList()); for (Dependency rel : basicDeps) { Row row = ctokens.get(rel.getDependent()); if (row.deprel != null) { + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + throw new IllegalStateException("Illegal basic dependency structure - token [" - + row.token.getCoveredText() + + form + "] is dependent of more than one dependency."); } row.deprel = rel; @@ -191,6 +229,9 @@ private void convert(JCas aJCas, PrintWriter aOut) int id = row.id; String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } String lemma = UNUSED; if (writeLemma && (row.token.getLemma() != null)) { diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009Reader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2009Reader.java similarity index 78% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009Reader.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2009Reader.java index d93b43e348..33159d62fc 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009Reader.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2009Reader.java @@ -15,9 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; +import static de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor.BASIC; import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedReader; import java.io.IOException; @@ -41,16 +43,15 @@ import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; @@ -58,8 +59,8 @@ import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reads a file in the CoNLL-2009 format. @@ -71,7 +72,8 @@ * @see The CoNLL-2008 Shared Task on Joint * Parsing of Syntactic and Semantic Dependencies */ -@ResourceMetaData(name="CoNLL 2009 Reader") +@ResourceMetaData(name = "CoNLL 2009 Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2009}) @TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -83,12 +85,19 @@ "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg"}) public class Conll2009Reader - extends JCasResourceCollectionReader_ImplBase + extends ConllReader_ImplBase { + /** + * Character encoding of the input data. + */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; + /** + * Read part-of-speech information. + */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPos; @@ -102,27 +111,49 @@ public class Conll2009Reader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; + /** + * Read morphological features. + */ public static final String PARAM_READ_MORPH = ComponentParameters.PARAM_READ_MORPH; @ConfigurationParameter(name = PARAM_READ_MORPH, mandatory = true, defaultValue = "true") private boolean readMorph; + /** + * Read lemma information. + */ public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") private boolean readLemma; + /** + * Read syntactic dependency information. + */ public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") private boolean readDependency; - public static final String PARAM_READ_SEMANTIC_PREDICATE = "readSemanticPredicate"; + /** + * Read semantic predicate information. + */ + public static final String PARAM_READ_SEMANTIC_PREDICATE = + ComponentParameters.PARAM_READ_SEMANTIC_PREDICATE; @ConfigurationParameter(name = PARAM_READ_SEMANTIC_PREDICATE, mandatory = true, defaultValue = "true") private boolean readSemanticPredicate; @@ -152,8 +183,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); } @Override @@ -178,10 +209,10 @@ public void convert(JCas aJCas, BufferedReader aReader) throws IOException { if (readPos) { - try{ + try { posMappingProvider.configure(aJCas.getCas()); } - catch(AnalysisEngineProcessException e){ + catch (AnalysisEngineProcessException e) { throw new IOException(e); } } @@ -206,42 +237,46 @@ public void convert(JCas aJCas, BufferedReader aReader) while (wordIterator.hasNext()) { String[] word = wordIterator.next(); // Read token - Token token = doc.add(word[FORM], Token.class); - tokens.put(Integer.valueOf(word[ID]), token); + Token token = doc.add(trim(word[FORM]), Token.class); + tokens.put(Integer.valueOf(trim(word[ID])), token); if (wordIterator.hasNext()) { doc.add(" "); } // Read lemma - if (!UNUSED.equals(word[LEMMA]) && readLemma) { + String lemmaValue = trim(word[LEMMA]); + if (!UNUSED.equals(lemmaValue) && readLemma) { Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(word[LEMMA]); + lemma.setValue(lemmaValue); lemma.addToIndexes(); token.setLemma(lemma); } // Read part-of-speech tag - if (!UNUSED.equals(word[POS]) && readPos) { - Type posTag = posMappingProvider.getTagType(word[POS]); + String posValue = cleanTag(word[POS]); + if (!UNUSED.equals(posValue) && readPos) { + Type posTag = posMappingProvider.getTagType(posValue); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - pos.setPosValue(word[POS].intern()); + pos.setPosValue(posValue); POSUtils.assignCoarseValue(pos); pos.addToIndexes(); token.setPos(pos); } // Read morphological features - if (!UNUSED.equals(word[FEAT]) && readMorph) { + String featValue = cleanTag(word[FEAT]); + if (!UNUSED.equals(featValue) && readMorph) { MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd()); - morphtag.setValue(word[FEAT]); + morphtag.setValue(featValue); morphtag.addToIndexes(); } - if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) { + String predValue = trim(word[PRED]); + if (!UNUSED.equals(predValue) && readSemanticPredicate) { SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd()); - pred.setCategory(word[PRED]); + pred.setCategory(predValue); pred.addToIndexes(); preds.add(pred); } @@ -252,34 +287,34 @@ public void convert(JCas aJCas, BufferedReader aReader) // Dependencies if (readDependency) { for (String[] word : words) { - if (!UNUSED.equals(word[DEPREL])) { - int depId = Integer.valueOf(word[ID]); - int govId = Integer.valueOf(word[HEAD]); - + String depRel = cleanTag(word[DEPREL]); + if (!UNUSED.equals(depRel)) { + int depId = Integer.valueOf(trim(word[ID])); + int govId = Integer.valueOf(trim(word[HEAD])); + // Model the root as a loop onto itself if (govId == 0) { Dependency rel = new ROOT(aJCas); rel.setGovernor(tokens.get(depId)); rel.setDependent(tokens.get(depId)); - rel.setDependencyType(word[DEPREL]); + rel.setDependencyType(depRel); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); - rel.setFlavor(DependencyFlavor.BASIC); + rel.setFlavor(BASIC); rel.addToIndexes(); } else { Dependency rel = new Dependency(aJCas); rel.setGovernor(tokens.get(govId)); rel.setDependent(tokens.get(depId)); - rel.setDependencyType(word[DEPREL]); + rel.setDependencyType(depRel); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); - rel.setFlavor(DependencyFlavor.BASIC); + rel.setFlavor(BASIC); rel.addToIndexes(); } } - } - } + } } // Semantic arguments if (readSemanticPredicate) { @@ -287,13 +322,14 @@ public void convert(JCas aJCas, BufferedReader aReader) for (int p = 0; p < preds.size(); p++) { List args = new ArrayList<>(); for (String[] word : words) { - if (!UNUSED.equals(word[APRED+p])) { - Token token = tokens.get(Integer.valueOf(word[ID])); + String aPredValue = trim(word[APRED + p]); + if (!UNUSED.equals(aPredValue)) { + Token token = tokens.get(Integer.valueOf(trim(word[ID]))); SemArg arg = new SemArg(aJCas, token.getBegin(), token.getEnd()); arg.addToIndexes(); SemArgLink link = new SemArgLink(aJCas); - link.setRole(word[APRED+p]); + link.setRole(aPredValue); link.setTarget(arg); args.add(link); } diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009Writer.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2009Writer.java similarity index 81% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009Writer.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2009Writer.java index be5641fbac..5baa84995d 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2009Writer.java @@ -15,10 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.JCasUtil.*; +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.io.OutputStreamWriter; import java.io.PrintWriter; @@ -36,12 +38,12 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; @@ -49,6 +51,7 @@ import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writes a file in the CoNLL-2009 format. @@ -60,7 +63,8 @@ * @see The CoNLL-2008 Shared Task on Joint * Parsing of Syntactic and Semantic Dependencies */ -@ResourceMetaData(name="CoNLL 2009 Writer") +@ResourceMetaData(name = "CoNLL 2009 Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2009}) @TypeCapability( inputs = { @@ -83,32 +87,61 @@ public class Conll2009Writer * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") private String filenameSuffix; + /** + * Write part-of-speech information. + */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePos; - public static final String PARAM_WRITE_MORPH = "writeMorph"; + /** + * Read morphological features. + */ + public static final String PARAM_WRITE_MORPH = ComponentParameters.PARAM_WRITE_MORPH; @ConfigurationParameter(name = PARAM_WRITE_MORPH, mandatory = true, defaultValue = "true") private boolean writeMorph; + /** + * Write lemma information. + */ public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") private boolean writeLemma; + /** + * Write syntactic dependency information. + */ public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") private boolean writeDependency; - public static final String PARAM_WRITE_SEMANTIC_PREDICATE = "writeSemanticPredicate"; + /** + * Write semantic predicate information. + */ + public static final String PARAM_WRITE_SEMANTIC_PREDICATE = + ComponentParameters.PARAM_WRITE_SEMANTIC_PREDICATE; @ConfigurationParameter(name = PARAM_WRITE_SEMANTIC_PREDICATE, mandatory = true, defaultValue = "true") private boolean writeSemanticPredicate; + + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; @Override public void process(JCas aJCas) @@ -130,8 +163,8 @@ public void process(JCas aJCas) private void convert(JCas aJCas, PrintWriter aOut) { - Map> predIdx = indexCovered(aJCas, Token.class, SemPred.class); - Map> argIdx = indexCovered(aJCas, SemArg.class, Token.class); + Map> predIdx = indexCovered(aJCas, Token.class, SemPred.class); + Map> argIdx = indexCovered(aJCas, SemArg.class, Token.class); for (Sentence sentence : select(aJCas, Sentence.class)) { HashMap ctokens = new LinkedHashMap(); @@ -147,7 +180,7 @@ private void convert(JCas aJCas, PrintWriter aOut) for (int i = 0; i < tokens.size(); i++) { Row row = new Row(); - row.id = i+1; + row.id = i + 1; row.token = tokens.get(i); row.args = new SemArgLink[preds.size()]; if (useFeats) { @@ -165,13 +198,19 @@ private void convert(JCas aJCas, PrintWriter aOut) // Dependencies List basicDeps = selectCovered(Dependency.class, sentence).stream() - .filter(dep -> dep.getFlavor() == null || DependencyFlavor.BASIC.equals(dep.getFlavor())) + .filter(dep -> dep.getFlavor() == null || + DependencyFlavor.BASIC.equals(dep.getFlavor())) .collect(Collectors.toList()); for (Dependency rel : basicDeps) { Row row = ctokens.get(rel.getDependent()); if (row.deprel != null) { + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + throw new IllegalStateException("Illegal basic dependency structure - token [" - + row.token.getCoveredText() + + form + "] is dependent of more than one dependency."); } row.deprel = rel; @@ -193,6 +232,9 @@ private void convert(JCas aJCas, PrintWriter aOut) int id = row.id; String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } String lemma = UNUSED; if (writeLemma && (row.token.getLemma() != null)) { @@ -249,9 +291,9 @@ private void convert(JCas aJCas, PrintWriter aOut) } } - aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id, form, - lemma, plemma, pos, ppos, feat, pfeat, head, phead, deprel, pdeprel, fillpred, - pred, apreds); + aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id, + form, lemma, plemma, pos, ppos, feat, pfeat, head, phead, deprel, pdeprel, + fillpred, pred, apreds); } aOut.println(); diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012Reader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2012Reader.java similarity index 79% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012Reader.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2012Reader.java index 7864421f3d..d453df13a4 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012Reader.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2012Reader.java @@ -15,14 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createConstituentMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -43,19 +46,20 @@ import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; +import org.dkpro.core.io.penntree.PennTreeToJCasConverter; +import org.dkpro.core.io.penntree.PennTreeUtils; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; @@ -63,8 +67,7 @@ import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeToJCasConverter; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reads a file in the CoNLL-2012 format. @@ -72,24 +75,35 @@ * @see CoNLL 2012 Shared Task: * Modeling Multilingual Unrestricted Coreference in OntoNotes */ -@ResourceMetaData(name="CoNLL 2012 Reader") +@ResourceMetaData(name = "CoNLL 2012 Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2012}) @TypeCapability( outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", - "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg"}) + "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg", + "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense" }) public class Conll2012Reader - extends JCasResourceCollectionReader_ImplBase + extends ConllReader_ImplBase { + /** + * Character encoding of the input data. + */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String encoding; + /** + * Read part-of-speech information. + */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPos; @@ -103,15 +117,26 @@ public class Conll2012Reader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** + * Read lemma information. + *

* Disabled by default because CoNLL 2012 format does not include lemmata for all words, only * for predicates. */ @@ -119,23 +144,40 @@ public class Conll2012Reader @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "false") private boolean readLemma; - public static final String PARAM_READ_SEMANTIC_PREDICATE = "readSemanticPredicate"; + /** + * Read semantic predicate information. + */ + public static final String PARAM_READ_SEMANTIC_PREDICATE = + ComponentParameters.PARAM_READ_SEMANTIC_PREDICATE; @ConfigurationParameter(name = PARAM_READ_SEMANTIC_PREDICATE, mandatory = true, defaultValue = "true") private boolean readSemanticPredicate; + /** + * Read word sense information. + */ public static final String PARAM_READ_WORD_SENSE = "readWordSense"; @ConfigurationParameter(name = PARAM_READ_WORD_SENSE, mandatory = true, defaultValue = "true") private boolean readWordSense; + /** + * Read syntactic constituent information. + */ public static final String PARAM_READ_CONSTITUENT = ComponentParameters.PARAM_READ_CONSTITUENT; @ConfigurationParameter(name = PARAM_READ_CONSTITUENT, mandatory = true, defaultValue = "true") private boolean readConstituent; + /** + * Read co-reference information. + */ public static final String PARAM_READ_COREFERENCE = ComponentParameters.PARAM_READ_COREFERENCE; @ConfigurationParameter(name = PARAM_READ_COREFERENCE, mandatory = true, defaultValue = "true") private boolean readCoreference; - public static final String PARAM_READ_NAMED_ENTITY = ComponentParameters.PARAM_READ_NAMED_ENTITY; + /** + * Read named entity information. + */ + public static final String PARAM_READ_NAMED_ENTITY = + ComponentParameters.PARAM_READ_NAMED_ENTITY; @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") private boolean readNamedEntity; @@ -144,7 +186,8 @@ public class Conll2012Reader * tag set defined as part of the model meta data. This can be useful if a custom model is * specified which does not have such meta data, or it can be used in readers. */ - public static final String PARAM_CONSTITUENT_TAG_SET = ComponentParameters.PARAM_CONSTITUENT_TAG_SET; + public static final String PARAM_CONSTITUENT_TAG_SET = + ComponentParameters.PARAM_CONSTITUENT_TAG_SET; @ConfigurationParameter(name = PARAM_CONSTITUENT_TAG_SET, mandatory = false) protected String constituentTagset; @@ -152,20 +195,14 @@ public class Conll2012Reader * Load the constituent tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; + public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = + ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) protected String constituentMappingLocation; /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} + * Whether to render traces into the document text. */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - public static final String PARAM_WRITE_TRACES_TO_TEXT = "writeTracesToText"; @ConfigurationParameter(name = PARAM_WRITE_TRACES_TO_TEXT, mandatory = false, defaultValue = "false") private boolean writeTracesToText; @@ -203,14 +240,13 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); - - constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider( + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); + + constituentMappingProvider = createConstituentMappingProvider(this, constituentMappingLocation, constituentTagset, getLanguage()); converter = new PennTreeToJCasConverter(posMappingProvider, constituentMappingProvider); - converter.setInternTags(internTags); converter.setWriteTracesToText(writeTracesToText); converter.setCreatePosTags(false); // We handle POS tags via the column already converter.setRootLabel("TOP"); @@ -274,57 +310,65 @@ public void convert(JCas aJCas, BufferedReader aReader) while (wordIterator.hasNext()) { String[] word = wordIterator.next(); // Read token - Token token = doc.add(word[FORM], Token.class); - tokenById.put(Integer.valueOf(word[ID]), token); + Token token = doc.add(trim(word[FORM]), Token.class); + tokenById.put(Integer.valueOf(trim(word[ID])), token); if (wordIterator.hasNext()) { doc.add(" "); } // Read lemma - if (!UNUSED.equals(word[LEMMA]) && readLemma) { + String lemmaValue = trim(word[LEMMA]); + if (!UNUSED.equals(lemmaValue) && readLemma) { Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(word[LEMMA]); + lemma.setValue(lemmaValue); lemma.addToIndexes(); token.setLemma(lemma); } // Read part-of-speech tag - if (!UNUSED.equals(word[POS]) && readPos) { - Type posTag = posMappingProvider.getTagType(word[POS]); + String posValue = cleanTag(word[POS]); + if (!UNUSED.equals(posValue) && readPos) { + Type posTag = posMappingProvider.getTagType(posValue); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - pos.setPosValue(word[POS].intern()); + pos.setPosValue(posValue); POSUtils.assignCoarseValue(pos); pos.addToIndexes(); token.setPos(pos); } - if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) { + String predValue = trim(word[PRED]); + if (!UNUSED.equals(predValue) && readSemanticPredicate) { SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd()); - pred.setCategory(word[PRED]); + pred.setCategory(predValue); pred.addToIndexes(); preds.add(pred); } - if (!UNUSED.equals(word[PARSE]) && readConstituent) { - String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM] + ")"); + String constituentFragmentValue = trim(word[PARSE]); + if (!UNUSED.equals(constituentFragmentValue) && readConstituent) { + String fixed = constituentFragmentValue.replace("*", + "(" + posValue + " " + trim(word[FORM]) + ")"); parse.append(fixed); } - if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) { + String wordSenseValue = trim(word[WORD_SENSE]); + if (!UNUSED.equals(wordSenseValue) && readWordSense) { WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd()); - wordSense.setValue(word[WORD_SENSE]); + wordSense.setValue(wordSenseValue); wordSense.addToIndexes(); } - if (!UNUSED.equals(word[word.length-1]) && readCoreference) { - String[] chainFragments = word[word.length-1].split("\\|"); + String coreferenceValue = trim(word[word.length - 1]); + if (!UNUSED.equals(coreferenceValue) && readCoreference) { + String[] chainFragments = Arrays.stream(coreferenceValue.split("\\|")) + .map(this::trim).toArray(String[]::new); for (String chainFragment : chainFragments) { boolean beginning = chainFragment.startsWith("("); boolean ending = chainFragment.endsWith(")"); String chainId = chainFragment.substring(beginning ? 1 : 0, - ending ? chainFragment.length() -1 : chainFragment.length()); + ending ? chainFragment.length() - 1 : chainFragment.length()); CoreferenceLink link = chains.get(chainId); if (beginning) { @@ -360,19 +404,20 @@ public void convert(JCas aJCas, BufferedReader aReader) int currentNeBegin = -1; String currentNeType = null; for (int i = 0; i < words.size(); i++) { - String ne = words.get(i)[NAMED_ENTITIES]; + String ne = trim(words.get(i)[NAMED_ENTITIES]); boolean beginning = ne.startsWith("("); boolean ending = ne.endsWith(")"); // When a NE is beginning, we remember what the NE is and where it began if (beginning) { - // The NE is beginning with "(" and either ending with "(" or "*", so we trim - // the first and last character - currentNeType = ne.substring(1, ne.length()-1); + // The NE is beginning with "(" and either ending with "(" or "*", so we + // trim the first and last character + currentNeType = cleanTag(ne.substring(1, ne.length() - 1)); currentNeBegin = i; } - // We need to create an annotation if the current token is the end of an annotation + // We need to create an annotation if the current token is the end of an + // annotation if (ending) { // Determine begin and end of named entity int begin = tokenById.get(currentNeBegin).getBegin(); @@ -400,7 +445,7 @@ public void convert(JCas aJCas, BufferedReader aReader) int currentArgBegin = -1; String currentArgType = null; for (int i = 0; i < words.size(); i++) { - String ne = words.get(i)[APRED + p]; + String ne = trim(words.get(i)[APRED + p]); boolean beginning = ne.startsWith("("); boolean ending = ne.endsWith(")"); @@ -408,7 +453,7 @@ public void convert(JCas aJCas, BufferedReader aReader) if (beginning) { // The arg is beginning with "(" and either ending with "(" or "*", so // we trim the first and last character - currentArgType = ne.substring(1, ne.length()-1); + currentArgType = cleanTag(ne.substring(1, ne.length() - 1)); currentArgBegin = i; } @@ -445,7 +490,9 @@ public void convert(JCas aJCas, BufferedReader aReader) Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); sentence.addToIndexes(); - converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString())); + if (readConstituent) { + converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString())); + } // Once sentence per line. doc.add("\n"); @@ -472,7 +519,7 @@ private List readSentence(JCas aJCas, BufferedReader aReader) Matcher matcher = pattern.matcher(line); if (matcher.matches()) { DocumentMetaData meta = DocumentMetaData.get(aJCas); - meta.setDocumentId(matcher.group(1)+'#'+matcher.group(2)); + meta.setDocumentId(matcher.group(1) + '#' + matcher.group(2)); } } diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012Writer.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2012Writer.java similarity index 79% rename from dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012Writer.java rename to dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2012Writer.java index 3f008f8075..4e29d859a6 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/Conll2012Writer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; import static java.util.Arrays.asList; import static org.apache.commons.io.IOUtils.closeQuietly; @@ -26,12 +26,15 @@ import java.io.OutputStreamWriter; import java.io.PrintWriter; +import java.util.ArrayList; import java.util.Collection; +import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import org.apache.commons.lang3.StringUtils; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -42,15 +45,17 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.penntree.PennTreeNode; +import org.dkpro.core.io.penntree.PennTreeUtils; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; @@ -58,23 +63,26 @@ import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeNode; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writer for the CoNLL-2012 format. */ -@ResourceMetaData(name="CoNLL 2012 Writer") +@ResourceMetaData(name = "CoNLL 2012 Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2012}) @TypeCapability( inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", - "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg" }) + "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg", + "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense" }) public class Conll2012Writer extends JCasFileWriter_ImplBase { @@ -86,24 +94,46 @@ public class Conll2012Writer * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") private String filenameSuffix; + /** + * Write part-of-speech information. + */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePos; + /** + * Write lemma information. + */ public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") private boolean writeLemma; + /** + * Write semantic predicate infomation. + */ public static final String PARAM_WRITE_SEMANTIC_PREDICATE = "writeSemanticPredicate"; @ConfigurationParameter(name = PARAM_WRITE_SEMANTIC_PREDICATE, mandatory = true, defaultValue = "true") private boolean writeSemanticPredicate; + + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; @Override public void process(JCas aJCas) @@ -134,13 +164,13 @@ public void process(JCas aJCas) private void convert(JCas aJCas, PrintWriter aOut) { - Map> predIdx = indexCovered(aJCas, Token.class, SemPred.class); - Map> argIdx = indexCovered(aJCas, SemArg.class, Token.class); - Map> neIdx = indexCovering(aJCas, Token.class, + Map> predIdx = indexCovered(aJCas, Token.class, SemPred.class); + Map> argIdx = indexCovered(aJCas, SemArg.class, Token.class); + Map> neIdx = indexCovering(aJCas, Token.class, NamedEntity.class); - Map> wordSenseIdx = indexCovered(aJCas, Token.class, + Map> wordSenseIdx = indexCovered(aJCas, Token.class, WordSense.class); - Map> corefIdx = indexCovering(aJCas, Token.class, + Map> corefIdx = indexCovering(aJCas, Token.class, CoreferenceLink.class); Map corefChainIdx = new HashMap<>(); @@ -158,8 +188,19 @@ private void convert(JCas aJCas, PrintWriter aOut) // Tokens List tokens = selectCovered(Token.class, sentence); - List preds = selectCovered(SemPred.class, sentence); - + // Collect the predicates for this sentence in text order + List preds = new ArrayList<>(); + for (Entry> e : predIdx.entrySet()) { + if (!e.getValue().isEmpty() && tokens.contains(e.getKey())) { + // If there are multiple semantic predicates for the current token, then + // we keep only the first + preds.add(e.getValue().get(0)); + } + } + preds.sort(Comparator + .comparing(SemPred::getBegin) + .thenComparing(SemPred::getEnd, Comparator.reverseOrder())); + String[] parseFragments = null; List root = selectCovered(ROOT.class, sentence); if (root.size() == 1) { @@ -236,6 +277,9 @@ private void convert(JCas aJCas, PrintWriter aOut) int id = row.id; String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } String lemma = UNUSED + " "; if (writeLemma && (row.token.getLemma() != null)) { @@ -273,27 +317,36 @@ private void convert(JCas aJCas, PrintWriter aOut) pred = row.pred.getCategory(); } + // Get which args column the current predicate is destined for. Every args + // column only gets a single predicate. + int predForCol = preds.indexOf(row.pred); + + int argsCol = 0; for (SemArgLink link : row.args) { - if (apreds.length() > 0) { apreds.append(" "); } String value; - if (link == null) { - if (row.pred != null && row.pred.getBegin() == row.token.getBegin() - && row.pred.getEnd() == row.token.getEnd()) { - value = "(V*)"; - } - else { - value = ALT_UNUSED + ' '; - } + // If this is the column for the predicate marker,then we write that - + // ignoring any arguments that might be at the same position + if (row.pred != null && argsCol == predForCol) { + value = "(V*)"; } - else { + // ... otherwise - if there is an argument at this position, then write that + else if (link != null) { value = encodeMultiTokenAnnotation(row.token, link.getTarget(), link.getRole()); } + // ... and if there is neither the predicate marker nor the argument, then + // mark as unused + else { + value = ALT_UNUSED + ' '; + } + apreds.append(String.format("%10s", value)); + + argsCol++; } } @@ -358,7 +411,7 @@ private String encodeMultiTokenLink(Token aToken, AnnotationFS aAnnotation, Inte if (begin) { buf.append('('); } - if (begin|end) { + if (begin | end) { buf.append(aChainId); } if (end) { diff --git a/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllCoreNlpReader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllCoreNlpReader.java new file mode 100644 index 0000000000..4d9543fe20 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllCoreNlpReader.java @@ -0,0 +1,384 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor.BASIC; +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Type; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.sequencecodec.AdjacentLabelCodec; +import org.dkpro.core.api.io.sequencecodec.SequenceItem; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reads files in the default CoreNLP CoNLL format. + * + * @see CoreNLP CoNLLOutputter + */ +@ResourceMetaData(name = "CoNLL CoreNLP Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_CORENLP}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) +public class ConllCoreNlpReader + extends ConllReader_ImplBase +{ + /** + * Character encoding of the input data. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + /** + * Read fine-grained part-of-speech information. + */ + public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; + @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") + private boolean readPos; + + /** + * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the + * tag set defined as part of the model meta data. This can be useful if a custom model is + * specified which does not have such meta data, or it can be used in readers. + */ + public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; + @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) + protected String posTagset; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Location of the mapping file for named entity tags to UIMA types. + */ + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = + ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) + private String namedEntityMappingLocation; + + /** + * Read morphological features. + */ + public static final String PARAM_READ_NAMED_ENTITY = + ComponentParameters.PARAM_READ_NAMED_ENTITY; + @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") + private boolean readNer; + + /** + * Read lemma information. + */ + public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; + @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") + private boolean readLemma; + + /** + * Read syntactic dependency information. + */ + public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; + @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") + private boolean readDependency; + + private static final String UNUSED = "_"; + + private static final int ID = 0; + private static final int FORM = 1; + private static final int LEMMA = 2; + private static final int POSTAG = 3; + private static final int NER = 4; + private static final int HEAD = 5; + private static final int DEPREL = 6; + + private MappingProvider posMappingProvider; + private MappingProvider namedEntityMappingProvider; + + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); + + namedEntityMappingProvider = new MappingProvider(); + namedEntityMappingProvider.setDefault(MappingProvider.LOCATION, + "classpath:/there/is/no/mapping/yet"); + namedEntityMappingProvider.setDefault(MappingProvider.BASE_TYPE, + NamedEntity.class.getName()); + namedEntityMappingProvider.setOverride(MappingProvider.LOCATION, + namedEntityMappingLocation); + namedEntityMappingProvider.setOverride(MappingProvider.LANGUAGE, getLanguage()); + } + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + BufferedReader reader = null; + try { + reader = new BufferedReader(new InputStreamReader( + CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), + sourceEncoding)); + convert(aJCas, reader); + } + finally { + closeQuietly(reader); + } + } + + public void convert(JCas aJCas, BufferedReader aReader) + throws IOException + { + if (readPos) { + try { + posMappingProvider.configure(aJCas.getCas()); + } + catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } + } + + if (readNer) { + try { + namedEntityMappingProvider.configure(aJCas.getCas()); + } + catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } + } + + JCasBuilder doc = new JCasBuilder(aJCas); + + List words; + while ((words = readSentence(aReader)) != null) { + if (words.isEmpty()) { + // Ignore empty sentences. This can happen when there are multiple end-of-sentence + // markers following each other. + continue; + } + + int sentenceBegin = doc.getPosition(); + int sentenceEnd = sentenceBegin; + + // Tokens, Lemma, POS + Map tokens = new HashMap(); + Iterator wordIterator = words.iterator(); + while (wordIterator.hasNext()) { + String[] word = wordIterator.next(); + // Read token + Token token = doc.add(trim(word[FORM]), Token.class); + tokens.put(Integer.valueOf(trim(word[ID])), token); + if (wordIterator.hasNext()) { + doc.add(" "); + } + + // Read lemma + String lemmaValue = trim(word[LEMMA]); + if (!UNUSED.equals(lemmaValue) && readLemma) { + Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); + lemma.setValue(lemmaValue); + lemma.addToIndexes(); + token.setLemma(lemma); + } + + // Read part-of-speech tag + String tag = cleanTag(word[POSTAG]); + if (!UNUSED.equals(tag) && readPos) { + Type posTag = posMappingProvider.getTagType(tag); + POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), + token.getEnd()); + pos.setPosValue(tag); + pos.addToIndexes(); + token.setPos(pos); + } + + sentenceEnd = token.getEnd(); + } + + // Read named entities + if (readNer) { + List encodedNerSpans = words.stream().map(w -> { + int id = Integer.valueOf(trim(w[ID])); + return new SequenceItem(id, id, trim(w[NER])); + }).collect(Collectors.toList()); + + AdjacentLabelCodec codec = new AdjacentLabelCodec(1); + List decodedNerSpans = codec.decode(encodedNerSpans); + + for (SequenceItem nerSpan : decodedNerSpans) { + Type nerType = namedEntityMappingProvider.getTagType(nerSpan.getLabel()); + Token beginToken = tokens.get(nerSpan.getBegin()); + Token endToken = tokens.get(nerSpan.getEnd()); + NamedEntity ne = (NamedEntity) aJCas.getCas().createAnnotation(nerType, + beginToken.getBegin(), endToken.getEnd()); + ne.setValue(cleanTag(nerSpan.getLabel())); + ne.addToIndexes(); + } + } + + // Read dependencies + if (readDependency) { + for (String[] word : words) { + String depRel = cleanTag(word[DEPREL]); + if (!UNUSED.equals(depRel)) { + int depId = Integer.valueOf(trim(word[ID])); + int govId = Integer.valueOf(trim(word[HEAD])); + + // Model the root as a loop onto itself + if (govId == 0) { + Dependency rel = new ROOT(aJCas); + rel.setGovernor(tokens.get(depId)); + rel.setDependent(tokens.get(depId)); + rel.setDependencyType(depRel); + rel.setBegin(rel.getDependent().getBegin()); + rel.setEnd(rel.getDependent().getEnd()); + rel.setFlavor(BASIC); + rel.addToIndexes(); + } + else { + Dependency rel = new Dependency(aJCas); + rel.setGovernor(tokens.get(govId)); + rel.setDependent(tokens.get(depId)); + rel.setDependencyType(depRel); + rel.setBegin(rel.getDependent().getBegin()); + rel.setEnd(rel.getDependent().getEnd()); + rel.setFlavor(BASIC); + rel.addToIndexes(); + } + } + } + } + + // Sentence + Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); + sentence.addToIndexes(); + + // Once sentence per line. + doc.add("\n"); + } + + doc.close(); + } + + /** + * Read a single sentence. + */ + private static List readSentence(BufferedReader aReader) + throws IOException + { + List words = new ArrayList(); + String line; + boolean firstLineOfSentence = true; + while ((line = aReader.readLine()) != null) { + if (StringUtils.isBlank(line)) { + firstLineOfSentence = true; + break; // End of sentence + } + + if (line.startsWith("<") && line.endsWith(">")) { + // FinnTreeBank uses pseudo-XML to attach extra metadata to sentences. + // Currently, we just ignore this. + break; // Consider end of sentence + } + + if (firstLineOfSentence && line.startsWith("#")) { + // GUM uses a comment to attach extra metadata to sentences. + // Currently, we just ignore this. + break; // Consider end of sentence + } + + firstLineOfSentence = false; + + String[] fields = line.split("\t"); + if (fields.length != 7) { + throw new IOException( + "Invalid file format. Line needs to have 7 tab-separated fields, but it has " + + fields.length + ": [" + line + "]"); + } + words.add(fields); + } + + if (line == null && words.isEmpty()) { + return null; + } + else { + return words; + } + } +} diff --git a/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllCoreNlpWriter.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllCoreNlpWriter.java new file mode 100644 index 0000000000..7ea9863841 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllCoreNlpWriter.java @@ -0,0 +1,245 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.TreeMap; +import java.util.stream.Collectors; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.io.sequencecodec.AdjacentLabelCodec; +import org.dkpro.core.api.io.sequencecodec.SequenceItem; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Write files in the default CoreNLP CoNLL format. + * + * @see CoreNLP CoNLLOutputter + */ +@ResourceMetaData(name = "CoNLL CoreNLP Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_CORENLP}) +@TypeCapability(inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) +public class ConllCoreNlpWriter + extends JCasFileWriter_ImplBase +{ + private static final String UNUSED = "_"; + private static final int UNUSED_INT = -1; + + /** + * Character encoding of the output data. + */ + public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String targetEncoding; + + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") + private String filenameSuffix; + + /** + * Write fine-grained part-of-speech information. + */ + public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; + @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") + private boolean writePos; + + /** + * Write named entity information. + */ + public static final String PARAM_WRITE_NAMED_ENTITY = + ComponentParameters.PARAM_WRITE_NAMED_ENTITY; + @ConfigurationParameter(name = PARAM_WRITE_NAMED_ENTITY, mandatory = true, defaultValue = "true") + private boolean writeNamedEntity; + + /** + * Write lemma information. + */ + public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; + @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") + private boolean writeLemma; + + /** + * Write syntactic dependency information. + */ + public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; + @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") + private boolean writeDependency; + + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + try (PrintWriter out = new PrintWriter( + new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), targetEncoding));) { + + convert(aJCas, out); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + private void convert(JCas aJCas, PrintWriter aOut) + { + for (Sentence sentence : select(aJCas, Sentence.class)) { + Map ctokens = new LinkedHashMap<>(); + NavigableMap tokenBeginIndex = new TreeMap<>(); + NavigableMap tokenEndIndex = new TreeMap<>(); + + // Tokens + List tokens = selectCovered(Token.class, sentence); + + for (int i = 0; i < tokens.size(); i++) { + Row row = new Row(); + row.id = i + 1; + row.token = tokens.get(i); + ctokens.put(row.token, row); + tokenBeginIndex.put(row.token.getBegin(), row.token); + tokenEndIndex.put(row.token.getEnd(), row.token); + } + + // Dependencies + List basicDeps = selectCovered(Dependency.class, sentence).stream() + .filter(dep -> dep.getFlavor() == null || + DependencyFlavor.BASIC.equals(dep.getFlavor())) + .collect(Collectors.toList()); + for (Dependency rel : basicDeps) { + Row row = ctokens.get(rel.getDependent()); + if (row.deprel != null) { + String form = row.token.getCoveredText(); + throw new IllegalStateException("Illegal basic dependency structure - token [" + + form + + "] is dependent of more than one dependency."); + } + row.deprel = rel; + } + + // Named entities + List nerSpans = new ArrayList<>(); + for (NamedEntity ne : selectCovered(NamedEntity.class, sentence)) { + Token beginToken = tokenBeginIndex.floorEntry(ne.getBegin()).getValue(); + Token endToken = tokenEndIndex.ceilingEntry(ne.getEnd()).getValue(); + nerSpans.add(new SequenceItem(ctokens.get(beginToken).id, ctokens.get(endToken).id, + ne.getValue())); + } + AdjacentLabelCodec codec = new AdjacentLabelCodec(1); + List encodedNe = codec.encode(nerSpans, tokens.size()); + for (int i = 0; i < encodedNe.size(); i++) { + ctokens.get(tokens.get(i)).ne = encodedNe.get(i).getLabel(); + } + + // Write sentence + for (Row row : ctokens.values()) { + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + + String lemma = UNUSED; + if (writeLemma && (row.token.getLemma() != null)) { + lemma = row.token.getLemma().getValue(); + } + + String pos = UNUSED; + if (writePos && (row.token.getPos() != null)) { + POS posAnno = row.token.getPos(); + pos = posAnno.getPosValue(); + } + + int headId = UNUSED_INT; + String deprel = UNUSED; + if (writeDependency && (row.deprel != null)) { + deprel = row.deprel.getDependencyType(); + headId = ctokens.get(row.deprel.getGovernor()).id; + if (headId == row.id) { + // ROOT dependencies may be modeled as a loop, ignore these. + headId = 0; + } + } + + String head = UNUSED; + if (headId != UNUSED_INT) { + head = Integer.toString(headId); + } + + String ner = UNUSED; + if (writeNamedEntity && (row.ne != null)) { + ner = row.ne; + } + + aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, form, lemma, pos, ner, head, + deprel); + } + + aOut.println(); + } + } + + private static final class Row + { + int id; + Token token; + String ne; + Dependency deprel; + } +} diff --git a/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllUReader.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllUReader.java new file mode 100644 index 0000000000..66f95f4682 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllUReader.java @@ -0,0 +1,518 @@ +/* + * Copyright 2016 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor.BASIC; +import static de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor.ENHANCED; +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.conll.internal.ConllReader_ImplBase; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; + +/** + * Reads a file in the CoNLL-U format. + * + * @see CoNLL-U Format + */ +@ResourceMetaData(name = "CoNLL-U Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_U}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) +public class ConllUReader + extends ConllReader_ImplBase +{ + /** + * Character encoding of the input data. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + /** + * Read fine-grained part-of-speech information. + */ + public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; + @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") + private boolean readPos; + + /** + * Read coarse-grained part-of-speech information. + */ + public static final String PARAM_READ_CPOS = ComponentParameters.PARAM_READ_CPOS; + @ConfigurationParameter(name = PARAM_READ_CPOS, mandatory = true, defaultValue = "true") + private boolean readCPos; + + /** + * Treat coarse-grained part-of-speech as fine-grained part-of-speech information. + */ + public static final String PARAM_USE_CPOS_AS_POS = "useCPosAsPos"; + @ConfigurationParameter(name = PARAM_USE_CPOS_AS_POS, mandatory = true, defaultValue = "false") + private boolean useCPosAsPos; + + /** + * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the + * tag set defined as part of the model meta data. This can be useful if a custom model is + * specified which does not have such meta data, or it can be used in readers. + */ + public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; + @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) + protected String posTagset; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Read morphological features. + */ + public static final String PARAM_READ_MORPH = ComponentParameters.PARAM_READ_MORPH; + @ConfigurationParameter(name = PARAM_READ_MORPH, mandatory = true, defaultValue = "true") + private boolean readMorph; + + /** + * Read lemma information. + */ + public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; + @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") + private boolean readLemma; + + /** + * Read syntactic dependency information. + */ + public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; + @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") + private boolean readDependency; + + /** + * Read paragraph information. If no paragraph information is provided in the file, or if set + * to false, then output one sentence per line, separated by an empty line. + */ + public static final String PARAM_READ_PARAGRAPH = ComponentParameters.PARAM_READ_PARAGRAPH; + @ConfigurationParameter(name = PARAM_READ_PARAGRAPH, mandatory = true, defaultValue = "true") + private boolean readParagraph; + + private static final String UNUSED = "_"; + + private static final int ID = 0; + private static final int FORM = 1; + private static final int LEMMA = 2; + private static final int CPOSTAG = 3; + private static final int POSTAG = 4; + private static final int FEATS = 5; + private static final int HEAD = 6; + private static final int DEPREL = 7; + private static final int DEPS = 8; + private static final int MISC = 9; + + public static final String META_SEND_ID = "sent_id"; + public static final String META_DOCUMENT_ID = "newdoc id"; + public static final String META_PARAGRAPH_ID = "newpar id"; + public static final String META_TEXT = "text"; + + private MappingProvider posMappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); + } + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + BufferedReader reader = null; + try { + reader = new BufferedReader(new InputStreamReader( + CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), + sourceEncoding)); + convert(aJCas, reader); + } + finally { + closeQuietly(reader); + } + } + + public void convert(JCas aJCas, BufferedReader aReader) + throws IOException + { + if (readPos) { + try { + posMappingProvider.configure(aJCas.getCas()); + } + catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } + } + + JCasBuilder doc = new JCasBuilder(aJCas); + + Paragraph p = null; + int lastSentenceEndPosition = 0; + boolean shouldAddSpace = false; + Optional documentContainsParagraphInformation = Optional.empty(); + List documentIDValues = new ArrayList<>(); + while (true) { + // Read sentence comments (if any) + Map comments = readSentenceComments(aReader); + + if (!documentContainsParagraphInformation.isPresent()) { + documentContainsParagraphInformation = Optional.of( + comments.keySet().contains(META_PARAGRAPH_ID)); + } + + + // Read sentence + List words = readSentence(aReader); + if (words == null) { + // End of file + break; + } + + if (words.isEmpty()) { + // Ignore empty sentences. This can happen when there are multiple end-of-sentence + // markers following each other. + continue; + } + if (comments.keySet().contains(META_DOCUMENT_ID)) { + documentIDValues.add(comments.get(META_DOCUMENT_ID)); + } + + + if (!readParagraph || !documentContainsParagraphInformation.get()) { + if (doc.getPosition() > 0) { + doc.add("\n"); + shouldAddSpace = false; + } + } else if (readParagraph) { + if (p != null && comments.keySet().contains(META_PARAGRAPH_ID)) { + doc.add("\n\n"); + shouldAddSpace = false; + } + } + + int sentenceBegin = doc.getPosition(); + int sentenceEnd = sentenceBegin; + + int surfaceBegin = -1; + int surfaceEnd = -1; + String surfaceString = null; + + // Tokens, Lemma, POS + Int2ObjectMap tokens = new Int2ObjectOpenHashMap<>(); + Iterator wordIterator = words.iterator(); + while (wordIterator.hasNext()) { + String[] word = wordIterator.next(); + + String idValue = trim(word[ID]); + String formValue = trim(word[FORM]); + + if (idValue.contains("-")) { + String[] fragments = idValue.split("-"); + surfaceBegin = Integer.valueOf(trim(fragments[0])); + surfaceEnd = Integer.valueOf(trim(fragments[1])); + surfaceString = formValue; + continue; + } +// the following must be placed after check for dashes in ID in order not to insert +// unnecessary spaces + if (shouldAddSpace) { + if (doc.getPosition() == sentenceBegin) { + sentenceBegin++; + } + doc.add(" "); + } + + // Read token + int tokenIdx = Integer.valueOf(idValue); + Token token = doc.add(formValue, Token.class); + tokens.put(tokenIdx, token); + shouldAddSpace = !StringUtils.contains(word[MISC], "SpaceAfter=No"); + + // Read lemma + String lemmaValue = trim(word[LEMMA]); + if (!UNUSED.equals(lemmaValue) && readLemma) { + Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); + lemma.setValue(lemmaValue); + lemma.addToIndexes(); + token.setLemma(lemma); + } + + // Read part-of-speech tag + POS pos = null; + String cPosTag = cleanTag(word[CPOSTAG]); + String tag = useCPosAsPos ? cPosTag : cleanTag(word[POSTAG]); + if (!UNUSED.equals(tag) && readPos) { + Type posTag = posMappingProvider.getTagType(tag); + pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), + token.getEnd()); + pos.setPosValue(tag); + } + + // Read coarse part-of-speech tag + if (!UNUSED.equals(cPosTag) && readCPos) { + if (pos == null) { + pos = new POS(aJCas, token.getBegin(), token.getEnd()); + } + pos.setCoarseValue(cPosTag); + } + + if (pos != null) { + pos.addToIndexes(); + token.setPos(pos); + } + + // Read morphological features + String featsValue = cleanTag(word[FEATS]); + if (!UNUSED.equals(featsValue) && readMorph) { + MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, + token.getBegin(), token.getEnd()); + morphtag.setValue(featsValue); + morphtag.addToIndexes(); + token.setMorph(morphtag); + + // Try parsing out individual feature values. Since the DKPro Core + // MorphologicalFeatures type is based on the definition from the UD project, + // we can do this rather straightforwardly. + Type morphType = morphtag.getType(); + String[] items = featsValue.split("\\|"); + for (String item : items) { + String[] keyValue = item.split("="); + StringBuilder key = new StringBuilder(trim(keyValue[0])); + key.setCharAt(0, Character.toLowerCase(key.charAt(0))); + String value = trim(keyValue[1]); + + Feature feat = morphType.getFeatureByBaseName(key.toString()); + if (feat != null) { + morphtag.setStringValue(feat, value); + } + } + } + + // Read surface form + if (tokenIdx == surfaceEnd) { + int begin = tokens.get(surfaceBegin).getBegin(); + int end = tokens.get(surfaceEnd).getEnd(); + SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end); + surfaceForm.setValue(surfaceString); + surfaceForm.addToIndexes(); + surfaceBegin = -1; + surfaceEnd = -1; + surfaceString = null; + } + + sentenceEnd = token.getEnd(); + } + + // Dependencies + if (readDependency) { + for (String[] word : words) { + String depRelValue = cleanTag(word[DEPREL]); + + if (!UNUSED.equals(depRelValue)) { + int depId = Integer.valueOf(trim(word[ID])); + int govId = Integer.valueOf(trim(word[HEAD])); + + // Model the root as a loop onto itself + makeDependency(aJCas, govId, depId, depRelValue, BASIC, tokens, + word); + } + + String depsValue = trim(word[DEPS]); + if (!UNUSED.equals(depsValue)) { + // list items separated by vertical bar + String[] items = depsValue.split("\\|"); + for (String item : items) { + String[] sItem = item.split(":"); + + int depId = Integer.valueOf(trim(word[ID])); + int govId = Integer.valueOf(trim(sItem[0])); + + makeDependency(aJCas, govId, depId, cleanTag(sItem[1]), ENHANCED, + tokens, word); + } + } + } + } + + // Sentence + Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); + sentence.setId(comments.get(META_SEND_ID)); + sentence.addToIndexes(); + + + if (comments.keySet().contains(META_PARAGRAPH_ID)) { + final String paragraphID = comments.get(META_PARAGRAPH_ID); + if (p != null) { + // do nothing + p.setEnd(lastSentenceEndPosition); + p.addToIndexes(); + } + p = new Paragraph(aJCas, sentenceBegin, sentenceEnd); + p.setId(paragraphID); + p.setDivType("p"); // `p` for paragraph - using the HTML tag here + } + lastSentenceEndPosition = sentenceEnd; + } + if (p != null) { + p.setEnd(lastSentenceEndPosition); + p.addToIndexes(); + } + if (documentIDValues.size() > 0) { + DocumentMetaData m = DocumentMetaData.get(aJCas); + String documentID = String.join(";", documentIDValues); + if (documentIDValues.size() > 1) { + final String fileUri = m.getDocumentUri(); + getLogger().warn(String.format("File %s contains multiple document IDs: %s", + fileUri, documentIDValues)); + + } + m.setDocumentId(documentID); + } + doc.close(); + } + + private Map readSentenceComments(BufferedReader aReader) + throws IOException + { + Map comments = new LinkedHashMap<>(); + + while (true) { + // Check if the next line could be a header line + aReader.mark(2); + char character = (char) aReader.read(); + if ('#' == character) { + // Read the rest of the line + String line = aReader.readLine(); + if (line.contains("=")) { + String[] parts = line.split("=", 2); + comments.put(parts[0].trim(), parts[1].trim()); + } + else { + // Comment or unknown header line + } + } + else { + aReader.reset(); + break; + } + } + + return comments; + } + + /** + * Read a single sentence. + */ + private static List readSentence(BufferedReader aReader) + throws IOException + { + List words = new ArrayList<>(); + String line; + while ((line = aReader.readLine()) != null) { + if (StringUtils.isBlank(line)) { + break; // End of sentence + } + if (line.startsWith("#")) { + // Comment line + continue; + } + String[] fields = line.split("\t"); + if (fields.length != 10) { + throw new IOException( + "Invalid file format. Line needs to have 10 tab-separated fields, but it has " + + fields.length + ": [" + line + "]"); + } + words.add(fields); + } + + if (line == null && words.isEmpty()) { + return null; + } + else { + return words; + } + } +} diff --git a/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllUWriter.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllUWriter.java new file mode 100644 index 0000000000..ab96d4603c --- /dev/null +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllUWriter.java @@ -0,0 +1,303 @@ +/* + * Copyright 2016 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import eu.openminted.share.annotations.api.DocumentationResource; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; + +/** + * Writes a file in the CoNLL-U format. + * + * @see CoNLL-U Format + */ +@ResourceMetaData(name = "CoNLL-U Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_U}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) +public class ConllUWriter + extends JCasFileWriter_ImplBase +{ + private static final String UNUSED = "_"; + private static final int UNUSED_INT = -1; + + /** + * Character encoding of the output data. + */ + public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String targetEncoding; + + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conllu") + private String filenameSuffix; + + /** + * Write fine-grained part-of-speech information. + */ + public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; + @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") + private boolean writePos; + + /** + * Write coarse-grained part-of-speech information. + */ + public static final String PARAM_WRITE_CPOS = ComponentParameters.PARAM_WRITE_CPOS; + @ConfigurationParameter(name = PARAM_WRITE_CPOS, mandatory = true, defaultValue = "true") + private boolean writeCPos; + + /** + * Write morphological features. + */ + public static final String PARAM_WRITE_MORPH = ComponentParameters.PARAM_WRITE_MORPH; + @ConfigurationParameter(name = PARAM_WRITE_MORPH, mandatory = true, defaultValue = "true") + private boolean writeMorph; + + /** + * Write lemma information. + */ + public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; + @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") + private boolean writeLemma; + + /** + * Write syntactic dependency information. + */ + public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; + @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") + private boolean writeDependency; + + /** + * Write text covered by the token instead of the token form. + */ + public static final String PARAM_WRITE_COVERED_TEXT = + ComponentParameters.PARAM_WRITE_COVERED_TEXT; + @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") + private boolean writeCovered; + + /** + * Include the full sentence text as a comment in front of each sentence. + */ + public static final String PARAM_WRITE_TEXT_COMMENT = "writeTextComment"; + @ConfigurationParameter(name = PARAM_WRITE_TEXT_COMMENT, mandatory = true, defaultValue = "true") + private boolean writeTextHeader; + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + try (PrintWriter out = new PrintWriter( + new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), targetEncoding));) { + convert(aJCas, out); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + private void convert(JCas aJCas, PrintWriter aOut) + { + Map> surfaceIdx = indexCovered(aJCas, SurfaceForm.class, + Token.class); + Int2ObjectMap surfaceBeginIdx = new Int2ObjectOpenHashMap<>(); + for (SurfaceForm sf : select(aJCas, SurfaceForm.class)) { + surfaceBeginIdx.put(sf.getBegin(), sf); + } + + for (Sentence sentence : select(aJCas, Sentence.class)) { + Map ctokens = new LinkedHashMap<>(); + + // Comments + if (sentence.getId() != null) { + aOut.printf("# %s = %s\n", ConllUReader.META_SEND_ID, sentence.getId()); + } + if (writeTextHeader) { + String sentenceText = sentence.getCoveredText(); + // CoNLL-U does not support line breaks in the sentence text, so we need to replace + // such characters. + sentenceText = StringUtils.replaceChars(sentenceText, "\n\r", " "); + aOut.printf("# %s = %s\n", ConllUReader.META_TEXT, sentenceText); + } + + // Tokens + List tokens = selectCovered(Token.class, sentence); + + for (int i = 0; i < tokens.size(); i++) { + Row row = new Row(); + row.id = i + 1; + row.token = tokens.get(i); + row.noSpaceAfter = (i + 1 < tokens.size()) + && row.token.getEnd() == tokens.get(i + 1).getBegin(); + ctokens.put(row.token, row); + } + + // Dependencies + for (Dependency rel : selectCovered(Dependency.class, sentence)) { + if (StringUtils.isBlank(rel.getFlavor()) + || DependencyFlavor.BASIC.equals(rel.getFlavor())) { + ctokens.get(rel.getDependent()).deprel = rel; + } + else { + ctokens.get(rel.getDependent()).deps.add(rel); + } + } + + // Write sentence in CONLL-U format + for (Row row : ctokens.values()) { + + String form = row.token.getCoveredText(); + if (!writeCovered) { + form = row.token.getText(); + } + + String lemma = UNUSED; + if (writeLemma && (row.token.getLemma() != null)) { + lemma = row.token.getLemma().getValue(); + } + + String pos = UNUSED; + if (writePos && (row.token.getPos() != null) + && row.token.getPos().getPosValue() != null) { + POS posAnno = row.token.getPos(); + pos = posAnno.getPosValue(); + } + + String cpos = UNUSED; + if (writeCPos && (row.token.getPos() != null) + && row.token.getPos().getCoarseValue() != null) { + POS posAnno = row.token.getPos(); + cpos = posAnno.getCoarseValue(); + } + + int headId = UNUSED_INT; + String deprel = UNUSED; + String deps = UNUSED; + if (writeDependency) { + if ((row.deprel != null)) { + deprel = row.deprel.getDependencyType(); + headId = ctokens.get(row.deprel.getGovernor()).id; + if (headId == row.id) { + // ROOT dependencies may be modeled as a loop, ignore these. + headId = 0; + } + } + + StringBuilder depsBuf = new StringBuilder(); + for (Dependency d : row.deps) { + if (depsBuf.length() > 0) { + depsBuf.append('|'); + } + // Resolve self-looping root to 0-indexed root + int govId = ctokens.get(d.getGovernor()).id; + if (govId == row.id) { + govId = 0; + } + depsBuf.append(govId); + depsBuf.append(':'); + depsBuf.append(d.getDependencyType()); + } + if (depsBuf.length() > 0) { + deps = depsBuf.toString(); + } + } + + String head = UNUSED; + if (headId != UNUSED_INT) { + head = Integer.toString(headId); + } + + String feats = UNUSED; + if (writeMorph && (row.token.getMorph() != null)) { + feats = row.token.getMorph().getValue(); + } + + String misc = UNUSED; + if (row.noSpaceAfter) { + misc = "SpaceAfter=No"; + } + + SurfaceForm sf = surfaceBeginIdx.get(row.token.getBegin()); + if (sf != null) { + @SuppressWarnings({ "unchecked", "rawtypes" }) + List covered = (List) surfaceIdx.get(sf); + int id1 = ctokens.get(covered.get(0)).id; + int id2 = ctokens.get(covered.get(covered.size() - 1)).id; + aOut.printf("%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id1, id2, + sf.getValue(), UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, + UNUSED); + } + + aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, + form, lemma, cpos, pos, feats, head, deprel, deps, + misc); + } + + aOut.println(); + } + } + + private static final class Row + { + int id; + Token token; + boolean noSpaceAfter; + Dependency deprel; + List deps = new ArrayList<>(); + } +} diff --git a/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/internal/ConllReader_ImplBase.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/internal/ConllReader_ImplBase.java new file mode 100644 index 0000000000..7544acf35d --- /dev/null +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/internal/ConllReader_ImplBase.java @@ -0,0 +1,84 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll.internal; + +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; + +/** + * Abstract base class for CoNLL format readers. + */ +public abstract class ConllReader_ImplBase + extends JCasResourceCollectionReader_ImplBase +{ + /** + * Trim field values. + */ + public static final String PARAM_TRIM_FIELDS = "trimFields"; + @ConfigurationParameter(name = PARAM_TRIM_FIELDS, mandatory = true, defaultValue = "true") + protected boolean trimFields; + + protected String cleanTag(String aField) + { + if (aField == null) { + return null; + } + + return trim(aField).intern(); + } + + protected String trim(String aField) + { + if (aField == null || !trimFields) { + return aField; + } + + return aField.trim(); + } + + protected Dependency makeDependency(JCas aJCas, int govId, int depId, String label, + String flavor, Int2ObjectMap tokens, String[] word) + { + Dependency rel; + + if (govId == 0) { + rel = new ROOT(aJCas); + rel.setGovernor(tokens.get(depId)); + rel.setDependent(tokens.get(depId)); + } + else { + rel = new Dependency(aJCas); + rel.setGovernor(tokens.get(govId)); + rel.setDependent(tokens.get(depId)); + } + + rel.setDependencyType(label); + rel.setFlavor(flavor); + rel.setBegin(rel.getDependent().getBegin()); + rel.setEnd(rel.getDependent().getEnd()); + rel.addToIndexes(); + + return rel; + } +} diff --git a/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/package-info.java b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/package-info.java new file mode 100644 index 0000000000..26d1b880b4 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Provides classes for the conversion of conll file formats. + * Especially to and from Brat response formats + */ +package org.dkpro.core.io.conll; diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006ReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006ReaderWriterTest.java deleted file mode 100644 index 3b1a1e9ec0..0000000000 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2006ReaderWriterTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; - -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -//NOTE: This file contains Asciidoc markers for partial inclusion of this file in the documentation -//Do not remove these tags! -public class Conll2006ReaderWriterTest -{ - // Deleted the test file here because it was malformed *and* we had no provenance info. - // However, leaving the test in right now and ignoring it because it is used in the - // documentation. - @Ignore() - @Test - public void roundTrip() - throws Exception - { -// tag::testRoundTrip[] - testRoundTrip( - Conll2006Reader.class, // the reader - Conll2006Writer.class, // the writer - "conll/2006/fk003_2006_08_ZH1.conll"); // the input also used as output reference -// end::testRoundTrip[] - } - - @Test - public void testFinnTreeBank() - throws Exception - { -// tag::testOneWay[] - testOneWay( - Conll2006Reader.class, // the reader - Conll2006Writer.class, // the writer - "conll/2006/fi-ref.conll", // the reference file for the output - "conll/2006/fi-orig.conll"); // the input file for the test -// end::testOneWay[] - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReaderTest.java b/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReaderTest.java deleted file mode 100644 index 3bb85907ba..0000000000 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReaderTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; - -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertMorph; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class ConllUReaderTest -{ - @Test - public void test() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - ConllUReader.class, - ConllUReader.PARAM_LANGUAGE, "en", - ConllUReader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/u/", - ConllUReader.PARAM_PATTERNS, "conllu-en-orig.conll"); - - JCas jcas = new JCasIterable(reader).iterator().next(); - - String[] sentences = { - "They buy and sell books.", - "I have not a clue." }; - - String[] posMapped = { "POS", "POS_VERB", "POS_CONJ", "POS_VERB", "POS_NOUN", "POS_PUNCT", "POS", "POS_VERB", "POS_ADV", - "POS_DET", "POS_NOUN", "POS_PUNCT" }; - - String[] posOriginal = { "PRN", "VB", "CC", "VB", "NNS", ".", "PRN", "VB", "RB", "DT", "NN", - "." }; - - String[] morphologicalFeeatures = { - "[ 0, 4] - - Nom - - - - - Plur - - - - - - - - They (Case=Nom|Number=Plur)", - "[ 5, 8] - - - - - - - - Plur - 3 - - - Pres - - buy (Number=Plur|Person=3|Tense=Pres)", - "[ 13, 17] - - - - - - - - Plur - 3 - - - Pres - - sell (Number=Plur|Person=3|Tense=Pres)", - "[ 18, 23] - - - - - - - - Plur - - - - - - - - books (Number=Plur)", - "[ 25, 26] - - Nom - - - - - Sing - 1 - - - - - - I (Case=Nom|Number=Sing|Person=1)", - "[ 27, 31] - - - - - - - - Sing - 1 - - - Pres - - have (Number=Sing|Person=1|Tense=Pres)", - "[ 32, 35] - - - - - - - Neg - - - - - - - - - not (Negative=Neg)", - "[ 36, 37] - - - - - - - - - - - - Art - - - - a (Definite=Ind|PronType=Art)", - "[ 38, 42] - - - - - - - - Sing - - - - - - - - clue (Number=Sing)" }; - - assertSentence(sentences, select(jcas, Sentence.class)); - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - assertMorph(morphologicalFeeatures, select(jcas, MorphologicalFeatures.class)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReaderWriterTest.java deleted file mode 100644 index 67d7cb8ad2..0000000000 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/ConllUReaderWriterTest.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.*; - -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class ConllUReaderWriterTest -{ - @Test - public void roundTrip() - throws Exception - { - testRoundTrip(ConllUReader.class, ConllUWriter.class, "conll/u/conllu-en-orig.conll"); - } - - @Test - public void withComments() - throws Exception - { - testOneWay(ConllUReader.class, ConllUWriter.class, - "conll/u/conllu-en-ref.conll", - "conll/u/conllu-en-orig2.conll"); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000ReaderTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2000ReaderTest.java similarity index 80% rename from dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000ReaderTest.java rename to dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2000ReaderTest.java index a04a2c28e1..8098f9fc77 100644 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000ReaderTest.java +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2000ReaderTest.java @@ -1,132 +1,137 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; - -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertChunks; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; - -public class Conll2000ReaderTest -{ - - @Test - public void conll2000test() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - Conll2000Reader.class, - Conll2000Reader.PARAM_LANGUAGE, "en", - Conll2000Reader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/2000/", - Conll2000Reader.PARAM_PATTERNS, "chunk2000_test.conll", - Conll2000Reader.PARAM_CHUNK_TAG_SET, "conll2000" - ); - - JCas jcas = new JCasIterable(reader).iterator().next(); - - String[] sentences = new String[] { - "Confidence in the pound is widely expected to take another sharp dive if trade " - + "figures for September , due for release tomorrow , fail to show a substantial " - + "improvement from July and August 's near-record deficits .", - "Chancellor of the Exchequer Nigel Lawson 's restated commitment to a firm " - + "monetary policy has helped to prevent a freefall in sterling over the past " - + "week .", - "But analysts reckon underlying support for sterling has been eroded by the " - + "chancellor 's failure to announce any new policy measures in his Mansion " - + "House speech last Thursday ." }; - - String[] chunks = new String[] { - "[ 0, 10]NC(NP) (Confidence)", - "[ 11, 13]PC(PP) (in)", - "[ 14, 23]NC(NP) (the pound)", - "[ 24, 50]VC(VP) (is widely expected to take)", - "[ 51, 69]NC(NP) (another sharp dive)", - "[ 70, 72]O(SBAR) (if)", - "[ 73, 86]NC(NP) (trade figures)", - "[ 87, 90]PC(PP) (for)", - "[ 91,100]NC(NP) (September)", - "[103,106]ADJC(ADJP) (due)", - "[107,110]PC(PP) (for)", - "[111,118]NC(NP) (release)", - "[119,127]NC(NP) (tomorrow)", - "[130,142]VC(VP) (fail to show)", - "[143,168]NC(NP) (a substantial improvement)", - "[169,173]PC(PP) (from)", - "[174,189]NC(NP) (July and August)", - "[190,213]NC(NP) ('s near-record deficits)", - "[228,230]PC(PP) (of)", - "[231,244]NC(NP) (the Exchequer)", - "[245,257]NC(NP) (Nigel Lawson)", - "[258,280]NC(NP) ('s restated commitment)", - "[281,283]PC(PP) (to)", - "[284,306]NC(NP) (a firm monetary policy)", - "[307,328]VC(VP) (has helped to prevent)", - "[329,339]NC(NP) (a freefall)", - "[340,342]PC(PP) (in)", - "[343,351]NC(NP) (sterling)", - "[352,356]PC(PP) (over)", - "[357,370]NC(NP) (the past week)", - "[378,386]NC(NP) (analysts)", - "[387,393]VC(VP) (reckon)", - "[394,412]NC(NP) (underlying support)", - "[413,416]PC(PP) (for)", - "[417,425]NC(NP) (sterling)", - "[426,441]VC(VP) (has been eroded)", - "[442,444]PC(PP) (by)", - "[445,459]NC(NP) (the chancellor)", - "[460,470]NC(NP) ('s failure)", - "[471,482]VC(VP) (to announce)", - "[483,506]NC(NP) (any new policy measures)", - "[507,509]PC(PP) (in)", - "[510,534]NC(NP) (his Mansion House speech)", - "[535,548]NC(NP) (last Thursday)" }; - - String[] posMapped = { "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_VERB", "POS_ADV", "POS_VERB", "POS_ADP", "POS_VERB", - "POS_DET", "POS_ADJ", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_PROPN", "POS_PUNCT", "POS_ADJ", "POS_ADP", - "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_VERB", "POS_ADP", "POS_VERB", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_ADP", - "POS_PROPN", "POS_CONJ", "POS_PROPN", "POS_X", "POS_ADJ", "POS_NOUN", "POS_PUNCT", "POS_PROPN", "POS_ADP", "POS_DET", - "POS_PROPN", "POS_PROPN", "POS_PROPN", "POS_X", "POS_VERB", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADJ", "POS_NOUN", - "POS_VERB", "POS_VERB", "POS_ADP", "POS_VERB", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADP", "POS_DET", "POS_ADJ", - "POS_NOUN", "POS_PUNCT", "POS_CONJ", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_VERB", - "POS_VERB", "POS_VERB", "POS_ADP", "POS_DET", "POS_NOUN", "POS_X", "POS_NOUN", "POS_ADP", "POS_VERB", "POS_DET", "POS_ADJ", - "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_PRON", "POS_PROPN", "POS_PROPN", "POS_NOUN", "POS_ADJ", "POS_PROPN", "POS_PUNCT" }; - - String[] posOriginal = { "NN", "IN", "DT", "NN", "VBZ", "RB", "VBN", "TO", "VB", "DT", "JJ", - "NN", "IN", "NN", "NNS", "IN", "NNP", ",", "JJ", "IN", "NN", "NN", ",", "VB", "TO", - "VB", "DT", "JJ", "NN", "IN", "NNP", "CC", "NNP", "POS", "JJ", "NNS", ".", "NNP", - "IN", "DT", "NNP", "NNP", "NNP", "POS", "VBN", "NN", "TO", "DT", "NN", "JJ", "NN", - "VBZ", "VBN", "TO", "VB", "DT", "NN", "IN", "NN", "IN", "DT", "JJ", "NN", ".", "CC", - "NNS", "VBP", "VBG", "NN", "IN", "NN", "VBZ", "VBN", "VBN", "IN", "DT", "NN", "POS", - "NN", "TO", "VB", "DT", "JJ", "NN", "NNS", "IN", "PRP$", "NNP", "NNP", "NN", "JJ", - "NNP", "." }; - - assertSentence(sentences, select(jcas, Sentence.class)); - assertChunks(chunks, select(jcas, Chunk.class)); - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - } -} +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertChunks; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; + +public class Conll2000ReaderTest +{ + + @Test + public void conll2000test() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + Conll2000Reader.class, + Conll2000Reader.PARAM_LANGUAGE, "en", + Conll2000Reader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/2000/", + Conll2000Reader.PARAM_PATTERNS, "chunk2000_test.conll", + Conll2000Reader.PARAM_CHUNK_TAG_SET, "conll2000" + ); + + JCas jcas = new JCasIterable(reader).iterator().next(); + + String[] sentences = new String[] { + "Confidence in the pound is widely expected to take another sharp dive if trade " + + "figures for September , due for release tomorrow , fail to show a substantial " + + "improvement from July and August 's near-record deficits .", + "Chancellor of the Exchequer Nigel Lawson 's restated commitment to a firm " + + "monetary policy has helped to prevent a freefall in sterling over the past " + + "week .", + "But analysts reckon underlying support for sterling has been eroded by the " + + "chancellor 's failure to announce any new policy measures in his Mansion " + + "House speech last Thursday ." }; + + String[] chunks = new String[] { + "[ 0, 10]NC(NP) (Confidence)", + "[ 11, 13]PC(PP) (in)", + "[ 14, 23]NC(NP) (the pound)", + "[ 24, 50]VC(VP) (is widely expected to take)", + "[ 51, 69]NC(NP) (another sharp dive)", + "[ 70, 72]O(SBAR) (if)", + "[ 73, 86]NC(NP) (trade figures)", + "[ 87, 90]PC(PP) (for)", + "[ 91,100]NC(NP) (September)", + "[103,106]ADJC(ADJP) (due)", + "[107,110]PC(PP) (for)", + "[111,118]NC(NP) (release)", + "[119,127]NC(NP) (tomorrow)", + "[130,142]VC(VP) (fail to show)", + "[143,168]NC(NP) (a substantial improvement)", + "[169,173]PC(PP) (from)", + "[174,189]NC(NP) (July and August)", + "[190,213]NC(NP) ('s near-record deficits)", + "[228,230]PC(PP) (of)", + "[231,244]NC(NP) (the Exchequer)", + "[245,257]NC(NP) (Nigel Lawson)", + "[258,280]NC(NP) ('s restated commitment)", + "[281,283]PC(PP) (to)", + "[284,306]NC(NP) (a firm monetary policy)", + "[307,328]VC(VP) (has helped to prevent)", + "[329,339]NC(NP) (a freefall)", + "[340,342]PC(PP) (in)", + "[343,351]NC(NP) (sterling)", + "[352,356]PC(PP) (over)", + "[357,370]NC(NP) (the past week)", + "[378,386]NC(NP) (analysts)", + "[387,393]VC(VP) (reckon)", + "[394,412]NC(NP) (underlying support)", + "[413,416]PC(PP) (for)", + "[417,425]NC(NP) (sterling)", + "[426,441]VC(VP) (has been eroded)", + "[442,444]PC(PP) (by)", + "[445,459]NC(NP) (the chancellor)", + "[460,470]NC(NP) ('s failure)", + "[471,482]VC(VP) (to announce)", + "[483,506]NC(NP) (any new policy measures)", + "[507,509]PC(PP) (in)", + "[510,534]NC(NP) (his Mansion House speech)", + "[535,548]NC(NP) (last Thursday)" }; + + String[] posMapped = { "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_VERB", "POS_ADV", + "POS_VERB", "POS_ADP", "POS_VERB", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_ADP", + "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_PROPN", "POS_PUNCT", "POS_ADJ", "POS_ADP", + "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_VERB", "POS_ADP", "POS_VERB", "POS_DET", + "POS_ADJ", "POS_NOUN", "POS_ADP", "POS_PROPN", "POS_CONJ", "POS_PROPN", "POS_X", + "POS_ADJ", "POS_NOUN", "POS_PUNCT", "POS_PROPN", "POS_ADP", "POS_DET", "POS_PROPN", + "POS_PROPN", "POS_PROPN", "POS_X", "POS_VERB", "POS_NOUN", "POS_ADP", "POS_DET", + "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_ADP", "POS_VERB", + "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADP", "POS_DET", "POS_ADJ", + "POS_NOUN", "POS_PUNCT", "POS_CONJ", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_NOUN", + "POS_ADP", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_VERB", "POS_ADP", "POS_DET", + "POS_NOUN", "POS_X", "POS_NOUN", "POS_ADP", "POS_VERB", "POS_DET", "POS_ADJ", + "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_PRON", "POS_PROPN", "POS_PROPN", "POS_NOUN", + "POS_ADJ", "POS_PROPN", "POS_PUNCT" }; + + String[] posOriginal = { "NN", "IN", "DT", "NN", "VBZ", "RB", "VBN", "TO", "VB", "DT", "JJ", + "NN", "IN", "NN", "NNS", "IN", "NNP", ",", "JJ", "IN", "NN", "NN", ",", "VB", "TO", + "VB", "DT", "JJ", "NN", "IN", "NNP", "CC", "NNP", "POS", "JJ", "NNS", ".", "NNP", + "IN", "DT", "NNP", "NNP", "NNP", "POS", "VBN", "NN", "TO", "DT", "NN", "JJ", "NN", + "VBZ", "VBN", "TO", "VB", "DT", "NN", "IN", "NN", "IN", "DT", "JJ", "NN", ".", "CC", + "NNS", "VBP", "VBG", "NN", "IN", "NN", "VBZ", "VBN", "VBN", "IN", "DT", "NN", "POS", + "NN", "TO", "VB", "DT", "JJ", "NN", "NNS", "IN", "PRP$", "NNP", "NNP", "NN", "JJ", + "NNP", "." }; + + assertSentence(sentences, select(jcas, Sentence.class)); + assertChunks(chunks, select(jcas, Chunk.class)); + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + } +} diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000ReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2000ReaderWriterTest.java similarity index 84% rename from dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000ReaderWriterTest.java rename to dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2000ReaderWriterTest.java index 152a2239c5..059f16081d 100644 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000ReaderWriterTest.java +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2000ReaderWriterTest.java @@ -15,14 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; + +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class Conll2000ReaderWriterTest { @Test diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002ReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2002ReaderWriterTest.java similarity index 81% rename from dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002ReaderWriterTest.java rename to dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2002ReaderWriterTest.java index 496d3fc209..1fe53445d8 100644 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002ReaderWriterTest.java +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2002ReaderWriterTest.java @@ -15,17 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; +import org.dkpro.core.io.conll.Conll2002Reader.ColumnSeparators; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class Conll2002ReaderWriterTest { @Test @@ -40,15 +40,15 @@ public void roundTrip() public void testGermeval2014() throws Exception { - testOneWay( + testOneWay( createReaderDescription(Conll2002Reader.class, Conll2002Reader.PARAM_LANGUAGE, "de", Conll2002Reader.PARAM_HAS_HEADER, true, Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, - Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(), + Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true), "conll/2002/germeval2014_test.conll.out", - "conll/2002/germeval2014_test.conll"); + "conll/2002/germeval2014_test.conll"); } @Rule diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003ReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2003ReaderWriterTest.java similarity index 84% rename from dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003ReaderWriterTest.java rename to dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2003ReaderWriterTest.java index ac8fcaa550..23630cbe47 100644 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2003ReaderWriterTest.java +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2003ReaderWriterTest.java @@ -15,14 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; + +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class Conll2003ReaderWriterTest { @Test diff --git a/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2006ReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2006ReaderWriterTest.java new file mode 100644 index 0000000000..b282a325a8 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2006ReaderWriterTest.java @@ -0,0 +1,66 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.contentOf; + +import java.io.File; + +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.ReaderAssert; +import org.junit.Rule; +import org.junit.Test; + +//NOTE: This file contains Asciidoc markers for partial inclusion of this file in the documentation +//Do not remove these tags! +public class Conll2006ReaderWriterTest +{ + @Test + public void roundTrip() + throws Exception + { +// tag::testRoundTrip[] + ReaderAssert.assertThat(Conll2006Reader.class) // the reader + .readingFrom("src/test/resources/conll/2006/fi-ref.conll") // the test input file + .usingWriter(Conll2006Writer.class) // the writer + .outputAsString() // access writer output + .isEqualToNormalizingNewlines( // compare to input file + contentOf(new File("src/test/resources/conll/2006/fi-ref.conll"), UTF_8)); +// end::testRoundTrip[] + } + + @Test + public void testFinnTreeBank() + throws Exception + { +// tag::testOneWay[] + ReaderAssert.assertThat(Conll2006Reader.class, // the reader + Conll2006Reader.PARAM_SOURCE_ENCODING, "UTF-8") // reader parameter + .readingFrom("src/test/resources/conll/2006/fi-orig.conll") // the test input file + .usingWriter(Conll2006Writer.class, // the writer + Conll2006Writer.PARAM_TARGET_ENCODING, "UTF-8") // writer parameter + .outputAsString("fi-orig.conll") // access writer output + .isEqualToNormalizingNewlines( // compare to input file + contentOf(new File("src/test/resources/conll/2006/fi-ref.conll"), UTF_8)); +// end::testOneWay[] + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008ReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2008ReaderWriterTest.java similarity index 84% rename from dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008ReaderWriterTest.java rename to dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2008ReaderWriterTest.java index 580fc4accf..b937d66561 100644 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2008ReaderWriterTest.java +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2008ReaderWriterTest.java @@ -15,13 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class Conll2008ReaderWriterTest { diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009ReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2009ReaderWriterTest.java similarity index 84% rename from dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009ReaderWriterTest.java rename to dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2009ReaderWriterTest.java index 59ef2cbe33..5c97456ccc 100644 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2009ReaderWriterTest.java +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2009ReaderWriterTest.java @@ -15,13 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class Conll2009ReaderWriterTest { diff --git a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012ReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2012ReaderWriterTest.java similarity index 81% rename from dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012ReaderWriterTest.java rename to dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2012ReaderWriterTest.java index ee192f8afa..0dda3f8e5f 100644 --- a/dkpro-core-io-conll-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012ReaderWriterTest.java +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/Conll2012ReaderWriterTest.java @@ -15,17 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.conll; +package org.dkpro.core.io.conll; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class Conll2012ReaderWriterTest { @Test @@ -40,6 +39,18 @@ public void test() "conll/2012/en-orig.conll"); } + @Test + public void test2() + throws Exception + { + testRoundTrip( + createReaderDescription(Conll2012Reader.class, + Conll2012Reader.PARAM_USE_HEADER_METADATA, false, + Conll2012Reader.PARAM_READ_LEMMA, true), + createEngineDescription(Conll2012Writer.class), + "conll/2012/semeval1010-en-sample.conll"); + } + // @Test // public void generate() // throws Exception diff --git a/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllCoreNlpReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllCoreNlpReaderWriterTest.java new file mode 100644 index 0000000000..0d417a43e6 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllCoreNlpReaderWriterTest.java @@ -0,0 +1,66 @@ +/* + * Copyright 2016 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +public class ConllCoreNlpReaderWriterTest +{ + @Test + public void roundTrip() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + ConllCoreNlpReader.class, + ConllCoreNlpReader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/corenlp", + ConllCoreNlpReader.PARAM_PATTERNS, "en-orig.conll"); + + AnalysisEngineDescription writer = createEngineDescription( + ConllCoreNlpWriter.class, + ConllCoreNlpWriter.PARAM_TARGET_LOCATION, "target/test-output/ConllCoreNlpReaderWriterTest-roundTrip", + ConllCoreNlpWriter.PARAM_FILENAME_EXTENSION, ".conll", + ConllCoreNlpWriter.PARAM_STRIP_EXTENSION, true, + ConllCoreNlpWriter.PARAM_OVERWRITE, true); + + runPipeline(reader, writer); + + String reference = FileUtils.readFileToString( + new File("src/test/resources/conll/corenlp/en-ref.conll"), "UTF-8") + .trim(); + String actual = FileUtils.readFileToString( + new File("target/test-output/ConllCoreNlpReaderWriterTest-roundTrip/en-orig.conll"), + "UTF-8").trim(); + + assertThat(actual).isEqualToNormalizingNewlines(reference); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUReaderTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUReaderTest.java new file mode 100644 index 0000000000..d5e6aca46d --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUReaderTest.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static java.util.Arrays.asList; +import static java.util.stream.Collectors.toList; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertMorph; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; + +import java.util.List; + +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; + +public class ConllUReaderTest +{ + @Test + public void test() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + ConllUReader.class, + ConllUReader.PARAM_LANGUAGE, "en", + ConllUReader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/u/", + ConllUReader.PARAM_PATTERNS, "conllu-en-orig.conllu"); + + JCas jcas = new JCasIterable(reader).iterator().next(); + + String[] sentences = { + "They buy and sell books.", + "I have not a clue." }; + + String[] posMapped = { "POS", "POS_VERB", "POS_CONJ", "POS_VERB", "POS_NOUN", "POS_PUNCT", + "POS", "POS_VERB", "POS_ADV", "POS_DET", "POS_NOUN", "POS_PUNCT" }; + + String[] posOriginal = { "PRN", "VB", "CC", "VB", "NNS", ".", "PRN", "VB", "RB", "DT", "NN", + "." }; + + String[] morphologicalFeeatures = { + "[ 0, 4] - - Nom - - - - - Plur - - - - - - - - They (Case=Nom|Number=Plur)", + "[ 5, 8] - - - - - - - - Plur - 3 - - - Pres - - buy (Number=Plur|Person=3|Tense=Pres)", + "[ 13, 17] - - - - - - - - Plur - 3 - - - Pres - - sell (Number=Plur|Person=3|Tense=Pres)", + "[ 18, 23] - - - - - - - - Plur - - - - - - - - books (Number=Plur)", + "[ 25, 26] - - Nom - - - - - Sing - 1 - - - - - - I (Case=Nom|Number=Sing|Person=1)", + "[ 27, 31] - - - - - - - - Sing - 1 - - - Pres - - have (Number=Sing|Person=1|Tense=Pres)", + "[ 32, 35] - - - - - - - Neg - - - - - - - - - not (Negative=Neg)", + "[ 36, 37] - - - - - - - - - - - - Art - - - - a (Definite=Ind|PronType=Art)", + "[ 38, 42] - - - - - - - - Sing - - - - - - - - clue (Number=Sing)" + }; + + assertSentence(sentences, select(jcas, Sentence.class)); + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertMorph(morphologicalFeeatures, select(jcas, MorphologicalFeatures.class)); + } + + @Test + public void testDocumentID() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + ConllUReader.class, + ConllUReader.PARAM_LANGUAGE, "en", + ConllUReader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/u_v2/", + ConllUReader.PARAM_PATTERNS, "conllu-paragraph_and_document_boundaries.conllu"); + + JCas jcas = new JCasIterable(reader).iterator().next(); + + AnnotationIndex index = jcas.getAnnotationIndex(DocumentMetaData.class); + DocumentMetaData m = index.iterator().get(); + final String actualDocumentID = m.getDocumentId(); + final String expectedDocumentID = "mf920901-001"; + + Assert.assertEquals("Document ID mismatch", expectedDocumentID, actualDocumentID); + } + + @Test + public void testMultipleDocumentIDs() + throws Exception + { +// final TestAppender appender = new TestAppender(); +// final Logger logger = Logger.getRootLogger(); +// logger.addAppender(appender); +// try { + CollectionReaderDescription reader = createReaderDescription( + ConllUReader.class, + ConllUReader.PARAM_LANGUAGE, "en", + ConllUReader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/u_v2/", + ConllUReader.PARAM_PATTERNS, "conllu-multiple_document_IDs.conllu"); + + JCas jcas = new JCasIterable(reader).iterator().next(); + + AnnotationIndex index = jcas + .getAnnotationIndex(DocumentMetaData.class); + DocumentMetaData m = index.iterator().get(); + final String actualDocumentID = m.getDocumentId(); + final String expectedDocumentID = "mf920901-001;mf920901-002"; + + Assert.assertEquals("Document ID mismatch", expectedDocumentID, actualDocumentID); + // The following code is commented out because when running on our Windows Jenkins slave, + // the log message does not get recorded. It does get logged and I also tested on a local + // Windows as well as on Mac and it works. The Jenkins seems to log in German, i.e. + // "WARNUNG" instead of "WARN", but looking at the code which captures the warnings, that + // should actually not matter since the WARN enum constant is compared. Since I cannot + // reproduce the issue which Jenkins has under any circumstances so far, I am commenting + // this part of the test out. +// } +// finally { +// logger.removeAppender(appender); +// } +// final List log = appender.getLog(); +// final LoggingEvent firstLogEntry = log.get(0); +// Assert.assertEquals(Level.WARN, firstLogEntry.getLevel()); +// Assert.assertEquals("org.dkpro.core.io.conll.ConllUReader", +// firstLogEntry.getLoggerName()); +// +// final String patternString = "File\\s[\\w:/%-\\.]+\\scontains\\smultiple\\sdocument\\sIDs:\\s" +// + "\\[mf920901-001,\\smf920901-002]"; +// Pattern pattern = Pattern.compile(patternString); +// Matcher matcher = pattern.matcher(firstLogEntry.getMessage().toString()); +// Assert.assertTrue(matcher.matches()); + } + + @Test + public void testParagraphs() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + ConllUReader.class, + ConllUReader.PARAM_LANGUAGE, "en", + ConllUReader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/u_v2/", + ConllUReader.PARAM_PATTERNS, "conllu-multiple_paragraphs.conllu"); + + JCas jcas = new JCasIterable(reader).iterator().next(); + + AnnotationIndex index = jcas.getAnnotationIndex(Paragraph.class); + + List paragraphIDs = index.stream() + .map(Paragraph::getId) + .collect(toList()); + List expectedParagraphIDs = asList("mf920901-001-p1", "mf920901-001-p2"); + + Assert.assertEquals(expectedParagraphIDs, paragraphIDs); + + final String expectedTextContent = "Slovenská ústava: pro i proti Slovenská ústava: pro i" + + " proti\n" + + "\n" + + "Slovenská ústava: pro i proti"; + final String actualTextContent = jcas.getDocumentText(); + Assert.assertEquals(expectedTextContent, actualTextContent); + + String[] sentences = { + "Slovenská ústava: pro i proti", + "Slovenská ústava: pro i proti", + "Slovenská ústava: pro i proti" }; + assertSentence(sentences, select(jcas, Sentence.class)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); + +// class TestAppender extends AppenderSkeleton { +// private final List log = new ArrayList<>(); +// +// @Override +// public boolean requiresLayout() { +// return false; +// } +// +// @Override +// protected void append(final LoggingEvent loggingEvent) { +// if (loggingEvent.getLevel().equals(Level.WARN)) { +// log.add(loggingEvent); +// } +// } +// +// @Override +// public void close() { +// } +// +// public List getLog() { +// return new ArrayList(log); +// } +// } +} diff --git a/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUReaderWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUReaderWriterTest.java new file mode 100644 index 0000000000..bda3761b67 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUReaderWriterTest.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.assertj.core.api.Assertions.contentOf; +import static org.assertj.core.api.Assertions.tuple; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; + +import java.io.File; + +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.ReaderAssert; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class ConllUReaderWriterTest +{ + @Test + public void roundTrip() + throws Exception + { + testRoundTrip( + createReaderDescription(ConllUReader.class), + createEngineDescription(ConllUWriter.class, + ConllUWriter.PARAM_WRITE_TEXT_COMMENT, false), + "conll/u/conllu-en-orig.conllu"); + } + + @Ignore("This unfortunately doesn't work yet.") + @Test + public void roundTripV2EmptyNodes() + throws Exception + { + testRoundTrip( + createReaderDescription(ConllUReader.class), + createEngineDescription(ConllUWriter.class), + "conll/u_v2/conllu-empty_nodes.conllu"); + } + + @Test + public void roundTripV2MorphologicalAnnotation() + throws Exception + { + testRoundTrip( + createReaderDescription(ConllUReader.class), + createEngineDescription(ConllUWriter.class), + "conll/u_v2/conllu-morphological_annotation.conllu"); + } + + @Ignore("This unfortunately doesn't work yet.") + @Test + public void roundTripV2ParagraphAndDocumentBoundaries() + throws Exception + { + testRoundTrip( + createReaderDescription(ConllUReader.class), + createEngineDescription(ConllUWriter.class, + ConllUWriter.PARAM_WRITE_TEXT_COMMENT, true), + "conll/u_v2/conllu-paragraph_and_document_boundaries.conllu"); + } + + @Test + public void roundTripV2SentenceBoundariesAndComments() + throws Exception + { + testRoundTrip( + createReaderDescription(ConllUReader.class), + createEngineDescription(ConllUWriter.class, + ConllUWriter.PARAM_WRITE_TEXT_COMMENT, true), + "conll/u_v2/conllu-sentence_bounaries_and_comments.conllu"); + } + + @Test + public void roundTripV2SyntacticAnnotation() + throws Exception + { + ReaderAssert.assertThat(ConllUReader.class) + .readingFrom("src/test/resources/conll/u_v2/conllu-syntactic_annotation.conllu") + .asJCasList() + .extracting( + jcas -> select(jcas, Sentence.class).size(), + jcas -> select(jcas, Token.class).size()) + .containsExactly( + tuple(1, 6)); + + ReaderAssert.assertThat(ConllUReader.class) + .readingFrom("src/test/resources/conll/u_v2/conllu-syntactic_annotation.conllu") + .usingWriter(ConllUWriter.class) + .outputAsString() + .isEqualToNormalizingNewlines(contentOf( + new File("src/test/resources/conll/u_v2/conllu-syntactic_annotation.conllu"), + UTF_8)); + } + + @Ignore("This unfortunately doesn't work yet.") + @Test + public void roundTripV2UntokenizedText() + throws Exception + { + testRoundTrip( + createReaderDescription(ConllUReader.class), + createEngineDescription(ConllUWriter.class, + ConllUWriter.PARAM_WRITE_TEXT_COMMENT, true), + "conll/u_v2/conllu-untokenized_text.conllu"); + } + + @Test + public void roundTripV2WordsAndTokens() + throws Exception + { + testRoundTrip( + createReaderDescription(ConllUReader.class), + createEngineDescription(ConllUWriter.class), + "conll/u_v2/conllu-words_and_tokens.conllu"); + } + + @Test + public void withComments() + throws Exception + { + testOneWay( + createReaderDescription(ConllUReader.class), + createEngineDescription(ConllUWriter.class, + ConllUWriter.PARAM_WRITE_TEXT_COMMENT, false), + "conll/u/conllu-en-ref.conllu", + "conll/u/conllu-en-orig2.conllu"); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUWriterTest.java b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUWriterTest.java new file mode 100644 index 0000000000..1e16c8cbda --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUWriterTest.java @@ -0,0 +1,67 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.conll; + +import static org.apache.commons.io.FileUtils.readFileToString; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class ConllUWriterTest +{ + @Test + public void thatLineBreaksDoNotBreakTheFormat() throws Exception + { + File target = testContext.getTestOutputFolder(); + + JCas jcas = JCasFactory.createText("Test\ntest."); + new Sentence(jcas, 0, 10).addToIndexes(); + new Token(jcas, 0, 4).addToIndexes(); + new Token(jcas, 5, 9).addToIndexes(); + new Token(jcas, 9, 10).addToIndexes(); + + DocumentMetaData dmd = DocumentMetaData.create(jcas); + dmd.setDocumentId("output"); + + AnalysisEngine writer = createEngine(ConllUWriter.class, + ConllUWriter.PARAM_TARGET_LOCATION, target); + + writer.process(jcas); + + String reference = readFileToString( + new File("src/test/resources/conll/u_v2/conllu-linebreaks.conllu"), "UTF-8").trim(); + String actual = readFileToString(new File(target, "output.conllu"), "UTF-8").trim(); + + assertThat(actual).isEqualToNormalizingNewlines(reference); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/2002/germeval2014_test.conll.out b/dkpro-core-io-conll-asl/src/test/resources/conll/2002/germeval2014_test.conll.out index 6d836d8631..3e758fc750 100644 --- a/dkpro-core-io-conll-asl/src/test/resources/conll/2002/germeval2014_test.conll.out +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/2002/germeval2014_test.conll.out @@ -24,31 +24,37 @@ Token sofa: _InitialView begin: 0 end: 8 + order: 0 [seiner] Token sofa: _InitialView begin: 9 end: 15 + order: 0 [Initiative] Token sofa: _InitialView begin: 16 end: 26 + order: 0 [fand] Token sofa: _InitialView begin: 27 end: 31 + order: 0 [2001/2002] Token sofa: _InitialView begin: 32 end: 41 + order: 0 [in] Token sofa: _InitialView begin: 42 end: 44 + order: 0 [Stuttgart] NamedEntity sofa: _InitialView @@ -60,11 +66,13 @@ Token sofa: _InitialView begin: 45 end: 54 + order: 0 [,] Token sofa: _InitialView begin: 55 end: 56 + order: 0 [Braunschweig] NamedEntity sofa: _InitialView @@ -76,11 +84,13 @@ Token sofa: _InitialView begin: 57 end: 69 + order: 0 [und] Token sofa: _InitialView begin: 70 end: 73 + order: 0 [Bonn] NamedEntity sofa: _InitialView @@ -92,31 +102,37 @@ Token sofa: _InitialView begin: 74 end: 78 + order: 0 [eine] Token sofa: _InitialView begin: 79 end: 83 + order: 0 [große] Token sofa: _InitialView begin: 84 end: 89 + order: 0 [und] Token sofa: _InitialView begin: 90 end: 93 + order: 0 [publizistisch] Token sofa: _InitialView begin: 94 end: 107 + order: 0 [vielbeachtete] Token sofa: _InitialView begin: 108 end: 121 + order: 0 [Troia-Ausstellung] NamedEntity sofa: _InitialView @@ -128,21 +144,25 @@ Token sofa: _InitialView begin: 122 end: 139 + order: 0 [statt] Token sofa: _InitialView begin: 140 end: 145 + order: 0 [,] Token sofa: _InitialView begin: 146 end: 147 + order: 0 [„] Token sofa: _InitialView begin: 148 end: 149 + order: 0 [Troia - Traum und Wirklichkeit] NamedEntity sofa: _InitialView @@ -154,36 +174,43 @@ Token sofa: _InitialView begin: 150 end: 155 + order: 0 [-] Token sofa: _InitialView begin: 156 end: 157 + order: 0 [Traum] Token sofa: _InitialView begin: 158 end: 163 + order: 0 [und] Token sofa: _InitialView begin: 164 end: 167 + order: 0 [Wirklichkeit] Token sofa: _InitialView begin: 168 end: 180 + order: 0 [“] Token sofa: _InitialView begin: 181 end: 182 + order: 0 [.] Token sofa: _InitialView begin: 183 end: 184 + order: 0 -------- View _InitialView end ---------------------------------- ======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/2003/en-orig.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/2003/en-orig.conll index a560e6f650..23cb26ad5e 100644 --- a/dkpro-core-io-conll-asl/src/test/resources/conll/2003/en-orig.conll +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/2003/en-orig.conll @@ -5,3 +5,4 @@ heads VBZ I-VP O for IN I-PP O Baghdad NNP I-NP I-LOC . . O O + diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/2006/fi-ref.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/2006/fi-ref.conll index 473c8e0bea..813c864e3c 100644 --- a/dkpro-core-io-conll-asl/src/test/resources/conll/2006/fi-ref.conll +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/2006/fi-ref.conll @@ -33,3 +33,4 @@ 3 Nimi nimi N N N Nom Sg 0 main _ _ 4 ja ja CC CC CC 5 phrm _ _ 5 tarkoitus tarkoitus N N N Nom Sg 3 conjunct _ _ + diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/2012/en-orig.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/2012/en-orig.conll index b0503bbd85..ebaff80b64 100644 --- a/dkpro-core-io-conll-asl/src/test/resources/conll/2012/en-orig.conll +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/2012/en-orig.conll @@ -1,12 +1,12 @@ #begin document (en-orig.conll); part 000 -en-orig.conll 0 0 John NNP (TOP(S(NP*) john - - - (PERSON) (A0) (1) +en-orig.conll 0 0 John NNP (TOP(S(NP*) john - - - (PERSON) (A0) (11) en-orig.conll 0 1 went VBD (VP* go go.02 - - * (V*) - en-orig.conll 0 2 to TO (PP* to - - - * * - -en-orig.conll 0 3 the DT (NP* the - - - * * (2 -en-orig.conll 0 4 market NN *))) market - - - * (A1) 2) +en-orig.conll 0 3 the DT (NP* the - - - * * (9 +en-orig.conll 0 4 market NN *))) market - - - * (A1) 9) en-orig.conll 0 5 . . *)) . - - - * * - -en-orig.conll 0 0 He PRP (TOP(S(NP*) he - - - * (A0) (1) +en-orig.conll 0 0 He PRP (TOP(S(NP*) he - - - * (A0) (11) en-orig.conll 0 1 was VBD (VP* be - - - * * - en-orig.conll 0 2 looking VBG (VP* look look.01 - - * (V*) - en-orig.conll 0 3 for IN (PP* for - - - * * - @@ -15,25 +15,25 @@ en-orig.conll 0 5 present JJ *) present - - en-orig.conll 0 6 to TO (SBAR(S(VP* to - - - * * - en-orig.conll 0 7 give VB (VP*)))) give - - - * * - en-orig.conll 0 8 to TO (PP* to - - - * * - -en-orig.conll 0 9 Mary NNP (NP*)))))) mary - - - (PERSON) * 3)|(4) +en-orig.conll 0 9 Mary NNP (NP*)))))) mary - - - (PERSON) * 3)|(7) en-orig.conll 0 10 . . *)) . - - - * * - -en-orig.conll 0 0 He PRP (TOP(S(NP*) he - - - * (A0) (1) +en-orig.conll 0 0 He PRP (TOP(S(NP*) he - - - * (A0) (11) en-orig.conll 0 1 had VBD (VP* have - - - * * - en-orig.conll 0 2 met VBN (VP* meet meet.02 - - * (V*) - -en-orig.conll 0 3 her PRP$ (NP* her - - - * * (5|(6) +en-orig.conll 0 3 her PRP$ (NP* her - - - * * (1|(4) en-orig.conll 0 4 last JJ * last - - - * * - -en-orig.conll 0 5 year NN *) year - - - * (AM-TMP) 5) +en-orig.conll 0 5 year NN *) year - - - * (AM-TMP) 1) en-orig.conll 0 6 at IN (PP* at - - - * * - -en-orig.conll 0 7 the DT (NP* the - - - * * (7 +en-orig.conll 0 7 the DT (NP* the - - - * * (8 en-orig.conll 0 8 fun NN * fun - - - * * - -en-orig.conll 0 9 fair NN *)))) fair - - - * (A1) 7) +en-orig.conll 0 9 fair NN *)))) fair - - - * (A1) 8) en-orig.conll 0 10 . . *)) . - - - * * - -en-orig.conll 0 0 They PRP (TOP(S(S(NP*) they - - - * (A0) * * (8) -en-orig.conll 0 1 had VBD (VP* have have.03 - - * (V*) (V*) (V*) - -en-orig.conll 0 2 a DT (NP* a - - - * * * * (9 -en-orig.conll 0 3 ride NN *) ride - - - * (A1) * * 9) +en-orig.conll 0 0 They PRP (TOP(S(S(NP*) they - - - * (A0) * * (2) +en-orig.conll 0 1 had VBD (VP* have have.03 - - * (V*) * * - +en-orig.conll 0 2 a DT (NP* a - - - * * * * (5 +en-orig.conll 0 3 ride NN *) ride - - - * (A1) * * 5) en-orig.conll 0 4 on IN (PP* on - - - * * * * - en-orig.conll 0 5 the DT (NP* the - - - * * * * (10 en-orig.conll 0 6 ferris JJ * ferris - - - * * * * - @@ -41,17 +41,17 @@ en-orig.conll 0 7 wheel NN *)) wheel - - en-orig.conll 0 8 together RB (ADVP*))) together - - - * * * * - en-orig.conll 0 9 and CC * and - - - * * * * - en-orig.conll 0 10 when WRB (S(SBAR(WHADVP*) when - - - * * (R-AM-TMP) * - -en-orig.conll 0 11 he PRP (S(NP*) he - - - * * (A0) * (1) -en-orig.conll 0 12 looked VBD (VP* look look.01 - - * (V*) (V*) (AM-TMP) - +en-orig.conll 0 11 he PRP (S(NP*) he - - - * * (A0) * (11) +en-orig.conll 0 12 looked VBD (VP* look look.01 - - * * (V*) (AM-TMP) - en-orig.conll 0 13 into IN (PP* into - - - * * * * - -en-orig.conll 0 14 Mary NNP (NP(NP* mary - - - (PERSON) * * * (11|(4 -en-orig.conll 0 15 's POS *) ' - - - * * * * 4) +en-orig.conll 0 14 Mary NNP (NP(NP* mary - - - (PERSON) * * * (6|(7 +en-orig.conll 0 15 's POS *) ' - - - * * * * 7) en-orig.conll 0 16 blue JJ * blue - - - * * * * - -en-orig.conll 0 17 eyes NNS *))))) eye - - - * * (A1) * 11) +en-orig.conll 0 17 eyes NNS *))))) eye - - - * * (A1) * 6) en-orig.conll 0 18 , , * , - - - * * * * - -en-orig.conll 0 19 John NNP (NP*) john - - - (PERSON) * * (A1) (1) +en-orig.conll 0 19 John NNP (NP*) john - - - (PERSON) * * (A1) (11) en-orig.conll 0 20 immediately RB (ADVP*) immediately - - - * * * (AM-TMP) - -en-orig.conll 0 21 fell VBD (VP* fall fall.01 - - * (V*) (V*) (V*) - +en-orig.conll 0 21 fell VBD (VP* fall fall.01 - - * * * (V*) - en-orig.conll 0 22 in IN (PP* in - - - * * * * - en-orig.conll 0 23 love NN (NP*)))) love - - - * * * * - en-orig.conll 0 24 . . *)) . - - - * * * * - diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/2012/semeval1010-en-sample.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/2012/semeval1010-en-sample.conll new file mode 100644 index 0000000000..a7e2ecdfca --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/2012/semeval1010-en-sample.conll @@ -0,0 +1,35 @@ +#begin document (semeval1010-en-sample.conll); part 000 +semeval1010-en-sample.conll 0 0 Save VB (TOP(S* save - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 1 for IN (VP* for - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 2 this DT (PP(PP* this - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 3 one CD (NPB(ADJP* one - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 4 excursion NN * excursion Travel - - * (V*) * * * * * * * * - +semeval1010-en-sample.conll 0 5 , PUNC, (ADJP* , - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 6 he PRP *)) he Coreference - - * (Traveler) (V*) (Coreferent) * * (Self_mover) (Interlocutor_1) * (Coreferent) - +semeval1010-en-sample.conll 0 7 spent VBD * spend - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 8 his PRP$ *)) his Coreference - - * * * (V*) * * * * * * - +semeval1010-en-sample.conll 0 9 days NNS * day Calendric_unit - - * * * * (V*) * * * * * - +semeval1010-en-sample.conll 0 10 in IN (PP* in - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 11 long JJ (VP* long Duration - - * * * * * (V*) * * * * - +semeval1010-en-sample.conll 0 12 and CC (PP* and - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 13 often RB (NP(NPB* often - - - * * * * * * (Depictive* * * * - +semeval1010-en-sample.conll 0 14 solitary JJ *) solitary - - - * * * * * * *) * * * - +semeval1010-en-sample.conll 0 15 walks VBZ (PP* walk Self_motion - - * * * * * (Eventuality) (V*) * * * - +semeval1010-en-sample.conll 0 16 , PUNC, (NP(NPB* , - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 17 or CC *) or - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 18 in IN (SBAR(WHNP* in - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 19 chatting VBG *) chat Chatting - - * * * * * * * (V*) * * - +semeval1010-en-sample.conll 0 20 with IN (S* with - - - * * * * * * * (Interlocutor_2* * * - +semeval1010-en-sample.conll 0 21 a DT (VP* a - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 22 number NN (VP* number Quantity - - * * * * * * * * (V*) * - +semeval1010-en-sample.conll 0 23 of IN *))))))))))) of - - - * * * * * * * * (Individuals* * - +semeval1010-en-sample.conll 0 24 village NN (NPB* village - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 25 gossips NNS *)) gossip - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 26 whose WP$ (VP* whose - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 27 acquaintance NN (PP* acquaintance - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 28 he PRP (NPB* he Coreference - - * * * * * * * * * (V*) - +semeval1010-en-sample.conll 0 29 had VBD * have - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 30 cultivated VBN * cultivate - - - * * * * * * * * * * - +semeval1010-en-sample.conll 0 31 . PUNC. *))))) . - - - * * * * * * * *) *) * - + +#end document \ No newline at end of file diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/corenlp/en-orig.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/corenlp/en-orig.conll new file mode 100644 index 0000000000..f82b0805b2 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/corenlp/en-orig.conll @@ -0,0 +1,32 @@ +1 Selectum Selectum NNP O _ _ +2 , , , O _ _ +3 Société Société NNP O _ _ +4 d'Investissement d'Investissement NNP O _ _ +5 à à NNP O _ _ +6 Capital Capital NNP O _ _ +7 Variable Variable NNP O _ _ +8 . . . O _ _ + +1 Siège Siège NNP O _ _ +2 social social JJ O _ _ +3 : : : O _ _ +4 L-2453 l-2453 NN O _ _ +5 Luxembourg Luxembourg NNP COUNTRY _ _ +6 , , , O _ _ +7 12 12 CD NUMBER _ _ +8 . . . O _ _ + +1 R.C.S. R.C.S. NNP LOCATION _ _ +2 Luxembourg Luxembourg NNP COUNTRY _ _ +3 B B NNP O _ _ +4 161.997 161.997 CD NUMBER _ _ +5 . . . O _ _ + +1 STATUTES statute NNS O _ _ +2 In in IN O _ _ +3 the the DT DATE _ _ +4 year year NN DATE _ _ +5 two two CD DATE _ _ +6 thousand thousand CD DATE _ _ +7 eleven eleven NNS DATE _ _ +8 . . . O _ _ diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/corenlp/en-ref.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/corenlp/en-ref.conll new file mode 100644 index 0000000000..f82b0805b2 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/corenlp/en-ref.conll @@ -0,0 +1,32 @@ +1 Selectum Selectum NNP O _ _ +2 , , , O _ _ +3 Société Société NNP O _ _ +4 d'Investissement d'Investissement NNP O _ _ +5 à à NNP O _ _ +6 Capital Capital NNP O _ _ +7 Variable Variable NNP O _ _ +8 . . . O _ _ + +1 Siège Siège NNP O _ _ +2 social social JJ O _ _ +3 : : : O _ _ +4 L-2453 l-2453 NN O _ _ +5 Luxembourg Luxembourg NNP COUNTRY _ _ +6 , , , O _ _ +7 12 12 CD NUMBER _ _ +8 . . . O _ _ + +1 R.C.S. R.C.S. NNP LOCATION _ _ +2 Luxembourg Luxembourg NNP COUNTRY _ _ +3 B B NNP O _ _ +4 161.997 161.997 CD NUMBER _ _ +5 . . . O _ _ + +1 STATUTES statute NNS O _ _ +2 In in IN O _ _ +3 the the DT DATE _ _ +4 year year NN DATE _ _ +5 two two CD DATE _ _ +6 thousand thousand CD DATE _ _ +7 eleven eleven NNS DATE _ _ +8 . . . O _ _ diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig.conllu similarity index 100% rename from dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig.conll rename to dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig.conllu diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig2.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig2.conll deleted file mode 100644 index 6dcb86f542..0000000000 --- a/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig2.conll +++ /dev/null @@ -1,18 +0,0 @@ -# sent_id 1 -# ... -1 They they PRON PRN Case=Nom|Number=Plur 2 nsubj 4:nsubj _ -2 buy buy VERB VB Number=Plur|Person=3|Tense=Pres 0 root _ _ -3 and and CONJ CC _ 2 cc _ _ -4 sell sell VERB VB Number=Plur|Person=3|Tense=Pres 2 conj 0:root _ -5 books book NOUN NNS Number=Plur 2 dobj 4:dobj SpaceAfter=No -6 . . PUNCT . _ 2 punct _ _ - -# sent_id 2 -# ... -1 I I PRON PRN Case=Nom|Number=Sing|Person=1 2 nsubj _ _ -2-3 haven't _ _ _ _ _ _ _ _ -2 have have VERB VB Number=Sing|Person=1|Tense=Pres 0 root _ _ -3 not not PART RB Negative=Neg 2 neg _ _ -4 a a DET DT Definite=Ind|PronType=Art 5 det _ _ -5 clue clue NOUN NN Number=Sing 2 dobj _ SpaceAfter=No -6 . . PUNCT . _ 2 punct _ _ diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig2.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig2.conllu new file mode 100644 index 0000000000..01bffad1d1 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-orig2.conllu @@ -0,0 +1,18 @@ +# sent_id = 1 +# ... +1 They they PRON PRN Case=Nom|Number=Plur 2 nsubj 4:nsubj _ +2 buy buy VERB VB Number=Plur|Person=3|Tense=Pres 0 root _ _ +3 and and CONJ CC _ 2 cc _ _ +4 sell sell VERB VB Number=Plur|Person=3|Tense=Pres 2 conj 0:root _ +5 books book NOUN NNS Number=Plur 2 dobj 4:dobj SpaceAfter=No +6 . . PUNCT . _ 2 punct _ _ + +# sent_id = 2 +# ... +1 I I PRON PRN Case=Nom|Number=Sing|Person=1 2 nsubj _ _ +2-3 haven't _ _ _ _ _ _ _ _ +2 have have VERB VB Number=Sing|Person=1|Tense=Pres 0 root _ _ +3 not not PART RB Negative=Neg 2 neg _ _ +4 a a DET DT Definite=Ind|PronType=Art 5 det _ _ +5 clue clue NOUN NN Number=Sing 2 dobj _ SpaceAfter=No +6 . . PUNCT . _ 2 punct _ _ diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-ref.conll b/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-ref.conll deleted file mode 100644 index 45616bf32a..0000000000 --- a/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-ref.conll +++ /dev/null @@ -1,14 +0,0 @@ -1 They they PRON PRN Case=Nom|Number=Plur 2 nsubj 4:nsubj _ -2 buy buy VERB VB Number=Plur|Person=3|Tense=Pres 0 root _ _ -3 and and CONJ CC _ 2 cc _ _ -4 sell sell VERB VB Number=Plur|Person=3|Tense=Pres 2 conj 0:root _ -5 books book NOUN NNS Number=Plur 2 dobj 4:dobj SpaceAfter=No -6 . . PUNCT . _ 2 punct _ _ - -1 I I PRON PRN Case=Nom|Number=Sing|Person=1 2 nsubj _ _ -2-3 haven't _ _ _ _ _ _ _ _ -2 have have VERB VB Number=Sing|Person=1|Tense=Pres 0 root _ _ -3 not not PART RB Negative=Neg 2 neg _ _ -4 a a DET DT Definite=Ind|PronType=Art 5 det _ _ -5 clue clue NOUN NN Number=Sing 2 dobj _ SpaceAfter=No -6 . . PUNCT . _ 2 punct _ _ \ No newline at end of file diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-ref.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-ref.conllu new file mode 100644 index 0000000000..f4a4cd4e66 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u/conllu-en-ref.conllu @@ -0,0 +1,16 @@ +# sent_id = 1 +1 They they PRON PRN Case=Nom|Number=Plur 2 nsubj 4:nsubj _ +2 buy buy VERB VB Number=Plur|Person=3|Tense=Pres 0 root _ _ +3 and and CONJ CC _ 2 cc _ _ +4 sell sell VERB VB Number=Plur|Person=3|Tense=Pres 2 conj 0:root _ +5 books book NOUN NNS Number=Plur 2 dobj 4:dobj SpaceAfter=No +6 . . PUNCT . _ 2 punct _ _ + +# sent_id = 2 +1 I I PRON PRN Case=Nom|Number=Sing|Person=1 2 nsubj _ _ +2-3 haven't _ _ _ _ _ _ _ _ +2 have have VERB VB Number=Sing|Person=1|Tense=Pres 0 root _ _ +3 not not PART RB Negative=Neg 2 neg _ _ +4 a a DET DT Definite=Ind|PronType=Art 5 det _ _ +5 clue clue NOUN NN Number=Sing 2 dobj _ SpaceAfter=No +6 . . PUNCT . _ 2 punct _ _ \ No newline at end of file diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/README.txt b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/README.txt new file mode 100644 index 0000000000..b6c6eb8396 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/README.txt @@ -0,0 +1,11 @@ +Examples here come from the CoNLL-U v2 format documentation: + +http://universaldependencies.org/format.html + +The original abbreviated examples have been extended to full +CoNLL-U files by adding missing columns and replacing spaces by tabs. + +Apache License 2.0 + +https://github.com/UniversalDependencies/docs/blob/pages-source/LICENSE.txt + diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-empty_nodes.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-empty_nodes.conllu new file mode 100644 index 0000000000..8b1d82afb7 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-empty_nodes.conllu @@ -0,0 +1,7 @@ +1 Sue Sue _ _ _ _ _ _ _ +2 likes like _ _ _ _ _ _ _ +3 coffee coffee _ _ _ _ _ _ _ +4 and and _ _ _ _ _ _ _ +5 Bill Bill _ _ _ _ _ _ _ +5.1 likes like _ _ _ _ _ _ _ +6 tea tea _ _ _ _ _ _ _ diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-linebreaks.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-linebreaks.conllu new file mode 100644 index 0000000000..a7003c91f3 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-linebreaks.conllu @@ -0,0 +1,5 @@ +# text = Test test. +1 Test _ _ _ _ _ _ _ _ +2 test _ _ _ _ _ _ _ SpaceAfter=No +3 . _ _ _ _ _ _ _ _ + diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-morphological_annotation.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-morphological_annotation.conllu new file mode 100644 index 0000000000..14f16a3887 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-morphological_annotation.conllu @@ -0,0 +1,8 @@ +# text = Då var han elva år . +1 Då då ADV AB _ _ _ _ _ +2 var vara VERB VB.PRET.ACT Tense=Past|Voice=Act _ _ _ _ +3 han han PRON PN.UTR.SIN.DEF.NOM Case=Nom|Definite=Def|Gender=Com|Number=Sing _ _ _ _ +4 elva elva NUM RG.NOM Case=Nom|NumType=Card _ _ _ _ +5 år år NOUN NN.NEU.PLU.IND.NOM Case=Nom|Definite=Ind|Gender=Neut|Number=Plur _ _ _ _ +6 . . PUNCT DL.MAD _ _ _ _ _ + diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-multiple_document_IDs.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-multiple_document_IDs.conllu new file mode 100644 index 0000000000..963b806c8e --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-multiple_document_IDs.conllu @@ -0,0 +1,34 @@ +# newdoc id = mf920901-001 +# newpar id = mf920901-001-p1 +# sent_id = mf920901-001-p1s1A +# text = Slovenská ústava: pro i proti +# text_en = Slovak constitution: pros and cons +1 Slovenská slovenský ADJ AAFS1----1A---- Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos 2 amod _ _ +2 ústava ústava NOUN NNFS1-----A---- Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos 0 root _ SpaceAfter=No +3 : : PUNCT Z:------------- _ 2 punct _ _ +4 pro pro ADP RR--4---------- Case=Acc 2 appos _ LId=pro-1 +5 i i CCONJ J^------------- _ 6 cc _ LId=i-1 +6 proti proti ADP RR--3---------- Case=Dat 4 conj _ LId=proti-1 + +# sent_id = mf920901-001-p1s2 +# text = Slovenská ústava: pro i proti +# text_en = Slovak constitution: pros and cons +1 Slovenská slovenský ADJ AAFS1----1A---- Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos 2 amod _ _ +2 ústava ústava NOUN NNFS1-----A---- Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos 0 root _ SpaceAfter=No +3 : : PUNCT Z:------------- _ 2 punct _ _ +4 pro pro ADP RR--4---------- Case=Acc 2 appos _ LId=pro-1 +5 i i CCONJ J^------------- _ 6 cc _ LId=i-1 +6 proti proti ADP RR--3---------- Case=Dat 4 conj _ LId=proti-1 + +# newdoc id = mf920901-002 +# newpar id = mf920901-002-p1 +# sent_id = mf920901-002-p1s1A +# text = Slovenská ústava: pro i proti +# text_en = Slovak constitution: pros and cons +1 Slovenská slovenský ADJ AAFS1----1A---- Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos 2 amod _ _ +2 ústava ústava NOUN NNFS1-----A---- Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos 0 root _ SpaceAfter=No +3 : : PUNCT Z:------------- _ 2 punct _ _ +4 pro pro ADP RR--4---------- Case=Acc 2 appos _ LId=pro-1 +5 i i CCONJ J^------------- _ 6 cc _ LId=i-1 +6 proti proti ADP RR--3---------- Case=Dat 4 conj _ LId=proti-1 + diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-multiple_paragraphs.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-multiple_paragraphs.conllu new file mode 100644 index 0000000000..6ba76724f9 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-multiple_paragraphs.conllu @@ -0,0 +1,33 @@ +# newdoc id = mf920901-001 +# newpar id = mf920901-001-p1 +# sent_id = mf920901-001-p1s1A +# text = Slovenská ústava: pro i proti +# text_en = Slovak constitution: pros and cons +1 Slovenská slovenský ADJ AAFS1----1A---- Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos 2 amod _ _ +2 ústava ústava NOUN NNFS1-----A---- Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos 0 root _ SpaceAfter=No +3 : : PUNCT Z:------------- _ 2 punct _ _ +4 pro pro ADP RR--4---------- Case=Acc 2 appos _ LId=pro-1 +5 i i CCONJ J^------------- _ 6 cc _ LId=i-1 +6 proti proti ADP RR--3---------- Case=Dat 4 conj _ LId=proti-1 + +# sent_id = mf920901-001-p1s2 +# text = Slovenská ústava: pro i proti +# text_en = Slovak constitution: pros and cons +1 Slovenská slovenský ADJ AAFS1----1A---- Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos 2 amod _ _ +2 ústava ústava NOUN NNFS1-----A---- Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos 0 root _ SpaceAfter=No +3 : : PUNCT Z:------------- _ 2 punct _ _ +4 pro pro ADP RR--4---------- Case=Acc 2 appos _ LId=pro-1 +5 i i CCONJ J^------------- _ 6 cc _ LId=i-1 +6 proti proti ADP RR--3---------- Case=Dat 4 conj _ LId=proti-1 + +# newpar id = mf920901-001-p2 +# sent_id = mf920901-001-p2s1A +# text = Slovenská ústava: pro i proti +# text_en = Slovak constitution: pros and cons +1 Slovenská slovenský ADJ AAFS1----1A---- Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos 2 amod _ _ +2 ústava ústava NOUN NNFS1-----A---- Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos 0 root _ SpaceAfter=No +3 : : PUNCT Z:------------- _ 2 punct _ _ +4 pro pro ADP RR--4---------- Case=Acc 2 appos _ LId=pro-1 +5 i i CCONJ J^------------- _ 6 cc _ LId=i-1 +6 proti proti ADP RR--3---------- Case=Dat 4 conj _ LId=proti-1 + diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-paragraph_and_document_boundaries.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-paragraph_and_document_boundaries.conllu new file mode 100644 index 0000000000..6d6dfdf034 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-paragraph_and_document_boundaries.conllu @@ -0,0 +1,11 @@ +# newdoc id = mf920901-001 +# newpar id = mf920901-001-p1 +# sent_id = mf920901-001-p1s1A +# text = Slovenská ústava: pro i proti +# text_en = Slovak constitution: pros and cons +1 Slovenská slovenský ADJ AAFS1----1A---- Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos 2 amod _ _ +2 ústava ústava NOUN NNFS1-----A---- Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos 0 root _ SpaceAfter=No +3 : : PUNCT Z:------------- _ 2 punct _ _ +4 pro pro ADP RR--4---------- Case=Acc 2 appos _ LId=pro-1 +5 i i CCONJ J^------------- _ 6 cc _ LId=i-1 +6 proti proti ADP RR--3---------- Case=Dat 4 conj _ LId=proti-1 diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-sentence_bounaries_and_comments.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-sentence_bounaries_and_comments.conllu new file mode 100644 index 0000000000..90b60e42e9 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-sentence_bounaries_and_comments.conllu @@ -0,0 +1,16 @@ +# sent_id = 1 +# text = They buy and sell books. +1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _ +2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _ +3 and and CONJ CC _ 4 cc 4:cc _ +4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _ +5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No +6 . . PUNCT . _ 2 punct 2:punct _ + +# sent_id = 2 +# text = I have no clue. +1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _ +2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _ +3 no no DET DT PronType=Neg 4 det _ _ +4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No +5 . . PUNCT . _ 2 punct _ _ diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-syntactic_annotation.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-syntactic_annotation.conllu new file mode 100644 index 0000000000..e3cf89b165 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-syntactic_annotation.conllu @@ -0,0 +1,8 @@ +# text = They buy and sell books . +1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _ +2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _ +3 and and CONJ CC _ 4 cc 4:cc _ +4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _ +5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj _ +6 . . PUNCT . _ 2 punct 2:punct _ + diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-untokenized_text.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-untokenized_text.conllu new file mode 100644 index 0000000000..6f9cd45cea --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-untokenized_text.conllu @@ -0,0 +1,19 @@ +# text = Er arbeitet fürs FBI (deutsch etwa: „Bundesamt für Ermittlung“). +# text_en = He works for the FBI (German approx: “Bundesamt für Ermittlung”). +1 Er er PRON _ _ _ _ _ _ +2 arbeitet arbeiten VERB _ _ _ _ _ _ +3-4 fürs _ _ _ _ _ _ _ _ +3 für für ADP _ _ _ _ _ _ +4 das der DET _ _ _ _ _ _ +5 FBI FBI PROPN _ _ _ _ _ _ +6 ( ( PUNCT _ _ _ _ _ SpaceAfter=No +7 deutsch deutsch ADV _ _ _ _ _ _ +8 etwa etwa ADV _ _ _ _ _ SpaceAfter=No +9 : : PUNCT _ _ _ _ _ _ +10 „ „ PUNCT _ _ _ _ _ SpaceAfter=No +11 Bundesamt Bundesamt NOUN _ _ _ _ _ _ +12 für für ADP _ _ _ _ _ _ +13 Ermittlung Ermittlung NOUN _ _ _ _ _ SpaceAfter=No +14 “ “ PUNCT _ _ _ _ _ SpaceAfter=No +15 ) ) PUNCT _ _ _ _ _ SpaceAfter=No +16 . . PUNCT _ _ _ _ _ _ diff --git a/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-words_and_tokens.conllu b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-words_and_tokens.conllu new file mode 100644 index 0000000000..8fbc4bfa05 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-words_and_tokens.conllu @@ -0,0 +1,9 @@ +# text = vamos nos a el mar +1-2 vámonos _ _ _ _ _ _ _ _ +1 vamos ir _ _ _ _ _ _ _ +2 nos nosotros _ _ _ _ _ _ _ +3-4 al _ _ _ _ _ _ _ _ +3 a a _ _ _ _ _ _ _ +4 el el _ _ _ _ _ _ _ +5 mar mar _ _ _ _ _ _ _ + diff --git a/dkpro-core-io-conll-asl/src/test/resources/log4j.properties b/dkpro-core-io-conll-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-conll-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-conll-asl/src/test/resources/log4j2.xml b/dkpro-core-io-conll-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-conll-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-ditop-asl/pom.xml b/dkpro-core-io-ditop-asl/pom.xml index 22d753612d..ae54f8932f 100644 --- a/dkpro-core-io-ditop-asl/pom.xml +++ b/dkpro-core-io-ditop-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.ditop-asl + dkpro-core-io-ditop-asl jar DKPro Core ASL - IO - Ditop (EXPERIMENTAL) + https://dkpro.github.io/dkpro-core/ DKPro components for DiTop: http://ditop.hs8.de/ @@ -50,20 +51,24 @@ commons-collections4 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.mallet-asl + org.dkpro.core + dkpro-core-mallet-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -71,14 +76,31 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl + org.dkpro.core + dkpro-core-io-text-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.tokit-asl + org.dkpro.core + dkpro-core-tokit-asl test + + + + eu.openminted.share.annotations + omtd-share-annotations-maven-plugin + + + + **/DiTopWriter.xml + + + + + \ No newline at end of file diff --git a/dkpro-core-io-ditop-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ditop/DiTopWriter.java b/dkpro-core-io-ditop-asl/src/main/java/org/dkpro/core/io/ditop/DiTopWriter.java similarity index 92% rename from dkpro-core-io-ditop-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ditop/DiTopWriter.java rename to dkpro-core-io-ditop-asl/src/main/java/org/dkpro/core/io/ditop/DiTopWriter.java index 66fd392ddc..fbf688d9d1 100644 --- a/dkpro-core-io-ditop-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/ditop/DiTopWriter.java +++ b/dkpro-core-io-ditop-asl/src/main/java/org/dkpro/core/io/ditop/DiTopWriter.java @@ -1,434 +1,443 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.ditop; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.TreeSet; - -import org.apache.commons.collections4.Bag; -import org.apache.commons.collections4.bag.HashBag; -import org.apache.commons.io.FileUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.DoubleArray; -import org.apache.uima.resource.ResourceInitializationException; - -import cc.mallet.topics.ParallelTopicModel; -import cc.mallet.types.Alphabet; -import cc.mallet.types.IDSorter; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; - -/** - * This annotator (consumer) writes output files as required by DiTop. It requires JCas input annotated by - * {@link de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer} using the same model. - */ -@ResourceMetaData(name="DiTop Writer") -@MimeTypeCapability({MimeTypes.APPLICATION_X_DITOP}) -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution" }) -public class DiTopWriter - extends JCasFileWriter_ImplBase -{ - private static final String FIELDSEPARATOR_CONFIGFILE = ";"; - private final static String DOC_TOPICS_FILE = "topics.csv"; - private final static String TOPIC_TERM_FILE = "topicTerm.txt"; - private final static String TOPIC_TERM_MATRIX_FILE = "topicTermMatrix.txt"; - private final static String TOPIC_SUMMARY_FILE = "topicTerm-T15.txt"; - private final static String CONFIG_FILE = "config.all"; - - /** - * The maximum number of topic words to extract. Default: 15 - */ - public static final String PARAM_MAX_TOPIC_WORDS = "maxTopicWords"; - @ConfigurationParameter(name = PARAM_MAX_TOPIC_WORDS, mandatory = true, defaultValue = "15") - private int maxTopicWords; - - /** - * A Mallet file storing a serialized {@link ParallelTopicModel}. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) - protected File modelLocation; - - /** - * The corpus name is used to name the corresponding sub-directory and will be set in the - * configuration file. - */ - public static final String PARAM_CORPUS_NAME = "corpusName"; - @ConfigurationParameter(name = PARAM_CORPUS_NAME, mandatory = true) - protected String corpusName; - - /** - * Directory in which to store output files. - */ - public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; - @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) - protected File targetLocation; - - /** - * If set to true, the new corpus will be appended to an existing config file. If false, the - * existing file is overwritten. Default: true. - */ - public static final String PARAM_APPEND_CONFIG = "appendConfig"; - @ConfigurationParameter(name = PARAM_APPEND_CONFIG, mandatory = true, defaultValue = "true") - protected boolean appendConfig; - - /** - * If set, only documents with one of the listed collection IDs are written, all others are - * ignored. If this is empty (null), all documents are written. - */ - public final static String PARAM_COLLECTION_VALUES = "collectionValues"; - @ConfigurationParameter(name = PARAM_COLLECTION_VALUES, mandatory = false) - protected String[] collectionValues; - - /** - * If true (default), only write documents with collection ids matching one of the collection - * values exactly. If false, write documents with collection ids containing any of the - * collection value string in collection while ignoring cases. - */ - public final static String PARAM_COLLECTION_VALUES_EXACT_MATCH = "collectionValuesExactMatch"; - @ConfigurationParameter(name = PARAM_COLLECTION_VALUES_EXACT_MATCH, mandatory = true, defaultValue = "true") - protected boolean collectionValuesExactMatch; - - private ParallelTopicModel model; - private File collectionDir; - protected Set collectionValuesSet; - private Bag collectionCounter; - - protected BufferedWriter writerDocTopic; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - try { - model = ParallelTopicModel.read(modelLocation); - collectionDir = new File(targetLocation, corpusName + "_" + model.getNumTopics()); - if (collectionDir.exists()) { - getLogger().warn( - String.format("%s' already exists, overwriting content.", collectionDir)); - } - collectionDir.mkdirs(); - initializeTopicFile(); - } - catch (Exception e) { - throw new ResourceInitializationException(e); - } - - collectionValuesSet = collectionValues == null ? - Collections. emptySet() : new HashSet<>(Arrays.asList(collectionValues)); - collectionCounter = new HashBag<>(); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - for (TopicDistribution distribution : select(aJCas, TopicDistribution.class)) { - String docName = getDocumentId(aJCas); - String collectionId = getCollectionId(aJCas); - - /* Print and gather collection statistics */ - if (collectionCounter.getCount(collectionId) == 0) { - getLogger().info("New collection ID observed: " + collectionId); - } - collectionCounter.add(collectionId); - - try { - writeDocTopic(distribution, docName, collectionId); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - } - - protected void writeDocTopic(TopicDistribution distribution, String docName, String collectionId) - throws IOException - { - /* filter by collection id if PARAM_COLLECTION_VALUES is set */ - if (collectionValuesSet.isEmpty() || collectionValuesSet.contains(collectionId)) { - /* write documents to file */ - writerDocTopic.write(collectionId + ","); - writerDocTopic.write(docName); - DoubleArray proportions = distribution.getTopicProportions(); - for (double topicProb : proportions.toArray()) { - writerDocTopic.write("," + topicProb); - } - writerDocTopic.newLine(); - } - } - - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - super.collectionProcessComplete(); - - getLogger().info("Collection statistics: " + collectionCounter.toString()); - getLogger().info( - collectionValuesSet.isEmpty() ? - "Writing all documents." : - "Writing documents from these collections only: " - + collectionValuesSet.toString()); - - try { - writerDocTopic.close(); - writetermMatrixFiles(); - writeConfigFile(); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - - } - - private void initializeTopicFile() - throws IOException - { - File topicFile = new File(collectionDir, DOC_TOPICS_FILE); - getLogger().info(String.format("Writing file '%s'.", topicFile.getPath())); - writerDocTopic = new BufferedWriter(new FileWriter(topicFile)); - - /* Write header */ - writerDocTopic.write("Class,Document"); - for (int j = 0; j < model.numTopics; j++) { - writerDocTopic.write(",T" + j); - } - writerDocTopic.newLine(); - } - - /** - * This method has been copied and slightly adapted from MalletLDA#printTopics in the original - * DiTop code. - * - * @throws IOException - * if a low-level I/O error occurs - */ - private void writetermMatrixFiles() - throws IOException - { - File topicTermFile = new File(collectionDir, TOPIC_TERM_FILE); - File topicTermMatrixFile = new File(collectionDir, TOPIC_TERM_MATRIX_FILE); - File topicSummaryFile = new File(collectionDir, TOPIC_SUMMARY_FILE); - - BufferedWriter writerTopicTerm = new BufferedWriter(new FileWriter(topicTermFile)); - BufferedWriter writerTopicTermMatrix = new BufferedWriter(new FileWriter( - topicTermMatrixFile)); - BufferedWriter writerTopicTermShort = new BufferedWriter(new FileWriter(topicSummaryFile)); - - getLogger().info(String.format("Writing file '%s'.", topicTermFile)); - getLogger().info(String.format("Writing file '%s'.", topicTermMatrixFile)); - getLogger().info(String.format("Writing file '%s'.", topicSummaryFile)); - - /* Write topic term associations */ - Alphabet alphabet = model.getAlphabet(); - for (int i = 0; i < model.getSortedWords().size(); i++) { - writerTopicTerm.write("TOPIC " + i + ": "); - writerTopicTermShort.write("TOPIC " + i + ": "); - writerTopicTermMatrix.write("TOPIC " + i + ": "); - /** topic for the label */ - int count = 0; - TreeSet set = model.getSortedWords().get(i); - for (IDSorter s : set) { - if (count <= maxTopicWords) { - writerTopicTermShort.write(alphabet.lookupObject(s.getID()) + ", "); - } - count++; - writerTopicTerm.write(alphabet.lookupObject(s.getID()) + ", "); - writerTopicTermMatrix.write(alphabet.lookupObject(s.getID()) + " (" + s.getWeight() - + "), "); - /** add to topic label */ - } - writerTopicTerm.newLine(); - writerTopicTermShort.newLine(); - writerTopicTermMatrix.newLine(); - } - - writerTopicTermMatrix.close(); - writerTopicTerm.close(); - writerTopicTermShort.close(); - } - - private void writeConfigFile() - throws IOException - { - File configFile = new File(targetLocation, CONFIG_FILE); - Map> corpora; // holds all corpus names mapped to (multiple) topic - // numbers - Set currentCorpusTopicNumbers; // entry for the current, new topic - - if (appendConfig && configFile.exists()) { - // read existing entries from config file - corpora = readConfigFile(configFile); - currentCorpusTopicNumbers = corpora.containsKey(corpusName) ? - corpora.get(corpusName) : new HashSet<>(); - } - else { - corpora = new HashMap<>(); - currentCorpusTopicNumbers = new HashSet<>(1, 1); - } - - currentCorpusTopicNumbers.add(model.getNumTopics()); - corpora.put(corpusName, currentCorpusTopicNumbers); - - getLogger().info(String.format("Writing configuration file '%s'.", configFile.getPath())); - BufferedWriter configWriter = new BufferedWriter(new FileWriter(configFile)); - - for (Entry> entry : corpora.entrySet()) { - configWriter.write(entry.getKey()); - for (Integer topicNumber : entry.getValue()) { - configWriter.write(FIELDSEPARATOR_CONFIGFILE + topicNumber); - } - configWriter.newLine(); - } - configWriter.close(); - } - - /** - * Read config file in the form ;[;...] - *

- * Results in a Map :Set(ntopics1, ...) - * - * @param configFile - * the config file to read - * @return a map containing corpus names as keys and a set of topic numbers as values - * @throws IOException - * if an I/O error occurs. - */ - private static Map> readConfigFile(File configFile) - throws IOException - { - Map> entries = new HashMap<>(); - - for (String line : FileUtils.readLines(configFile)) { - String[] fields = line.split(FIELDSEPARATOR_CONFIGFILE); - if (fields.length < 2) { - throw new IllegalStateException(String.format( - "Could not parse config file '%s': Invalid line:%n%s", configFile, line)); - } - if (entries.containsKey(fields[0])) { - throw new IllegalStateException(String.format( - "Could not parse config file '%s': duplicate corpus entry '%s'.", - configFile, fields[0])); - } - Set topicCounts = new HashSet<>(fields.length - 1); - for (int i = 1; i < fields.length; i++) { - try { - topicCounts.add(Integer.parseInt(fields[i])); - } - catch (NumberFormatException e) { - throw new IllegalStateException(String.format( - "Could not parse config file '%s': Invalid topic number '%s'.", - configFile, fields[i])); - } - } - entries.put(fields[0], topicCounts); - } - return entries; - } - - /** - * Extract the collection id from the JCas. Uses {@link DocumentMetaData#getCollectionId()}, but - * this method can be overwritten to select a different source for the collection id. - * - * @param aJCas - * the JCas. - * @return the collection id String or null if it is not available. - */ - protected String getCollectionId(JCas aJCas) - { - String collectionId = DocumentMetaData.get(aJCas).getCollectionId(); - if (collectionId == null) { - throw new IllegalStateException("Could not extract collection ID for document"); - } - - if (!collectionValuesExactMatch && !collectionValuesSet.contains(collectionId)) { - collectionId = expandCollectionId(collectionId); - } - - return collectionId; - } - - /** - * This method checks whether any of the specified collection values contains the given String. - * If it does, returns the matching value; if not, it returns the original value. - * - * @param collectionId - * the collection ID. - * @return the first entry from {@code collectionValuesSet} that contains the (lowercased) - * {@code collectionId} or the input {@code collectionId}. - */ - protected String expandCollectionId(String collectionId) - { - assert !collectionValuesExactMatch; - for (String value : collectionValuesSet) { - if (collectionId.toLowerCase().contains(value.toLowerCase())) { - getLogger().debug( - String.format("Changing collection ID from '%s' to '%s'.", - collectionId, value)); - return value; - } - } - return collectionId; - } - - /** - * Extract the document id from the JCas. Uses {@link DocumentMetaData#getDocumentId()}, but - * this method can be overwritten to select a different source for the document id. - * - * @param aJCas - * the JCas. - * @return the document id string or null if it is not available. - */ - protected String getDocumentId(JCas aJCas) - throws IllegalStateException - { - String docName = DocumentMetaData.get(aJCas).getDocumentId(); - if (docName == null) { - throw new IllegalStateException("Could not extract document ID from metadata."); - } - return docName; - } -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.ditop; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.commons.collections4.Bag; +import org.apache.commons.collections4.bag.HashBag; +import org.apache.commons.io.FileUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.DoubleArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; + +import cc.mallet.topics.ParallelTopicModel; +import cc.mallet.types.Alphabet; +import cc.mallet.types.IDSorter; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; + +/** + * This annotator (consumer) writes output files as required by + * DiTop. It requires JCas input annotated by + * {@link org.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer} using the same + * model. + */ +@ResourceMetaData(name = "DiTop Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + DiTopWriter.PARAM_TARGET_LOCATION }) +@MimeTypeCapability({MimeTypes.APPLICATION_X_DITOP}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution" }) +public class DiTopWriter + extends JCasFileWriter_ImplBase +{ + private static final String FIELDSEPARATOR_CONFIGFILE = ";"; + private final static String DOC_TOPICS_FILE = "topics.csv"; + private final static String TOPIC_TERM_FILE = "topicTerm.txt"; + private final static String TOPIC_TERM_MATRIX_FILE = "topicTermMatrix.txt"; + private final static String TOPIC_SUMMARY_FILE = "topicTerm-T15.txt"; + private final static String CONFIG_FILE = "config.all"; + + /** + * The maximum number of topic words to extract. + */ + public static final String PARAM_MAX_TOPIC_WORDS = "maxTopicWords"; + @ConfigurationParameter(name = PARAM_MAX_TOPIC_WORDS, mandatory = true, defaultValue = "15") + private int maxTopicWords; + + /** + * A Mallet file storing a serialized {@link ParallelTopicModel}. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) + protected File modelLocation; + + /** + * The corpus name is used to name the corresponding sub-directory and will be set in the + * configuration file. + */ + public static final String PARAM_CORPUS_NAME = "corpusName"; + @ConfigurationParameter(name = PARAM_CORPUS_NAME, mandatory = true) + protected String corpusName; + + /** + * Directory in which to store output files. + */ + public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; + @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) + protected File targetLocation; + + /** + * If set to true, the new corpus will be appended to an existing config file. If false, the + * existing file is overwritten. + */ + public static final String PARAM_APPEND_CONFIG = "appendConfig"; + @ConfigurationParameter(name = PARAM_APPEND_CONFIG, mandatory = true, defaultValue = "true") + protected boolean appendConfig; + + /** + * If set, only documents with one of the listed collection IDs are written, all others are + * ignored. If this is empty (null), all documents are written. + */ + public final static String PARAM_COLLECTION_VALUES = "collectionValues"; + @ConfigurationParameter(name = PARAM_COLLECTION_VALUES, mandatory = false) + protected String[] collectionValues; + + /** + * If true (default), only write documents with collection ids matching one of the collection + * values exactly. If false, write documents with collection ids containing any of the + * collection value string in collection while ignoring cases. + */ + public final static String PARAM_COLLECTION_VALUES_EXACT_MATCH = "collectionValuesExactMatch"; + @ConfigurationParameter(name = PARAM_COLLECTION_VALUES_EXACT_MATCH, mandatory = true, defaultValue = "true") + protected boolean collectionValuesExactMatch; + + private ParallelTopicModel model; + private File collectionDir; + protected Set collectionValuesSet; + private Bag collectionCounter; + + protected BufferedWriter writerDocTopic; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + try { + model = ParallelTopicModel.read(modelLocation); + collectionDir = new File(targetLocation, corpusName + "_" + model.getNumTopics()); + if (collectionDir.exists()) { + getLogger().warn( + String.format("%s' already exists, overwriting content.", collectionDir)); + } + collectionDir.mkdirs(); + initializeTopicFile(); + } + catch (Exception e) { + throw new ResourceInitializationException(e); + } + + collectionValuesSet = collectionValues == null ? Collections.emptySet() + : new HashSet<>(Arrays.asList(collectionValues)); + collectionCounter = new HashBag<>(); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + for (TopicDistribution distribution : select(aJCas, TopicDistribution.class)) { + String docName = getDocumentId(aJCas); + String collectionId = getCollectionId(aJCas); + + /* Print and gather collection statistics */ + if (collectionCounter.getCount(collectionId) == 0) { + getLogger().info("New collection ID observed: " + collectionId); + } + collectionCounter.add(collectionId); + + try { + writeDocTopic(distribution, docName, collectionId); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + } + + protected void writeDocTopic(TopicDistribution distribution, String docName, + String collectionId) + throws IOException + { + /* filter by collection id if PARAM_COLLECTION_VALUES is set */ + if (collectionValuesSet.isEmpty() || collectionValuesSet.contains(collectionId)) { + /* write documents to file */ + writerDocTopic.write(collectionId + ","); + writerDocTopic.write(docName); + DoubleArray proportions = distribution.getTopicProportions(); + for (double topicProb : proportions.toArray()) { + writerDocTopic.write("," + topicProb); + } + writerDocTopic.newLine(); + } + } + + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + super.collectionProcessComplete(); + + getLogger().info("Collection statistics: " + collectionCounter.toString()); + getLogger().info( + collectionValuesSet.isEmpty() ? + "Writing all documents." : + "Writing documents from these collections only: " + + collectionValuesSet.toString()); + + try { + writerDocTopic.close(); + writetermMatrixFiles(); + writeConfigFile(); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + + } + + private void initializeTopicFile() + throws IOException + { + File topicFile = new File(collectionDir, DOC_TOPICS_FILE); + getLogger().info(String.format("Writing file '%s'.", topicFile.getPath())); + writerDocTopic = new BufferedWriter(new FileWriter(topicFile)); + + /* Write header */ + writerDocTopic.write("Class,Document"); + for (int j = 0; j < model.numTopics; j++) { + writerDocTopic.write(",T" + j); + } + writerDocTopic.newLine(); + } + + /** + * This method has been copied and slightly adapted from MalletLDA#printTopics in the original + * DiTop code. + * + * @throws IOException + * if a low-level I/O error occurs + */ + private void writetermMatrixFiles() + throws IOException + { + File topicTermFile = new File(collectionDir, TOPIC_TERM_FILE); + File topicTermMatrixFile = new File(collectionDir, TOPIC_TERM_MATRIX_FILE); + File topicSummaryFile = new File(collectionDir, TOPIC_SUMMARY_FILE); + + BufferedWriter writerTopicTerm = new BufferedWriter(new FileWriter(topicTermFile)); + BufferedWriter writerTopicTermMatrix = new BufferedWriter(new FileWriter( + topicTermMatrixFile)); + BufferedWriter writerTopicTermShort = new BufferedWriter(new FileWriter(topicSummaryFile)); + + getLogger().info(String.format("Writing file '%s'.", topicTermFile)); + getLogger().info(String.format("Writing file '%s'.", topicTermMatrixFile)); + getLogger().info(String.format("Writing file '%s'.", topicSummaryFile)); + + /* Write topic term associations */ + Alphabet alphabet = model.getAlphabet(); + for (int i = 0; i < model.getSortedWords().size(); i++) { + writerTopicTerm.write("TOPIC " + i + ": "); + writerTopicTermShort.write("TOPIC " + i + ": "); + writerTopicTermMatrix.write("TOPIC " + i + ": "); + /** topic for the label */ + int count = 0; + TreeSet set = model.getSortedWords().get(i); + for (IDSorter s : set) { + if (count <= maxTopicWords) { + writerTopicTermShort.write(alphabet.lookupObject(s.getID()) + ", "); + } + count++; + writerTopicTerm.write(alphabet.lookupObject(s.getID()) + ", "); + writerTopicTermMatrix.write(alphabet.lookupObject(s.getID()) + " (" + s.getWeight() + + "), "); + /** add to topic label */ + } + writerTopicTerm.newLine(); + writerTopicTermShort.newLine(); + writerTopicTermMatrix.newLine(); + } + + writerTopicTermMatrix.close(); + writerTopicTerm.close(); + writerTopicTermShort.close(); + } + + private void writeConfigFile() + throws IOException + { + File configFile = new File(targetLocation, CONFIG_FILE); + Map> corpora; // holds all corpus names mapped to (multiple) topic + // numbers + Set currentCorpusTopicNumbers; // entry for the current, new topic + + if (appendConfig && configFile.exists()) { + // read existing entries from config file + corpora = readConfigFile(configFile); + currentCorpusTopicNumbers = corpora.containsKey(corpusName) ? + corpora.get(corpusName) : new HashSet<>(); + } + else { + corpora = new HashMap<>(); + currentCorpusTopicNumbers = new HashSet<>(1, 1); + } + + currentCorpusTopicNumbers.add(model.getNumTopics()); + corpora.put(corpusName, currentCorpusTopicNumbers); + + getLogger().info(String.format("Writing configuration file '%s'.", configFile.getPath())); + BufferedWriter configWriter = new BufferedWriter(new FileWriter(configFile)); + + for (Entry> entry : corpora.entrySet()) { + configWriter.write(entry.getKey()); + for (Integer topicNumber : entry.getValue()) { + configWriter.write(FIELDSEPARATOR_CONFIGFILE + topicNumber); + } + configWriter.newLine(); + } + configWriter.close(); + } + + /** + * Read config file in the form ;[;...] + *

+ * Results in a Map :Set(ntopics1, ...) + * + * @param configFile + * the config file to read + * @return a map containing corpus names as keys and a set of topic numbers as values + * @throws IOException + * if an I/O error occurs. + */ + private static Map> readConfigFile(File configFile) + throws IOException + { + Map> entries = new HashMap<>(); + + for (String line : FileUtils.readLines(configFile, UTF_8)) { + String[] fields = line.split(FIELDSEPARATOR_CONFIGFILE); + if (fields.length < 2) { + throw new IllegalStateException(String.format( + "Could not parse config file '%s': Invalid line:%n%s", configFile, line)); + } + if (entries.containsKey(fields[0])) { + throw new IllegalStateException(String.format( + "Could not parse config file '%s': duplicate corpus entry '%s'.", + configFile, fields[0])); + } + Set topicCounts = new HashSet<>(fields.length - 1); + for (int i = 1; i < fields.length; i++) { + try { + topicCounts.add(Integer.parseInt(fields[i])); + } + catch (NumberFormatException e) { + throw new IllegalStateException(String.format( + "Could not parse config file '%s': Invalid topic number '%s'.", + configFile, fields[i])); + } + } + entries.put(fields[0], topicCounts); + } + return entries; + } + + /** + * Extract the collection id from the JCas. Uses {@link DocumentMetaData#getCollectionId()}, but + * this method can be overwritten to select a different source for the collection id. + * + * @param aJCas + * the JCas. + * @return the collection id String or null if it is not available. + */ + protected String getCollectionId(JCas aJCas) + { + String collectionId = DocumentMetaData.get(aJCas).getCollectionId(); + if (collectionId == null) { + throw new IllegalStateException("Could not extract collection ID for document"); + } + + if (!collectionValuesExactMatch && !collectionValuesSet.contains(collectionId)) { + collectionId = expandCollectionId(collectionId); + } + + return collectionId; + } + + /** + * This method checks whether any of the specified collection values contains the given String. + * If it does, returns the matching value; if not, it returns the original value. + * + * @param collectionId + * the collection ID. + * @return the first entry from {@code collectionValuesSet} that contains the (lowercased) + * {@code collectionId} or the input {@code collectionId}. + */ + protected String expandCollectionId(String collectionId) + { + assert !collectionValuesExactMatch; + for (String value : collectionValuesSet) { + if (collectionId.toLowerCase().contains(value.toLowerCase())) { + getLogger().debug( + String.format("Changing collection ID from '%s' to '%s'.", + collectionId, value)); + return value; + } + } + return collectionId; + } + + /** + * Extract the document id from the JCas. Uses {@link DocumentMetaData#getDocumentId()}, but + * this method can be overwritten to select a different source for the document id. + * + * @param aJCas + * the JCas. + * @return the document id string or null if it is not available. + */ + protected String getDocumentId(JCas aJCas) + throws IllegalStateException + { + String docName = DocumentMetaData.get(aJCas).getDocumentId(); + if (docName == null) { + throw new IllegalStateException("Could not extract document ID from metadata."); + } + return docName; + } +} diff --git a/dkpro-core-io-ditop-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/ditop/DiTopWriterTest.java b/dkpro-core-io-ditop-asl/src/test/java/org/dkpro/core/io/ditop/DiTopWriterTest.java similarity index 92% rename from dkpro-core-io-ditop-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/ditop/DiTopWriterTest.java rename to dkpro-core-io-ditop-asl/src/test/java/org/dkpro/core/io/ditop/DiTopWriterTest.java index 651d6c9ebf..68ffc16e54 100644 --- a/dkpro-core-io-ditop-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/ditop/DiTopWriterTest.java +++ b/dkpro-core-io-ditop-asl/src/test/java/org/dkpro/core/io/ditop/DiTopWriterTest.java @@ -1,305 +1,310 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.ditop; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.IOException; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.junit.Before; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; -import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelTrainer; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -public class DiTopWriterTest -{ - private static final String DITOP_CORPUSNAME = "test"; - private static final String TARGET_DITOP = "target/ditop"; - private static final File MODEL_FILE = new File("target/mallet/model"); - private static final String CAS_DIR = "src/test/resources/txt"; - private static final String CAS_FILE_PATTERN = "[+]*.txt"; - - private static final int N_TOPICS = 10; - private static final int N_ITERATIONS = 50; - private static final String LANGUAGE = "en"; - - @Before - public void setUp() - throws Exception - { - /* Generate model */ - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, - TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, - TextReader.PARAM_LANGUAGE, LANGUAGE); - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription estimator = createEngineDescription( - MalletLdaTopicModelTrainer.class, - MalletLdaTopicModelTrainer.PARAM_TARGET_LOCATION, MODEL_FILE, - MalletLdaTopicModelTrainer.PARAM_N_ITERATIONS, N_ITERATIONS, - MalletLdaTopicModelTrainer.PARAM_N_TOPICS, N_TOPICS); - SimplePipeline.runPipeline(reader, segmenter, estimator); - - MODEL_FILE.deleteOnExit(); - } - - @Test - public void testSimple() - throws UIMAException, IOException - { - int expectedNDocuments = 2; - - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, - TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, - TextReader.PARAM_LANGUAGE, LANGUAGE); - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription inferencer = createEngineDescription( - MalletLdaTopicModelInferencer.class, - MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); - AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, - DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, - DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, - DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME); - - SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); - - /* test whether target files and dirs exist */ - File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); - File topicsFile = new File(contentDir, "topics.csv"); - File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); - File topicTermFile = new File(contentDir, "topicTerm.txt"); - File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); - - assertTrue(new File(TARGET_DITOP, "config.all").exists()); - assertTrue(contentDir.isDirectory()); - assertTrue(topicTermT15File.exists()); - assertTrue(topicTermFile.exists()); - assertTrue(topicTermMatrixFile.exists()); - assertTrue(topicsFile.exists()); - - /* check that file lengths are correct */ - assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size()); - MODEL_FILE.delete(); - } - - @Test - public void testCollectionValuesExact() - throws UIMAException, IOException - { - int expectedNDocuments = 2; - String exactName = new File(CAS_DIR).toURI().toString(); - String[] collectionValues = new String[] { exactName }; - boolean exactMatch = true; - - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, - TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, - TextReader.PARAM_LANGUAGE, LANGUAGE); - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription inferencer = createEngineDescription( - MalletLdaTopicModelInferencer.class, - MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); - AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, - DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, - DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, - DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME, - DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues, - DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch); - - SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); - - /* test whether target files and dirs exist */ - File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); - File topicsFile = new File(contentDir, "topics.csv"); - File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); - File topicTermFile = new File(contentDir, "topicTerm.txt"); - File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); - - assertTrue(new File(TARGET_DITOP, "config.all").exists()); - assertTrue(contentDir.isDirectory()); - assertTrue(topicTermT15File.exists()); - assertTrue(topicTermFile.exists()); - assertTrue(topicTermMatrixFile.exists()); - assertTrue(topicsFile.exists()); - - /* check that file lengths are correct */ - assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size()); - MODEL_FILE.delete(); - } - - @Test - public void testCollectionValuesExactNoMatch() - throws UIMAException, IOException - { - int expectedNDocuments = 0; - String[] collectionValues = new String[] { "file:/home/schnober/workspace/de.tudarmstadt.ukp.dkpro.core-asl/de.tudarmstadt.ukp.dkpro.core.io.ditop/src/test/resources/" }; - boolean exactMatch = true; - - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, - TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, - TextReader.PARAM_LANGUAGE, LANGUAGE); - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription inferencer = createEngineDescription( - MalletLdaTopicModelInferencer.class, - MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); - AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, - DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, - DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, - DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME, - DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues, - DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch); - - SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); - - /* test whether target files and dirs exist */ - File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); - File topicsFile = new File(contentDir, "topics.csv"); - File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); - File topicTermFile = new File(contentDir, "topicTerm.txt"); - File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); - - assertTrue(new File(TARGET_DITOP, "config.all").exists()); - assertTrue(contentDir.isDirectory()); - assertTrue(topicTermT15File.exists()); - assertTrue(topicTermFile.exists()); - assertTrue(topicTermMatrixFile.exists()); - assertTrue(topicsFile.exists()); - - /* check that file lengths are correct */ - assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size()); - MODEL_FILE.delete(); - } - - @Test - public void testCollectionValuesNotExact() - throws UIMAException, IOException - { - int expectedNDocuments = 2; - String[] collectionValues = new String[] { "txt" }; - boolean exactMatch = false; - - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, - TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, - TextReader.PARAM_LANGUAGE, LANGUAGE); - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription inferencer = createEngineDescription( - MalletLdaTopicModelInferencer.class, - MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); - AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, - DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, - DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, - DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME, - DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues, - DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch); - - SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); - - /* test whether target files and dirs exist */ - File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); - File topicsFile = new File(contentDir, "topics.csv"); - File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); - File topicTermFile = new File(contentDir, "topicTerm.txt"); - File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); - - assertTrue(new File(TARGET_DITOP, "config.all").exists()); - assertTrue(contentDir.isDirectory()); - assertTrue(topicTermT15File.exists()); - assertTrue(topicTermFile.exists()); - assertTrue(topicTermMatrixFile.exists()); - assertTrue(topicsFile.exists()); - - /* check that file lengths are correct */ - assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size()); - MODEL_FILE.delete(); - } - - @Test - public void testCollectionValuesNotExactNoMatch() - throws UIMAException, IOException - { - int expectedNDocuments = 0; - String[] collectionValues = new String[] { "abcd" }; - boolean exactMatch = false; - - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, - TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, - TextReader.PARAM_LANGUAGE, LANGUAGE); - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription inferencer = createEngineDescription( - MalletLdaTopicModelInferencer.class, - MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); - AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, - DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, - DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, - DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME, - DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues, - DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch); - - SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); - - /* test whether target files and dirs exist */ - File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); - File topicsFile = new File(contentDir, "topics.csv"); - File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); - File topicTermFile = new File(contentDir, "topicTerm.txt"); - File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); - - assertTrue(new File(TARGET_DITOP, "config.all").exists()); - assertTrue(contentDir.isDirectory()); - assertTrue(topicTermT15File.exists()); - assertTrue(topicTermFile.exists()); - assertTrue(topicTermMatrixFile.exists()); - assertTrue(topicsFile.exists()); - - /* check that file lengths are correct */ - assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size()); - assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size()); - MODEL_FILE.delete(); - } -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.ditop; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.ditop.DiTopWriter; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelTrainer; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Before; +import org.junit.Test; + +public class DiTopWriterTest +{ + private static final String DITOP_CORPUSNAME = "test"; + private static final String TARGET_DITOP = "target/ditop"; + private static final File MODEL_FILE = new File("target/mallet/model"); + private static final String CAS_DIR = "src/test/resources/txt"; + private static final String CAS_FILE_PATTERN = "[+]*.txt"; + + private static final int N_TOPICS = 10; + private static final int N_ITERATIONS = 50; + private static final String LANGUAGE = "en"; + + @Before + public void setUp() + throws Exception + { + /* Generate model */ + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, + TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, + TextReader.PARAM_LANGUAGE, LANGUAGE); + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription estimator = createEngineDescription( + MalletLdaTopicModelTrainer.class, + MalletLdaTopicModelTrainer.PARAM_TARGET_LOCATION, MODEL_FILE, + MalletLdaTopicModelTrainer.PARAM_N_ITERATIONS, N_ITERATIONS, + MalletLdaTopicModelTrainer.PARAM_N_TOPICS, N_TOPICS); + SimplePipeline.runPipeline(reader, segmenter, estimator); + + MODEL_FILE.deleteOnExit(); + } + + @Test + public void testSimple() + throws UIMAException, IOException + { + int expectedNDocuments = 2; + + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, + TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, + TextReader.PARAM_LANGUAGE, LANGUAGE); + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + AnalysisEngineDescription inferencer = createEngineDescription( + MalletLdaTopicModelInferencer.class, + MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); + AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, + DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, + DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, + DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME); + + SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); + + /* test whether target files and dirs exist */ + File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); + File topicsFile = new File(contentDir, "topics.csv"); + File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); + File topicTermFile = new File(contentDir, "topicTerm.txt"); + File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); + + assertTrue(new File(TARGET_DITOP, "config.all").exists()); + assertTrue(contentDir.isDirectory()); + assertTrue(topicTermT15File.exists()); + assertTrue(topicTermFile.exists()); + assertTrue(topicTermMatrixFile.exists()); + assertTrue(topicsFile.exists()); + + /* check that file lengths are correct */ + assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile, UTF_8).size()); + MODEL_FILE.delete(); + } + + @Test + public void testCollectionValuesExact() + throws UIMAException, IOException + { + int expectedNDocuments = 2; + String exactName = new File(CAS_DIR).toURI().toString(); + String[] collectionValues = new String[] { exactName }; + boolean exactMatch = true; + + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, + TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, + TextReader.PARAM_LANGUAGE, LANGUAGE); + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + AnalysisEngineDescription inferencer = createEngineDescription( + MalletLdaTopicModelInferencer.class, + MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); + AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, + DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, + DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, + DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME, + DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues, + DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch); + + SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); + + /* test whether target files and dirs exist */ + File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); + File topicsFile = new File(contentDir, "topics.csv"); + File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); + File topicTermFile = new File(contentDir, "topicTerm.txt"); + File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); + + assertTrue(new File(TARGET_DITOP, "config.all").exists()); + assertTrue(contentDir.isDirectory()); + assertTrue(topicTermT15File.exists()); + assertTrue(topicTermFile.exists()); + assertTrue(topicTermMatrixFile.exists()); + assertTrue(topicsFile.exists()); + + /* check that file lengths are correct */ + assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile, UTF_8).size()); + MODEL_FILE.delete(); + } + + @Test + public void testCollectionValuesExactNoMatch() + throws UIMAException, IOException + { + int expectedNDocuments = 0; + // FIXME I'm pretty sure this absolute path should not be here - check and relativize + // if possible. + String[] collectionValues = { + "file:/home/schnober/workspace/de.tudarmstadt.ukp.dkpro.core-asl/de.tudarmstadt.ukp.dkpro.core.io.ditop/src/test/resources/" + }; + boolean exactMatch = true; + + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, + TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, + TextReader.PARAM_LANGUAGE, LANGUAGE); + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + AnalysisEngineDescription inferencer = createEngineDescription( + MalletLdaTopicModelInferencer.class, + MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); + AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, + DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, + DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, + DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME, + DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues, + DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch); + + SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); + + /* test whether target files and dirs exist */ + File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); + File topicsFile = new File(contentDir, "topics.csv"); + File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); + File topicTermFile = new File(contentDir, "topicTerm.txt"); + File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); + + assertTrue(new File(TARGET_DITOP, "config.all").exists()); + assertTrue(contentDir.isDirectory()); + assertTrue(topicTermT15File.exists()); + assertTrue(topicTermFile.exists()); + assertTrue(topicTermMatrixFile.exists()); + assertTrue(topicsFile.exists()); + + /* check that file lengths are correct */ + assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile, UTF_8).size()); + MODEL_FILE.delete(); + } + + @Test + public void testCollectionValuesNotExact() + throws UIMAException, IOException + { + int expectedNDocuments = 2; + String[] collectionValues = new String[] { "txt" }; + boolean exactMatch = false; + + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, + TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, + TextReader.PARAM_LANGUAGE, LANGUAGE); + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + AnalysisEngineDescription inferencer = createEngineDescription( + MalletLdaTopicModelInferencer.class, + MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); + AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, + DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, + DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, + DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME, + DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues, + DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch); + + SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); + + /* test whether target files and dirs exist */ + File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); + File topicsFile = new File(contentDir, "topics.csv"); + File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); + File topicTermFile = new File(contentDir, "topicTerm.txt"); + File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); + + assertTrue(new File(TARGET_DITOP, "config.all").exists()); + assertTrue(contentDir.isDirectory()); + assertTrue(topicTermT15File.exists()); + assertTrue(topicTermFile.exists()); + assertTrue(topicTermMatrixFile.exists()); + assertTrue(topicsFile.exists()); + + /* check that file lengths are correct */ + assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile, UTF_8).size()); + MODEL_FILE.delete(); + } + + @Test + public void testCollectionValuesNotExactNoMatch() + throws UIMAException, IOException + { + int expectedNDocuments = 0; + String[] collectionValues = new String[] { "abcd" }; + boolean exactMatch = false; + + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, CAS_DIR, + TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN, + TextReader.PARAM_LANGUAGE, LANGUAGE); + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + AnalysisEngineDescription inferencer = createEngineDescription( + MalletLdaTopicModelInferencer.class, + MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE); + AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class, + DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP, + DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE, + DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME, + DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues, + DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch); + + SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter); + + /* test whether target files and dirs exist */ + File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS); + File topicsFile = new File(contentDir, "topics.csv"); + File topicTermT15File = new File(contentDir, "topicTerm-T15.txt"); + File topicTermFile = new File(contentDir, "topicTerm.txt"); + File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt"); + + assertTrue(new File(TARGET_DITOP, "config.all").exists()); + assertTrue(contentDir.isDirectory()); + assertTrue(topicTermT15File.exists()); + assertTrue(topicTermFile.exists()); + assertTrue(topicTermMatrixFile.exists()); + assertTrue(topicsFile.exists()); + + /* check that file lengths are correct */ + assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile, UTF_8).size()); + assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile, UTF_8).size()); + MODEL_FILE.delete(); + } +} diff --git a/dkpro-core-io-fangorn-asl/pom.xml b/dkpro-core-io-fangorn-asl/pom.xml deleted file mode 100644 index 6e34fc99ac..0000000000 --- a/dkpro-core-io-fangorn-asl/pom.xml +++ /dev/null @@ -1,121 +0,0 @@ - - - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.io.fangorn-asl - jar - DKPro Core ASL - IO - Fangorn - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimafit-core - - - org.apache.commons - commons-lang3 - - - au.edu.unimelb.csse - fangorn - r68 - - - org.apache.lucene - lucene-core - - 2.4.1 - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl - - - junit - junit - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-model-parser-en-chunking - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-en-maxent - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-en-maxent - test - - - - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT - pom - import - - - - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-en-maxent - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-en-maxent - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-parser-en-chunking - - - - - - - \ No newline at end of file diff --git a/dkpro-core-io-fangorn-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/fangorn/FangornWriter.java b/dkpro-core-io-fangorn-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/fangorn/FangornWriter.java deleted file mode 100644 index 25c3398339..0000000000 --- a/dkpro-core-io-fangorn-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/fangorn/FangornWriter.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.fangorn; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.io.File; -import java.io.IOException; - -import org.apache.commons.lang3.exception.ExceptionUtils; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexWriter; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasConsumer_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Level; - -import au.edu.unimelb.csse.ParseException; -import au.edu.unimelb.csse.analyser.Node; -import au.edu.unimelb.csse.analyser.NodeTreebankAnalyser; -import au.edu.unimelb.csse.analyser.OverflowException; -import au.edu.unimelb.csse.analyser.String2NodesParser; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; - -/** - * Fangorn index writer. - */ -@ResourceMetaData(name="Fangorn Index Writer") -@MimeTypeCapability({MimeTypes.APPLICATION_X_FANGORN}) -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree"}) -public class FangornWriter - extends JCasConsumer_ImplBase -{ - public static final String FIELD_FANGORN = "sent"; - public static final String FIELD_COLLECTION_ID = "collectionId"; - public static final String FIELD_DOCUMENT_ID = "documentId"; - public static final String FIELD_BEGIN = "begin"; - public static final String FIELD_END = "end"; - - /** - * Location to which the output is written. - */ - public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; - @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) - private File outputFolder; - - private IndexWriter writer; - private NodeTreebankAnalyser analyser; - private final String2NodesParser parser = new String2NodesParser(); - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - analyser = new NodeTreebankAnalyser(false); - - try { - writer = new IndexWriter(outputFolder, analyser, true, - IndexWriter.MaxFieldLength.UNLIMITED); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - DocumentMetaData meta = DocumentMetaData.get(aJCas); - - for (PennTree s : select(aJCas, PennTree.class)) { - Node root; - try { - root = parser.parse(s.getPennTree()); - } - catch (ParseException e) { - getContext().getLogger().log(Level.SEVERE, ExceptionUtils.getRootCauseMessage(e)); - continue; - } - - String asJson = root.asJSONString(); - Document d = new Document(); - d.add(new Field("documentId", meta.getDocumentId(), Field.Store.YES, - Field.Index.NOT_ANALYZED, Field.TermVector.NO)); - d.add(new Field("collectionId", meta.getCollectionId(), Field.Store.YES, - Field.Index.NOT_ANALYZED, Field.TermVector.NO)); - d.add(new Field("begin", Integer.toString(s.getBegin()), Field.Store.YES, - Field.Index.NOT_ANALYZED, Field.TermVector.NO)); - d.add(new Field("end", Integer.toString(s.getEnd()), Field.Store.YES, - Field.Index.NOT_ANALYZED, Field.TermVector.NO)); - d.add(new Field("sent", asJson, Field.Store.COMPRESS, Field.Index.ANALYZED_NO_NORMS, - Field.TermVector.WITH_POSITIONS)); - try { - writer.addDocument(d); - } - catch (OverflowException e) { - getContext().getLogger().log(Level.SEVERE, ExceptionUtils.getRootCauseMessage(e)); - continue; - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } - } - - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - if (writer != null) { - try { - writer.close(); - } - catch (IOException e) { - // Ignore exception on close - } - } - } -} diff --git a/dkpro-core-io-fangorn-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/fangorn/FangornWriterTest.java b/dkpro-core-io-fangorn-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/fangorn/FangornWriterTest.java deleted file mode 100644 index 33f72e9e80..0000000000 --- a/dkpro-core-io-fangorn-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/fangorn/FangornWriterTest.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.fangorn; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.junit.Assert.assertEquals; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; -import org.apache.lucene.document.Document; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.store.FSDirectory; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import au.edu.unimelb.csse.queryParser.QueryBuilder; -import au.edu.unimelb.csse.search.SimpleHitCollector; -import au.edu.unimelb.csse.search.TreebankQuery; -import au.edu.unimelb.csse.search.complete.AllResults; -import au.edu.unimelb.csse.search.complete.Result; -import au.edu.unimelb.csse.search.join.TermJoinType; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpParser; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter; - -public class FangornWriterTest -{ - @Test - public void test() - throws Exception - { - File outputFile = new File("target/test-output"); - - JCas jcas = JCasFactory.createJCas(); - - jcas.setDocumentLanguage("en"); - jcas.setDocumentText("This is a test. I may work. Or it may not work."); - - DocumentMetaData meta = DocumentMetaData.create(jcas); - meta.setCollectionId("dummyCollection"); - meta.setDocumentId("dummyId"); - - AnalysisEngineDescription segmenter = createEngineDescription(OpenNlpSegmenter.class); - - AnalysisEngineDescription parser = createEngineDescription(OpenNlpParser.class, - OpenNlpParser.PARAM_WRITE_PENN_TREE, true); - - AnalysisEngineDescription writer = createEngineDescription(FangornWriter.class, - FangornWriter.PARAM_TARGET_LOCATION, outputFile); - - SimplePipeline.runPipeline(jcas, segmenter, parser, writer); - - IndexSearcher searcher = new IndexSearcher(FSDirectory.getDirectory(outputFile)); - QueryBuilder builder = new QueryBuilder("//NP"); - TreebankQuery tq = builder.parse(TermJoinType.SIMPLE_WITH_FC, false); - SimpleHitCollector hitCollector = new SimpleHitCollector(100); - searcher.search(tq, hitCollector); - AllResults allResults = new AllResults(hitCollector.hits, hitCollector.totalHits, tq); - - Result[] resultMeta = allResults.collect(searcher); - - String[] results = new String[hitCollector.totalHits]; - for (int i = 0; i < hitCollector.totalHits; i++) { - results[i] = searcher.doc(hitCollector.hits[i]).get("sent").trim(); - } - - List actual = new ArrayList(); - - for (int i = 0; i < hitCollector.totalHits; i++) { - Document doc = searcher.doc(hitCollector.hits[i]); - actual.add(String.format("%s %s %s %s %s", - doc.get(FangornWriter.FIELD_COLLECTION_ID), - doc.get(FangornWriter.FIELD_DOCUMENT_ID), - doc.get(FangornWriter.FIELD_BEGIN), - doc.get(FangornWriter.FIELD_END), - resultMeta[i].asJSONString().replace('"', '\''))); - } - - List expected = asList( - "dummyCollection dummyId 0 15 {'num':'2','ms':[{'m':[{'s':'','e':'1_0_2_8','o':'0','t':'0'}]},{'m':[{'s':'','e':'4_2_3_6','o':'0','t':'0'}]}]}", - "dummyCollection dummyId 16 27 {'num':'1','ms':[{'m':[{'s':'','e':'1_0_2_7','o':'0','t':'0'}]}]}", - "dummyCollection dummyId 28 47 {'num':'1','ms':[{'m':[{'s':'','e':'2_1_2_9','o':'0','t':'0'}]}]}"); - - assertEquals(StringUtils.join(expected, "\n"), StringUtils.join(actual, "\n")); - } -} diff --git a/dkpro-core-io-gate-asl/pom.xml b/dkpro-core-io-gate-asl/pom.xml index 1dd5092972..fdf50f60a8 100644 --- a/dkpro-core-io-gate-asl/pom.xml +++ b/dkpro-core-io-gate-asl/pom.xml @@ -15,18 +15,20 @@ See the License for the specific language governing permissions and limitations under the License. --> - 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.gate-asl + dkpro-core-io-gate-asl jar DKPro Core ASL - IO - GATE (v ${gate.version}) + https://dkpro.github.io/dkpro-core/ 8.2 @@ -59,24 +61,24 @@ fastutil - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-ner-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl junit @@ -84,13 +86,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.conll-asl + org.dkpro.core + dkpro-core-io-conll-asl test diff --git a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlReader.java b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlReader.java similarity index 83% rename from dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlReader.java rename to dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlReader.java index a3eb219925..85cf716d60 100644 --- a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlReader.java +++ b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlReader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.gate; +package org.dkpro.core.io.gate; import java.io.IOException; @@ -26,13 +26,18 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; import gate.Gate; import gate.util.GateException; -@ResourceMetaData(name="GATE XML Reader") +/** + * Reader for the GATE XML format. + */ +@ResourceMetaData(name = "GATE XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_GATE_XML}) @TypeCapability( outputs = { diff --git a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriter.java b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlWriter.java similarity index 82% rename from dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriter.java rename to dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlWriter.java index dd9806551a..7f49ec80a6 100644 --- a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriter.java +++ b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlWriter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.gate; +package org.dkpro.core.io.gate; import java.io.OutputStream; @@ -27,17 +27,23 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.gate.internal.DKPro2Gate; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.io.gate.internal.DKPro2Gate; +import eu.openminted.share.annotations.api.DocumentationResource; import gate.DocumentExporter; import gate.corpora.DocumentImpl; import gate.corpora.export.GateXMLExporter; import gate.util.GateException; -@ResourceMetaData(name="GATE XML Writer") +/** + * Writer for the GATE XML format. This writer uses an explicit mapping from DKPro Core types + * to typical GATE naming convensions. + */ +@ResourceMetaData(name = "GATE XML Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_GATE_XML}) @TypeCapability( inputs = { @@ -49,7 +55,8 @@ public class GateXmlWriter * Specify the suffix of output files. Default value .xml. If the suffix is not * needed, provide an empty string as value. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml") private String filenameSuffix; @@ -64,7 +71,8 @@ public class GateXmlWriter * Character encoding used by the output files. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; private DocumentExporter exporter; diff --git a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriter2.java b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlWriter2.java similarity index 94% rename from dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriter2.java rename to dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlWriter2.java index b1216c93bd..653cf46a7e 100644 --- a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriter2.java +++ b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/GateXmlWriter2.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.gate; +package org.dkpro.core.io.gate; import static org.apache.uima.fit.util.JCasUtil.selectAll; @@ -46,10 +46,11 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.TOP; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; import gate.Annotation; import gate.AnnotationSet; import gate.DocumentExporter; @@ -62,7 +63,12 @@ import it.unimi.dsi.fastutil.ints.Int2IntMap; import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; -@ResourceMetaData(name="GATE XML Writer (generic)") +/** + * Writer for the GATE XML format. This writer attempts a generic transformation of the UIMA CAS + * into the GATE document structure. + */ +@ResourceMetaData(name = "GATE XML Writer (generic)") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_GATE_XML}) @TypeCapability( inputs = { @@ -74,7 +80,8 @@ public class GateXmlWriter2 * Specify the suffix of output files. Default value .xml. If the suffix is not * needed, provide an empty string as value. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml") private String filenameSuffix; @@ -82,7 +89,8 @@ public class GateXmlWriter2 * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; private DocumentExporter exporter; @@ -301,7 +309,8 @@ else if (FSUtil.isMultiValuedFeature(aFS, aFeature)) { } else if (aFS.getCAS().getTypeSystem().subsumes(CasUtil.getType(aFS.getCAS(), TOP.class), aFeature.getRange())) { - fm.put(aFeature.getShortName(), process(aProcessed, aAs, aFS.getFeatureValue(aFeature))); + fm.put(aFeature.getShortName(), + process(aProcessed, aAs, aFS.getFeatureValue(aFeature))); } else { throw new IllegalArgumentException("Unable to convert value of feature [" diff --git a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/DKPro2Gate.java b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/DKPro2Gate.java similarity index 88% rename from dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/DKPro2Gate.java rename to dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/DKPro2Gate.java index fbe1a94c10..b0d964c3a3 100644 --- a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/DKPro2Gate.java +++ b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/DKPro2Gate.java @@ -15,12 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.gate.internal; +package org.dkpro.core.io.gate.internal; -import static gate.creole.ANNIEConstants.*; -import static de.tudarmstadt.ukp.dkpro.core.io.gate.internal.GateAnnieConstants.FEAT_LEMMA; -import static de.tudarmstadt.ukp.dkpro.core.io.gate.internal.GateAnnieConstants.FEAT_STEM; +import static gate.creole.ANNIEConstants.LOCATION_ANNOTATION_TYPE; +import static gate.creole.ANNIEConstants.ORGANIZATION_ANNOTATION_TYPE; +import static gate.creole.ANNIEConstants.PERSON_ANNOTATION_TYPE; +import static gate.creole.ANNIEConstants.SENTENCE_ANNOTATION_TYPE; +import static gate.creole.ANNIEConstants.TOKEN_ANNOTATION_TYPE; +import static gate.creole.ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME; +import static gate.creole.ANNIEConstants.TOKEN_LENGTH_FEATURE_NAME; +import static gate.creole.ANNIEConstants.TOKEN_STRING_FEATURE_NAME; import static org.apache.uima.fit.util.JCasUtil.selectAll; +import static org.dkpro.core.io.gate.internal.GateAnnieConstants.FEAT_LEMMA; +import static org.dkpro.core.io.gate.internal.GateAnnieConstants.FEAT_STEM; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.TOP; @@ -40,7 +47,6 @@ import gate.util.GateException; import gate.util.SimpleFeatureMapImpl; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; -import org.apache.uima.cas.Type; public class DKPro2Gate { diff --git a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/Gate2DKPro.java b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/Gate2DKPro.java similarity index 93% rename from dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/Gate2DKPro.java rename to dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/Gate2DKPro.java index 79c537a3b4..5d9503bb70 100644 --- a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/Gate2DKPro.java +++ b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/Gate2DKPro.java @@ -15,13 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.gate.internal; +package org.dkpro.core.io.gate.internal; -import static de.tudarmstadt.ukp.dkpro.core.io.gate.internal.GateAnnieConstants.FEAT_LEMMA; -import static de.tudarmstadt.ukp.dkpro.core.io.gate.internal.GateAnnieConstants.FEAT_STEM; import static gate.creole.ANNIEConstants.SENTENCE_ANNOTATION_TYPE; import static gate.creole.ANNIEConstants.TOKEN_ANNOTATION_TYPE; import static gate.creole.ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME; +import static org.dkpro.core.io.gate.internal.GateAnnieConstants.FEAT_LEMMA; +import static org.dkpro.core.io.gate.internal.GateAnnieConstants.FEAT_STEM; import org.apache.uima.jcas.JCas; diff --git a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/GateAnnieConstants.java b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/GateAnnieConstants.java similarity index 94% rename from dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/GateAnnieConstants.java rename to dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/GateAnnieConstants.java index 41c045fe6b..a05a62ac9f 100644 --- a/dkpro-core-io-gate-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/gate/internal/GateAnnieConstants.java +++ b/dkpro-core-io-gate-asl/src/main/java/org/dkpro/core/io/gate/internal/GateAnnieConstants.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.gate.internal; +package org.dkpro.core.io.gate.internal; public final class GateAnnieConstants { diff --git a/dkpro-core-io-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriterTest.java b/dkpro-core-io-gate-asl/src/test/java/org/dkpro/core/io/gate/GateXmlWriterTest.java similarity index 88% rename from dkpro-core-io-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriterTest.java rename to dkpro-core-io-gate-asl/src/test/java/org/dkpro/core/io/gate/GateXmlWriterTest.java index a0df1cf9db..e4afe4022c 100644 --- a/dkpro-core-io-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriterTest.java +++ b/dkpro-core-io-gate-asl/src/test/java/org/dkpro/core/io/gate/GateXmlWriterTest.java @@ -15,19 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.gate; +package org.dkpro.core.io.gate; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.io.conll.Conll2000Reader; +import org.dkpro.core.io.conll.Conll2002Reader; +import org.dkpro.core.io.gate.GateXmlWriter; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2000Reader; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2002Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class GateXmlWriterTest { @Test diff --git a/dkpro-core-io-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriterTest2.java b/dkpro-core-io-gate-asl/src/test/java/org/dkpro/core/io/gate/GateXmlWriterTest2.java similarity index 80% rename from dkpro-core-io-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriterTest2.java rename to dkpro-core-io-gate-asl/src/test/java/org/dkpro/core/io/gate/GateXmlWriterTest2.java index 489174f59b..bb93d9558e 100644 --- a/dkpro-core-io-gate-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/gate/GateXmlWriterTest2.java +++ b/dkpro-core-io-gate-asl/src/test/java/org/dkpro/core/io/gate/GateXmlWriterTest2.java @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.gate; +package org.dkpro.core.io.gate; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.io.conll.Conll2000Reader; +import org.dkpro.core.io.gate.GateXmlWriter2; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2000Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class GateXmlWriterTest2 { @Test diff --git a/dkpro-core-io-gate-asl/src/test/resources/log4j.properties b/dkpro-core-io-gate-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-gate-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-gate-asl/src/test/resources/log4j2.xml b/dkpro-core-io-gate-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-gate-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-nyt-asl/LICENSE.txt b/dkpro-core-io-gigaword-asl/LICENSE.txt similarity index 100% rename from dkpro-core-io-nyt-asl/LICENSE.txt rename to dkpro-core-io-gigaword-asl/LICENSE.txt diff --git a/dkpro-core-io-gigaword-asl/pom.xml b/dkpro-core-io-gigaword-asl/pom.xml new file mode 100644 index 0000000000..baf29c1a92 --- /dev/null +++ b/dkpro-core-io-gigaword-asl/pom.xml @@ -0,0 +1,70 @@ + + + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-io-gigaword-asl + jar + DKPro Core ASL - IO - Gigaword + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + com.google.guava + guava + + + org.dkpro.core + dkpro-core-api-metadata-asl + + + org.dkpro.core + dkpro-core-api-resources-asl + + + org.dkpro.core + dkpro-core-api-io-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + junit + junit + test + + + org.assertj + assertj-core + test + + + diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java new file mode 100644 index 0000000000..b4278a3b38 --- /dev/null +++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.gigaword; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; + +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordArticle; +import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordDocuments; + +import com.google.common.collect.AbstractIterator; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * UIMA collection reader for plain text files. + */ +@ResourceMetaData(name = "Annotated Gigaword Text Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability(MimeTypes.TEXT_PLAIN) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) +public class AnnotatedGigawordReader + extends ResourceCollectionReaderBase +{ + private MultiFileArticleIterator iter; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + iter = new MultiFileArticleIterator(); + } + + @Override + public void getNext(CAS aJCas) + throws IOException, CollectionException + { + AnnotatedGigawordArticle article = iter.next(); + Resource res = article.getResource(); + initCas(aJCas, res, article.getId()); + + DocumentMetaData dmd = DocumentMetaData.get(aJCas); + dmd.setDocumentId(article.getId()); + + aJCas.setDocumentText(article.getText()); + } + + @Override + public boolean hasNext() throws IOException, CollectionException + { + return iter.hasNext(); + } + + private class MultiFileArticleIterator extends AbstractIterator + { + private Iterator currentFileIterator; + + @Override + protected AnnotatedGigawordArticle computeNext() + { + try { + while ( + (currentFileIterator == null || !currentFileIterator.hasNext()) + && AnnotatedGigawordReader.super.hasNext() + ) { + Resource res = nextFile(); + try (InputStream is = new BufferedInputStream(CompressionUtils + .getInputStream(res.getLocation(), res.getInputStream()))) { + currentFileIterator = AnnotatedGigawordDocuments + .fromAnnotatedGigawordFile(res).iterator(); + + } + } + } + catch (Exception e) { + throw new RuntimeException(e); + } + + if (currentFileIterator == null || !currentFileIterator.hasNext()) { + return endOfData(); + } + + return currentFileIterator.next(); + } + } +} diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordArticle.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordArticle.java new file mode 100644 index 0000000000..80c67bba60 --- /dev/null +++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordArticle.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.gigaword.internal; + +import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; + +public class AnnotatedGigawordArticle +{ + private final Resource res; + + private final String id; + + private final String text; + + public AnnotatedGigawordArticle(Resource aRes, String aId, String aText) + { + res = aRes; + id = aId; + text = aText; + } + + public Resource getResource() + { + return res; + } + + public String getId() + { + return id; + } + + public String getText() + { + return text; + } + + @Override + public String toString() + { + return "Article [id=" + id + ", text=" + text.substring(0, Math.min(100, text.length() - 1)) + + "...]"; + } +} diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java new file mode 100644 index 0000000000..2498c15cd0 --- /dev/null +++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.gigaword.internal; + +import java.io.BufferedInputStream; +import java.io.InputStream; +import java.util.Iterator; +import java.util.List; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; +import org.dkpro.core.api.resources.CompressionUtils; + +import com.google.common.collect.AbstractIterator; + +/** + * The LDC distributes annotated Gigaword as a moderate number of gzipped files, each of which has + * many documents concatenated together. This class lets you iterate over the documents stored in + * such a file. + */ +public class AnnotatedGigawordDocuments + implements Iterable +{ + private List articleList; + + private AnnotatedGigawordDocuments(List aArticleList) + { + articleList = aArticleList; + } + + public static AnnotatedGigawordDocuments fromAnnotatedGigawordFile(Resource aResource) + throws Exception + { + try (InputStream is = new BufferedInputStream(CompressionUtils + .getInputStream(aResource.getLocation(), aResource.getInputStream()))) { + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser saxParser = factory.newSAXParser(); + AnnotatedGigawordParser parser = new AnnotatedGigawordParser(aResource); + saxParser.parse(is, parser); + return new AnnotatedGigawordDocuments(parser.getArticleList()); + } + } + + @Override + public Iterator iterator() + { + return new AnnotatedArticlesIterator(); + } + + private class AnnotatedArticlesIterator + extends AbstractIterator + { + private int startNextIndex = 0; + + @Override + protected AnnotatedGigawordArticle computeNext() + { + if (startNextIndex >= articleList.size()) { + return endOfData(); + } + else { + AnnotatedGigawordArticle nextArticle = articleList.get(startNextIndex); + startNextIndex++; + return nextArticle; + } + } + } +} diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java new file mode 100644 index 0000000000..e8c68b7953 --- /dev/null +++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.gigaword.internal; + +import java.util.ArrayList; +import java.util.List; + +import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Read text from the Annotated Gigaword Corpus. This reader does not read any of the + * annotations yet. + */ +public class AnnotatedGigawordParser extends DefaultHandler +{ + private final Resource resource; + + private List articleList = new ArrayList<>(); + + // flags for parsing articles + private boolean inDocument = false; + private boolean inSentences = false; + private boolean inToken = false; + private boolean inWord = false; + private boolean inOffsetBegin = false; + + // variables for reconstructing articles + private StringBuilder docText = new StringBuilder(); + private String currentDocId = ""; + private String currentWord = ""; + private int currentOffsetBegin = 0; + + public AnnotatedGigawordParser(Resource aResource) + { + super(); + resource = aResource; + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) + { + if (qName.equals("DOC")) { + inDocument = true; + currentDocId = attributes.getValue("id"); + } + else if (inDocument && qName.equals("sentences")) { + inSentences = true; + } + else if (inSentences && qName.equals("token")) { + inToken = true; + } + else if (inToken && qName.equals("word")) { + inWord = true; + } + else if (inToken && qName.equals("CharacterOffsetBegin")) { + inOffsetBegin = true; + } + } + + @Override + public void endElement(String uri, String localName, String qName) + { + if (qName.equals("DOC")) { + inDocument = false; + } + else if (inDocument && qName.equals("sentences")) { + inSentences = false; + articleList + .add(new AnnotatedGigawordArticle(resource, currentDocId, docText.toString())); + docText = new StringBuilder(); + } + else if (inSentences && qName.equals("token")) { + inToken = false; + while (docText.length() < currentOffsetBegin) { + docText.append(" "); + } + docText.append(currentWord); + } + else if (inToken && qName.equals("word")) { + inWord = false; + } + else if (inToken && qName.equals("CharacterOffsetBegin")) { + inOffsetBegin = false; + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (inWord) { + currentWord = new String(ch, start, length); + } + if (inOffsetBegin) { + currentOffsetBegin = Integer.parseInt(new String(ch, start, length).trim()); + } + + } + + public List getArticleList() { + return articleList; + } +} diff --git a/dkpro-core-io-gigaword-asl/src/test/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReaderTest.java b/dkpro-core-io-gigaword-asl/src/test/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReaderTest.java new file mode 100644 index 0000000000..bee2c3236f --- /dev/null +++ b/dkpro-core-io-gigaword-asl/src/test/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReaderTest.java @@ -0,0 +1,65 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.gigaword; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class AnnotatedGigawordReaderTest +{ + @Test + public void collectArticlesFromAnnotatedGigaword() + throws Exception + { + CollectionReader reader = createReader(AnnotatedGigawordReader.class, + AnnotatedGigawordReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts/*.txt"); + + JCas jcas = JCasFactory.createJCas(); + + List ids = new ArrayList<>(); + List texts = new ArrayList<>(); + + while (reader.hasNext()) { + jcas.reset(); + reader.getNext(jcas.getCas()); + + DocumentMetaData dmd = DocumentMetaData.get(jcas); + ids.add(dmd.getDocumentUri().substring(dmd.getDocumentBaseUri().length())); + texts.add(jcas.getDocumentText()); + } + + assertThat(ids) + .containsExactly( + "gigaword_test_1.txt#Test1", "gigaword_test_1.txt#Test2", + "gigaword_test_1.txt#Test3", "gigaword_test_2.txt#Test1", + "gigaword_test_2.txt#Test2", "gigaword_test_2.txt#Test3"); + + assertThat(texts) + .allMatch(t -> t.contains("days left in the year")); + } +} diff --git a/dkpro-core-io-gigaword-asl/src/test/resources/texts/gigaword_test_1.txt b/dkpro-core-io-gigaword-asl/src/test/resources/texts/gigaword_test_1.txt new file mode 100644 index 0000000000..47879ab3ad --- /dev/null +++ b/dkpro-core-io-gigaword-asl/src/test/resources/texts/gigaword_test_1.txt @@ -0,0 +1,1667 @@ + + + +( (NP (NP (NN Today)) (PP (IN In) (NP (NN History))) (: -) (NP (NNP April) (CD 1)))) + + +( (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Sunday)) (, ,) (NP (NNP April) (CD 1))) (, ,) (NP (NP (DT the) (JJ 91st) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) +( (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 274) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) +( (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Fool) (POS 's)) (NN Day))) (. .))) + + + + + + Today + today + 0 + 5 + NN + O + + + is + be + 6 + 8 + VBZ + O + + + Sunday + Sunday + 9 + 15 + NNP + DATE + ****0401 + + + , + , + 16 + 17 + , + DATE + ****0401 + + + April + April + 18 + 23 + NNP + DATE + ****0401 + + + 1 + 1 + 24 + 25 + CD + DATE + ****0401 + + + , + , + 26 + 27 + , + O + + + the + the + 28 + 31 + DT + O + + + 91st + 91st + 32 + 36 + JJ + DATE + 2001 + + + day + day + 37 + 40 + NN + DATE + 2001 + + + of + of + 41 + 43 + IN + DATE + 2001 + + + 2001 + 2001 + 44 + 48 + CD + DATE + 2001 + + + . + . + 49 + 50 + . + O + + + (ROOT (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Sunday)) (, ,) (NP (NNP April) (CD 1))) (, ,) (NP (NP (DT the) (JJ 91st) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 11 + + + 11 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + + + There + there + 51 + 56 + EX + O + + + are + be + 57 + 60 + VBP + O + + + 274 + 274 + 61 + 64 + CD + NUMBER + 274.0 + + + days + day + 65 + 69 + NNS + NUMBER + 274.0 + + + left + leave + 70 + 74 + VBN + O + + + in + in + 75 + 77 + IN + O + + + the + the + 78 + 81 + DT + O + + + year + year + 82 + 86 + NN + O + + + . + . + 87 + 88 + . + O + + + (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 274) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 5 + 6 + + + 8 + 7 + + + 6 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + + + This + this + 89 + 93 + DT + O + + + is + be + 94 + 96 + VBZ + O + + + April + April + 97 + 102 + NNP + DATE + ****04 + + + Fool + Fool + 103 + 107 + NNP + DATE + ****04 + + + 's + 's + 108 + 110 + POS + DATE + ****04 + + + Day + day + 111 + 114 + NN + DATE + ****04 + + + . + . + 115 + 116 + . + O + + + (ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Fool) (POS 's)) (NN Day))) (. .))) + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 4 + 5 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + + + +( (NP (NP (NN Today)) (PP (IN In) (NP (NN History))) (: -) (NP (NNP April) (CD 2)))) + + +( (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Monday)) (, ,) (NP (NNP April) (CD 2))) (, ,) (NP (NP (DT the) (JJ 92nd) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) +( (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 273) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) +( (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Tool) (POS 's)) (NN Day))) (. .))) + + + + + + Today + today + 0 + 5 + NN + O + + + is + be + 6 + 8 + VBZ + O + + + Monday + Monday + 9 + 15 + NNP + DATE + ****0401 + + + , + , + 16 + 17 + , + DATE + ****0401 + + + April + April + 18 + 23 + NNP + DATE + ****0401 + + + 2 + 2 + 24 + 25 + CD + DATE + ****0401 + + + , + , + 26 + 27 + , + O + + + the + the + 28 + 31 + DT + O + + + 92nd + 92nd + 32 + 36 + JJ + DATE + 2001 + + + day + day + 37 + 40 + NN + DATE + 2001 + + + of + of + 41 + 43 + IN + DATE + 2001 + + + 2001 + 2001 + 44 + 48 + CD + DATE + 2001 + + + . + . + 49 + 50 + . + O + + + (ROOT (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Monday)) (, ,) (NP (NNP April) (CD 2))) (, ,) (NP (NP (DT the) (JJ 92nd) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 11 + + + 11 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + + + There + there + 51 + 56 + EX + O + + + are + be + 57 + 60 + VBP + O + + + 273 + 273 + 61 + 64 + CD + NUMBER + 273.0 + + + days + day + 65 + 69 + NNS + NUMBER + 274.0 + + + left + leave + 70 + 74 + VBN + O + + + in + in + 75 + 77 + IN + O + + + the + the + 78 + 81 + DT + O + + + year + year + 82 + 86 + NN + O + + + . + . + 87 + 88 + . + O + + + (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 273) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 5 + 6 + + + 8 + 7 + + + 6 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + + + This + this + 89 + 93 + DT + O + + + is + be + 94 + 96 + VBZ + O + + + April + April + 97 + 102 + NNP + DATE + ****04 + + + Tool + Tool + 103 + 107 + NNP + DATE + ****04 + + + 's + 's + 108 + 110 + POS + DATE + ****04 + + + Day + day + 111 + 114 + NN + DATE + ****04 + + + . + . + 115 + 116 + . + O + + + (ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Tool) (POS 's)) (NN Day))) (. .))) + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 4 + 5 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + + + +( (NP (NP (NN Today)) (PP (IN In) (NP (NN History))) (: -) (NP (NNP April) (CD 3)))) + + +( (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Tuesday)) (, ,) (NP (NNP April) (CD 3))) (, ,) (NP (NP (DT the) (JJ 93rd) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) +( (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 272) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) +( (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Pool) (POS 's)) (NN Day))) (. .))) + + + + + + Today + today + 0 + 5 + NN + O + + + is + be + 6 + 8 + VBZ + O + + + Tuesday + Tuesday + 9 + 15 + NNP + DATE + ****0401 + + + , + , + 16 + 17 + , + DATE + ****0401 + + + April + April + 18 + 23 + NNP + DATE + ****0401 + + + 3 + 3 + 24 + 25 + CD + DATE + ****0401 + + + , + , + 26 + 27 + , + O + + + the + the + 28 + 31 + DT + O + + + 93rd + 93rd + 32 + 36 + JJ + DATE + 2001 + + + day + day + 37 + 40 + NN + DATE + 2001 + + + of + of + 41 + 43 + IN + DATE + 2001 + + + 2001 + 2001 + 44 + 48 + CD + DATE + 2001 + + + . + . + 49 + 50 + . + O + + + (ROOT (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Tuesday)) (, ,) (NP (NNP April) (CD 3))) (, ,) (NP (NP (DT the) (JJ 93rd) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 11 + + + 11 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + + + There + there + 51 + 56 + EX + O + + + are + be + 57 + 60 + VBP + O + + + 272 + 272 + 61 + 64 + CD + NUMBER + 272.0 + + + days + day + 65 + 69 + NNS + NUMBER + 274.0 + + + left + leave + 70 + 74 + VBN + O + + + in + in + 75 + 77 + IN + O + + + the + the + 78 + 81 + DT + O + + + year + year + 82 + 86 + NN + O + + + . + . + 87 + 88 + . + O + + + (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 272) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 5 + 6 + + + 8 + 7 + + + 6 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + + + This + this + 89 + 93 + DT + O + + + is + be + 94 + 96 + VBZ + O + + + April + April + 97 + 102 + NNP + DATE + ****04 + + + Pool + Pool + 103 + 107 + NNP + DATE + ****04 + + + 's + 's + 108 + 110 + POS + DATE + ****04 + + + Day + day + 111 + 114 + NN + DATE + ****04 + + + . + . + 115 + 116 + . + O + + + (ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Pool) (POS 's)) (NN Day))) (. .))) + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 4 + 5 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + + diff --git a/dkpro-core-io-gigaword-asl/src/test/resources/texts/gigaword_test_2.txt b/dkpro-core-io-gigaword-asl/src/test/resources/texts/gigaword_test_2.txt new file mode 100644 index 0000000000..b2c3589030 --- /dev/null +++ b/dkpro-core-io-gigaword-asl/src/test/resources/texts/gigaword_test_2.txt @@ -0,0 +1,1667 @@ + + + +( (NP (NP (NN Today)) (PP (IN In) (NP (NN History))) (: -) (NP (NNP April) (CD 1)))) + + +( (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Sunday)) (, ,) (NP (NNP April) (CD 1))) (, ,) (NP (NP (DT the) (JJ 91st) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) +( (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 274) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) +( (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Fool) (POS 's)) (NN Day))) (. .))) + + + + + + Today + today + 0 + 5 + NN + O + + + is + be + 6 + 8 + VBZ + O + + + Sunday + Sunday + 9 + 15 + NNP + DATE + ****0401 + + + , + , + 16 + 17 + , + DATE + ****0401 + + + April + April + 18 + 23 + NNP + DATE + ****0401 + + + 1 + 1 + 24 + 25 + CD + DATE + ****0401 + + + , + , + 26 + 27 + , + O + + + the + the + 28 + 31 + DT + O + + + 91st + 91st + 32 + 36 + JJ + DATE + 2001 + + + day + day + 37 + 40 + NN + DATE + 2001 + + + of + of + 41 + 43 + IN + DATE + 2001 + + + 2001 + 2001 + 44 + 48 + CD + DATE + 2001 + + + . + . + 49 + 50 + . + O + + + (ROOT (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Sunday)) (, ,) (NP (NNP April) (CD 1))) (, ,) (NP (NP (DT the) (JJ 91st) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 11 + + + 11 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + + + There + there + 51 + 56 + EX + O + + + are + be + 57 + 60 + VBP + O + + + 274 + 274 + 61 + 64 + CD + NUMBER + 274.0 + + + days + day + 65 + 69 + NNS + NUMBER + 274.0 + + + left + leave + 70 + 74 + VBN + O + + + in + in + 75 + 77 + IN + O + + + the + the + 78 + 81 + DT + O + + + year + year + 82 + 86 + NN + O + + + . + . + 87 + 88 + . + O + + + (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 274) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 5 + 6 + + + 8 + 7 + + + 6 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + + + This + this + 89 + 93 + DT + O + + + is + be + 94 + 96 + VBZ + O + + + April + April + 97 + 102 + NNP + DATE + ****04 + + + Fool + Fool + 103 + 107 + NNP + DATE + ****04 + + + 's + 's + 108 + 110 + POS + DATE + ****04 + + + Day + day + 111 + 114 + NN + DATE + ****04 + + + . + . + 115 + 116 + . + O + + + (ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Fool) (POS 's)) (NN Day))) (. .))) + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 4 + 5 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + + + +( (NP (NP (NN Today)) (PP (IN In) (NP (NN History))) (: -) (NP (NNP April) (CD 2)))) + + +( (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Monday)) (, ,) (NP (NNP April) (CD 2))) (, ,) (NP (NP (DT the) (JJ 92nd) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) +( (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 273) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) +( (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Tool) (POS 's)) (NN Day))) (. .))) + + + + + + Today + today + 0 + 5 + NN + O + + + is + be + 6 + 8 + VBZ + O + + + Monday + Monday + 9 + 15 + NNP + DATE + ****0401 + + + , + , + 16 + 17 + , + DATE + ****0401 + + + April + April + 18 + 23 + NNP + DATE + ****0401 + + + 2 + 2 + 24 + 25 + CD + DATE + ****0401 + + + , + , + 26 + 27 + , + O + + + the + the + 28 + 31 + DT + O + + + 92nd + 92nd + 32 + 36 + JJ + DATE + 2001 + + + day + day + 37 + 40 + NN + DATE + 2001 + + + of + of + 41 + 43 + IN + DATE + 2001 + + + 2001 + 2001 + 44 + 48 + CD + DATE + 2001 + + + . + . + 49 + 50 + . + O + + + (ROOT (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Monday)) (, ,) (NP (NNP April) (CD 2))) (, ,) (NP (NP (DT the) (JJ 92nd) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 11 + + + 11 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + + + There + there + 51 + 56 + EX + O + + + are + be + 57 + 60 + VBP + O + + + 273 + 273 + 61 + 64 + CD + NUMBER + 273.0 + + + days + day + 65 + 69 + NNS + NUMBER + 274.0 + + + left + leave + 70 + 74 + VBN + O + + + in + in + 75 + 77 + IN + O + + + the + the + 78 + 81 + DT + O + + + year + year + 82 + 86 + NN + O + + + . + . + 87 + 88 + . + O + + + (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 273) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 5 + 6 + + + 8 + 7 + + + 6 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + + + This + this + 89 + 93 + DT + O + + + is + be + 94 + 96 + VBZ + O + + + April + April + 97 + 102 + NNP + DATE + ****04 + + + Tool + Tool + 103 + 107 + NNP + DATE + ****04 + + + 's + 's + 108 + 110 + POS + DATE + ****04 + + + Day + day + 111 + 114 + NN + DATE + ****04 + + + . + . + 115 + 116 + . + O + + + (ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Tool) (POS 's)) (NN Day))) (. .))) + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 4 + 5 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + + + +( (NP (NP (NN Today)) (PP (IN In) (NP (NN History))) (: -) (NP (NNP April) (CD 3)))) + + +( (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Tuesday)) (, ,) (NP (NNP April) (CD 3))) (, ,) (NP (NP (DT the) (JJ 93rd) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) +( (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 272) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) +( (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Pool) (POS 's)) (NN Day))) (. .))) + + + + + + Today + today + 0 + 5 + NN + O + + + is + be + 6 + 8 + VBZ + O + + + Tuesday + Tuesday + 9 + 15 + NNP + DATE + ****0401 + + + , + , + 16 + 17 + , + DATE + ****0401 + + + April + April + 18 + 23 + NNP + DATE + ****0401 + + + 3 + 3 + 24 + 25 + CD + DATE + ****0401 + + + , + , + 26 + 27 + , + O + + + the + the + 28 + 31 + DT + O + + + 93rd + 93rd + 32 + 36 + JJ + DATE + 2001 + + + day + day + 37 + 40 + NN + DATE + 2001 + + + of + of + 41 + 43 + IN + DATE + 2001 + + + 2001 + 2001 + 44 + 48 + CD + DATE + 2001 + + + . + . + 49 + 50 + . + O + + + (ROOT (S (NP (NN Today)) (VP (VBZ is) (NP (NP (NP (NNP Tuesday)) (, ,) (NP (NNP April) (CD 3))) (, ,) (NP (NP (DT the) (JJ 93rd) (NN day)) (PP (IN of) (NP (CD 2001)))))) (. .))) + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 11 + + + 11 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + 3 + 1 + + + 3 + 2 + + + 0 + 3 + + + 3 + 5 + + + 5 + 6 + + + 10 + 8 + + + 10 + 9 + + + 3 + 10 + + + 10 + 12 + + + + + + + There + there + 51 + 56 + EX + O + + + are + be + 57 + 60 + VBP + O + + + 272 + 272 + 61 + 64 + CD + NUMBER + 272.0 + + + days + day + 65 + 69 + NNS + NUMBER + 274.0 + + + left + leave + 70 + 74 + VBN + O + + + in + in + 75 + 77 + IN + O + + + the + the + 78 + 81 + DT + O + + + year + year + 82 + 86 + NN + O + + + . + . + 87 + 88 + . + O + + + (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NP (CD 272) (NNS days)) (VP (VBN left) (PP (IN in) (NP (DT the) (NN year)))))) (. .))) + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 5 + 6 + + + 8 + 7 + + + 6 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + 2 + 1 + + + 0 + 2 + + + 4 + 3 + + + 2 + 4 + + + 4 + 5 + + + 8 + 7 + + + 5 + 8 + + + + + + + This + this + 89 + 93 + DT + O + + + is + be + 94 + 96 + VBZ + O + + + April + April + 97 + 102 + NNP + DATE + ****04 + + + Pool + Pool + 103 + 107 + NNP + DATE + ****04 + + + 's + 's + 108 + 110 + POS + DATE + ****04 + + + Day + day + 111 + 114 + NN + DATE + ****04 + + + . + . + 115 + 116 + . + O + + + (ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (NNP April) (NNP Pool) (POS 's)) (NN Day))) (. .))) + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 4 + 5 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + 6 + 1 + + + 6 + 2 + + + 4 + 3 + + + 6 + 4 + + + 0 + 6 + + + + + + diff --git a/dkpro-core-io-graf-asl/pom.xml b/dkpro-core-io-graf-asl/pom.xml deleted file mode 100644 index ef76551d3b..0000000000 --- a/dkpro-core-io-graf-asl/pom.xml +++ /dev/null @@ -1,138 +0,0 @@ - - - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.io.graf-asl - jar - DKPro Core ASL - IO - Graph Annotation Format (GrAF) - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimafit-core - - - commons-io - commons-io - - - org.xces.graf.uima - UimaUtils - 3.0.0 - - - org.tc37sc4.graf - graf-api - 1.2.3 - - - org.tc37sc4.graf - graf-io - 1.2.3 - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl - - - junit - junit - test - - - xmlunit - xmlunit - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-en-maxent - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-en-maxent - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent - test - - - - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT - pom - import - - - - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-en-maxent - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-en-maxent - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent - - - - xml-apis:xml-apis - - - - - - - \ No newline at end of file diff --git a/dkpro-core-io-graf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafReader.java b/dkpro-core-io-graf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafReader.java deleted file mode 100644 index 4e26f4eba3..0000000000 --- a/dkpro-core-io-graf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafReader.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.graf; - -import java.io.File; -import java.io.IOException; - -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.cas.impl.CASCompleteSerializer; -import org.apache.uima.cas.impl.CASImpl; -import org.apache.uima.cas.impl.Serialization; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.factory.FsIndexFactory; -import org.apache.uima.fit.factory.TypePrioritiesFactory; -import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.metadata.FsIndexCollection; -import org.apache.uima.resource.metadata.TypePriorities; -import org.apache.uima.resource.metadata.TypeSystemDescription; -import org.xces.graf.api.GrafException; -import org.xces.graf.api.IGraph; -import org.xces.graf.io.GrafParser; -import org.xces.graf.io.dom.ResourceHeader; -import org.xces.graf.uima.CASFactory; -import org.xml.sax.SAXException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; - -/** - * ISO GrAF reader. - */ -@ResourceMetaData(name="ISO GrAF Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_X_GRAF_XML}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) -public class GrafReader - extends ResourceCollectionReaderBase -{ - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - try { - Resource res = nextFile(); - initCas(aCAS, res); - - // FIXME: This is just because we need to get the header from somewhere right now in the - // unit test. Eventually, we'd probably look for it relative to the file or via a - // parameter. - ResourceHeader header = new ResourceHeader(new File("target/header.xml")); - GrafParser parser = new GrafParser(header); - IGraph graph = parser.parse(new File(res.getResolvedUri())); - - // Find the configurations for the CAS to pass to the CASFactory - TypeSystemDescription tsd = TypeSystemDescriptionFactory.createTypeSystemDescription(); - TypePriorities tp = TypePrioritiesFactory.createTypePriorities(); - FsIndexCollection idx = FsIndexFactory.createFsIndexCollection(); - - // Read the file - CASFactory casFactory = new CASFactory(); - CAS newCas = casFactory.createCas(graph, tsd, tp, idx.getFsIndexes(), null); - - // Copy contents over to the CAS that was passed to the reader to fill in - // Would be nice if CASFactory allowed to read data into an existing CAS. - CASCompleteSerializer ser = Serialization.serializeCASComplete((CASImpl) newCas); - Serialization.deserializeCASComplete(ser, (CASImpl) aCAS); - } - catch (ResourceInitializationException | SAXException | CASException | GrafException e) { - throw new IOException(e); - } - } -} diff --git a/dkpro-core-io-graf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafWriter.java b/dkpro-core-io-graf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafWriter.java deleted file mode 100644 index 687d345ad9..0000000000 --- a/dkpro-core-io-graf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafWriter.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.graf; - -import java.io.OutputStream; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.xces.graf.api.IGraph; -import org.xces.graf.io.GrafRenderer; -import org.xces.graf.io.IRenderer; -import org.xces.graf.uima.GraphFactory; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; - -/** - * ISO GrAF writer. - */ -@ResourceMetaData(name="ISO GrAF Writer") -@MimeTypeCapability({MimeTypes.APPLICATION_X_GRAF_XML}) -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) -public class GrafWriter -extends JCasFileWriter_ImplBase -{ - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - IRenderer renderer = null; - try (OutputStream docOS = getOutputStream(aJCas, ".xml");) { - // Convert CAS - GraphFactory grafFactory = new GraphFactory(); - IGraph graph = grafFactory.createGraph(aJCas.getCas()); - - // Write CAS - renderer = new GrafRenderer(docOS); - renderer.render(graph); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - finally { - if (renderer != null) { - renderer.close(); - } - } - } -} diff --git a/dkpro-core-io-graf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafReaderWriterTest.java b/dkpro-core-io-graf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafReaderWriterTest.java deleted file mode 100644 index 8488687aef..0000000000 --- a/dkpro-core-io-graf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafReaderWriterTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.graf; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; - -import java.io.File; - -import org.apache.commons.io.FileUtils; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class GrafReaderWriterTest -{ - @Ignore("Doesn't work yet...") - @Test - public void test() - throws Exception - { - String header = "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - ""; - - FileUtils.writeStringToFile(new File("target/header.xml"), header); - - testRoundTrip(GrafReader.class, GrafWriter.class, "reference/example1.txt.xml"); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-graf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafWriterTest.java b/dkpro-core-io-graf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafWriterTest.java deleted file mode 100644 index 3f236b786f..0000000000 --- a/dkpro-core-io-graf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/graf/GrafWriterTest.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.graf; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; -import static org.junit.Assert.assertTrue; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.custommonkey.xmlunit.Diff; -import org.custommonkey.xmlunit.ElementNameAndAttributeQualifier; -import org.custommonkey.xmlunit.XMLAssert; -import org.junit.Rule; -import org.junit.Test; -import org.xml.sax.InputSource; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.DocumentMetaDataStripper; - -public class GrafWriterTest -{ - @Test - public void test() throws Exception - { - write(); - //read(); - } - - public void write() throws Exception - { - File targetFolder = testContext.getTestOutputFolder(); - - CollectionReaderDescription textReader = createReaderDescription( - TextReader.class, - TextReader.PARAM_LANGUAGE, "en", - ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts", - ResourceCollectionReaderBase.PARAM_PATTERNS, new String [] { - ResourceCollectionReaderBase.INCLUDE_PREFIX+"*.txt" - }); - - AnalysisEngineDescription segmenter = createEngineDescription( - OpenNlpSegmenter.class); - - AnalysisEngineDescription posTagger = createEngineDescription( - OpenNlpPosTagger.class); - - AnalysisEngineDescription stripper = createEngineDescription( - DocumentMetaDataStripper.class); - - AnalysisEngineDescription grafWriter = createEngineDescription( - GrafWriter.class, - GrafWriter.PARAM_TARGET_LOCATION, targetFolder); - - runPipeline(textReader, segmenter, posTagger, stripper, grafWriter); - - File output = new File(targetFolder, "example1.txt.xml"); - assertTrue(output.exists()); - - Diff myDiff = new Diff( - new InputSource("src/test/resources/reference/example1.txt.xml"), - new InputSource(output.getPath())); - myDiff.overrideElementQualifier(new ElementNameAndAttributeQualifier()); - XMLAssert.assertXMLEqual(myDiff, true); - } - -// public void read() throws Exception -// { -// CollectionReader xmiReader = CollectionReaderFactory.createReader( -// XmiReader.class, -// ResourceCollectionReaderBase.PARAM_PATH, testFolder.getRoot().getPath(), -// ResourceCollectionReaderBase.PARAM_PATTERNS, new String [] { -// ResourceCollectionReaderBase.INCLUDE_PREFIX+"*.xmi" -// }); -// -// CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null); -// xmiReader.getNext(cas); -// -// String refText = readFileToString(new File("src/test/resources/texts/example1.txt")); -// assertEquals(refText, cas.getDocumentText()); -// assertEquals("latin", cas.getDocumentLanguage()); -// } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-graf-asl/src/test/resources/META-INF/org.apache.uima.fit/typepriorities.txt b/dkpro-core-io-graf-asl/src/test/resources/META-INF/org.apache.uima.fit/typepriorities.txt deleted file mode 100644 index d472ed560b..0000000000 --- a/dkpro-core-io-graf-asl/src/test/resources/META-INF/org.apache.uima.fit/typepriorities.txt +++ /dev/null @@ -1 +0,0 @@ -classpath*:desc/typepriorities/priorities.xml diff --git a/dkpro-core-io-graf-asl/src/test/resources/desc/typepriorities/priorities.xml b/dkpro-core-io-graf-asl/src/test/resources/desc/typepriorities/priorities.xml deleted file mode 100644 index e9d7198ff1..0000000000 --- a/dkpro-core-io-graf-asl/src/test/resources/desc/typepriorities/priorities.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - AutoImportableTypePriorities - - 1.0 - - - - de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData - de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription - de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription - de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence - de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token - de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS - - - diff --git a/dkpro-core-io-graf-asl/src/test/resources/log4j.properties b/dkpro-core-io-graf-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-graf-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-graf-asl/src/test/resources/reference/example1.txt.xml b/dkpro-core-io-graf-asl/src/test/resources/reference/example1.txt.xml deleted file mode 100644 index 4103a098a5..0000000000 --- a/dkpro-core-io-graf-asl/src/test/resources/reference/example1.txt.xml +++ /dev/null @@ -1,339 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dkpro-core-io-graf-asl/src/test/resources/texts/example1.txt b/dkpro-core-io-graf-asl/src/test/resources/texts/example1.txt deleted file mode 100644 index 45d75b6c2f..0000000000 --- a/dkpro-core-io-graf-asl/src/test/resources/texts/example1.txt +++ /dev/null @@ -1 +0,0 @@ -Where is Cornelia? Suddenly Marcus calls out: "There is Cornelia, she stands over there!" \ No newline at end of file diff --git a/dkpro-core-io-html-asl/pom.xml b/dkpro-core-io-html-asl/pom.xml index 9932c24992..dc62520034 100644 --- a/dkpro-core-io-html-asl/pom.xml +++ b/dkpro-core-io-html-asl/pom.xml @@ -18,14 +18,18 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.html-asl + dkpro-core-io-html-asl jar - DKPro Core ASL - IO - HTML + DKPro Core ASL - IO - HTML (jsoup ${jsoup.version}) + https://dkpro.github.io/dkpro-core/ + + 1.14.3 + org.apache.uima @@ -42,31 +46,39 @@ org.jsoup jsoup - 1.10.2 + ${jsoup.version} com.ibm.icu icu4j - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl + + + org.dkpro.core + dkpro-core-api-xml-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -74,8 +86,28 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.assertj + assertj-core + test + + + xmlunit + xmlunit + test + + + org.dkpro.core + dkpro-core-testing-asl + test + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + org.dkpro.core + dkpro-core-io-xml-asl test diff --git a/dkpro-core-io-html-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/html/HtmlReader.java b/dkpro-core-io-html-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/html/HtmlReader.java deleted file mode 100644 index fc2aa1ae25..0000000000 --- a/dkpro-core-io-html-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/html/HtmlReader.java +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.html; - -import static de.tudarmstadt.ukp.dkpro.core.io.html.internal.JSoupUtil.appendNormalisedText; -import static de.tudarmstadt.ukp.dkpro.core.io.html.internal.JSoupUtil.lastCharIsWhitespace; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayDeque; -import java.util.Deque; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.io.IOUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; -import org.jsoup.select.NodeTraversor; -import org.jsoup.select.NodeVisitor; - -import com.ibm.icu.text.CharsetDetector; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; - -/** - * Reads the contents of a given URL and strips the HTML. Returns the textual contents. Also - * recognizes headings and paragraphs. - */ -@ResourceMetaData(name="HTML Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_XHTML, MimeTypes.TEXT_HTML}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph" }) -public class HtmlReader - extends JCasResourceCollectionReader_ImplBase -{ - /** - * Automatically detect encoding. - * - * @see CharsetDetector - */ - public static final String ENCODING_AUTO = "auto"; - - /** - * Name of configuration parameter that contains the character encoding used by the input files. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String sourceEncoding; - - private Map mappings = new HashMap<>(); - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - mappings.put("h1", Heading.type); - mappings.put("h2", Heading.type); - mappings.put("h3", Heading.type); - mappings.put("h4", Heading.type); - mappings.put("h5", Heading.type); - mappings.put("h6", Heading.type); - mappings.put("p", Paragraph.type); - } - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - Resource res = nextFile(); - initCas(aJCas, res); - - CAS cas = aJCas.getCas(); - - String html; - try (InputStream is = new BufferedInputStream( - CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) { - - if (ENCODING_AUTO.equals(sourceEncoding)) { - CharsetDetector detector = new CharsetDetector(); - html = IOUtils.toString(detector.getReader(is, null)); - } - else { - html = IOUtils.toString(is, sourceEncoding); - } - } - - Document doc = Jsoup.parse(html); - - StringBuilder builder = new StringBuilder(); - Deque events = new ArrayDeque<>(); - - NodeTraversor traversor = new NodeTraversor(new NodeVisitor() - { - @Override - public void head(Node node, int depth) - { - if (node instanceof TextNode) { - TextNode textNode = (TextNode) node; - appendNormalisedText(builder, textNode); - } - else if (node instanceof Element) { - Element element = (Element) node; - if (builder.length() > 0 - && (element.isBlock() || element.nodeName().equals("br")) - && !lastCharIsWhitespace(builder)) { - builder.append(" "); - } - - // Build a stack of the open elements, recording their start offsets - // and whether we created annotations for them or not. - events.push(new Event(node, builder.length())); - } - } - - @Override - public void tail(Node node, int depth) - { - if (node instanceof TextNode) { - // Nothing to do - } - else if (node instanceof Element) { - Event event = events.pop(); - Integer type = mappings.get(node.nodeName()); - if (type != null) { - int[] span = { event.begin, builder.length() }; - trim(builder, span); - Div div = (Div) cas.createAnnotation(aJCas.getCasType(type), span[0], - span[1]); - div.setDivType(node.nodeName()); - div.addToIndexes(); - } - } - } - }); - - traversor.traverse(doc); - - aJCas.setDocumentText(builder.toString()); - } - - /** - * Remove trailing or leading whitespace from the annotation. - * @param aText the text. - * @param aSpan the offsets. - */ - private static void trim(StringBuilder aText, int[] aSpan) - { - int begin = aSpan[0]; - int end = aSpan[1]-1; - - while ( - (begin < (aText.length()-1)) - && trimChar(aText.charAt(begin)) - ) { - begin ++; - } - while ( - (end > 0) - && trimChar(aText.charAt(end)) - ) { - end --; - } - - end++; - - aSpan[0] = begin; - aSpan[1] = end; - } - - private static boolean trimChar(final char aChar) - { - switch (aChar) { - case '\n': return true; // Line break - case '\r': return true; // Carriage return - case '\t': return true; // Tab - case '\u200E': return true; // LEFT-TO-RIGHT MARK - case '\u200F': return true; // RIGHT-TO-LEFT MARK - case '\u2028': return true; // LINE SEPARATOR - case '\u2029': return true; // PARAGRAPH SEPARATOR - default: - return Character.isWhitespace(aChar); - } - } - - private static class Event - { - int begin; - - public Event(Node aNode, int aBegin) - { - super(); - begin = aBegin; - } - } -} diff --git a/dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/HtmlDocumentReader.java b/dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/HtmlDocumentReader.java new file mode 100644 index 0000000000..d399c3b1ea --- /dev/null +++ b/dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/HtmlDocumentReader.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.html; + +import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils.trim; +import static org.dkpro.core.io.html.internal.JSoupUtil.appendNormalisedText; +import static org.dkpro.core.io.html.internal.JSoupUtil.lastCharIsWhitespace; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.xml.CasXmlHandler; +import org.dkpro.core.api.xml.type.XmlElement; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeTraversor; +import org.jsoup.select.NodeVisitor; +import org.xml.sax.helpers.AttributesImpl; + +import com.ibm.icu.text.CharsetDetector; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reads the contents of a given URL and strips the HTML. Returns the textual contents. Also + * recognizes headings and paragraphs. + */ +@ResourceMetaData(name = "HTML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_XHTML, MimeTypes.TEXT_HTML}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", + "org.dkpro.core.api.xml.type.XmlAttribute", + "org.dkpro.core.api.xml.type.XmlDocument", + "org.dkpro.core.api.xml.type.XmlElement", + "org.dkpro.core.api.xml.type.XmlNode", + "org.dkpro.core.api.xml.type.XmlTextNode" }) +public class HtmlDocumentReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * Automatically detect encoding. + * + * @see CharsetDetector + */ + public static final String ENCODING_AUTO = "auto"; + + /** + * Name of configuration parameter that contains the character encoding used by the input files. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + /** + * Normalize whitespace. + */ + public static final String PARAM_NORMALIZE_WHITESPACE = "normalizeWhitespace"; + @ConfigurationParameter(name = PARAM_NORMALIZE_WHITESPACE, mandatory = true, defaultValue = "true") + private boolean normalizeWhitespace; + + private Map mappings = new HashMap<>(); + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + mappings.put("h1", Heading.type); + mappings.put("h2", Heading.type); + mappings.put("h3", Heading.type); + mappings.put("h4", Heading.type); + mappings.put("h5", Heading.type); + mappings.put("h6", Heading.type); + mappings.put("p", Paragraph.type); + } + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + CAS cas = aJCas.getCas(); + + String html; + try (InputStream is = new BufferedInputStream( + CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) { + + if (ENCODING_AUTO.equals(sourceEncoding)) { + CharsetDetector detector = new CharsetDetector(); + html = IOUtils.toString(detector.getReader(is, null)); + } + else { + html = IOUtils.toString(is, sourceEncoding); + } + } + + Document doc = Jsoup.parse(html); + + CasXmlHandler handler = new CasXmlHandler(aJCas); + + NodeVisitor visitor = new NodeVisitor() + { + @Override + public void head(Node node, int depth) + { + try { + if (node instanceof Document) { + handler.startDocument(); + handler.captureText(false); + } + else if (node instanceof TextNode) { + TextNode textNode = (TextNode) node; + StringBuilder buffer = new StringBuilder(); + if (normalizeWhitespace) { + appendNormalisedText(buffer, textNode); + } + else { + buffer.append(textNode.getWholeText()); + } + char[] text = buffer.toString().toCharArray(); + handler.characters(text, 0, text.length); + } + else if (node instanceof Element) { + Element element = (Element) node; + if ( + handler.getText().length() > 0 && + (element.isBlock() || element.nodeName().equals("br")) && + !lastCharIsWhitespace(handler.getText()) + ) { + char[] text = " ".toCharArray(); + handler.characters(text, 0, text.length); + } + + AttributesImpl attributes = new AttributesImpl(); + + if (element.attributes() != null) { + for (Attribute attr : element.attributes()) { + attributes.addAttribute("", "", attr.getKey(), "CDATA", + attr.getValue()); + } + } + + if ("body".equals(element.tagName())) { + handler.captureText(true); + } + + handler.startElement("", "", element.tagName(), attributes); + } + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void tail(Node node, int depth) + { + try { + if (node instanceof Document) { + handler.endDocument(); + } + else if (node instanceof Element) { + Element element = (Element) node; + + // Fetch the current element + XmlElement elementFS = handler.getCurrentElement(); + + // Close the current element so that it gets its end offset + handler.endElement("", "", element.tagName()); + + if ("body".equals(element.tagName())) { + handler.captureText(false); + } + + Integer type = mappings.get(node.nodeName()); + if (type != null) { + int[] span = { elementFS.getBegin(), elementFS.getEnd() }; + trim(handler.getText(), span); + Div div = (Div) cas.createAnnotation(aJCas.getCasType(type), span[0], + span[1]); + div.setDivType(node.nodeName()); + div.addToIndexes(); + } + } + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + }; + + NodeTraversor.traverse(visitor, doc); + } +} diff --git a/dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/HtmlReader.java b/dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/HtmlReader.java new file mode 100644 index 0000000000..995dee0dd6 --- /dev/null +++ b/dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/HtmlReader.java @@ -0,0 +1,194 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.html; + +import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils.trim; +import static org.dkpro.core.io.html.internal.JSoupUtil.appendNormalisedText; +import static org.dkpro.core.io.html.internal.JSoupUtil.lastCharIsWhitespace; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeTraversor; +import org.jsoup.select.NodeVisitor; + +import com.ibm.icu.text.CharsetDetector; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reads the contents of a given URL and strips the HTML. Returns the textual contents. Also + * recognizes headings and paragraphs. + */ +@ResourceMetaData(name = "HTML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_XHTML, MimeTypes.TEXT_HTML}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph" }) +public class HtmlReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * Automatically detect encoding. + * + * @see CharsetDetector + */ + public static final String ENCODING_AUTO = "auto"; + + /** + * Name of configuration parameter that contains the character encoding used by the input files. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + private Map mappings = new HashMap<>(); + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + mappings.put("h1", Heading.type); + mappings.put("h2", Heading.type); + mappings.put("h3", Heading.type); + mappings.put("h4", Heading.type); + mappings.put("h5", Heading.type); + mappings.put("h6", Heading.type); + mappings.put("p", Paragraph.type); + } + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + CAS cas = aJCas.getCas(); + + String html; + try (InputStream is = new BufferedInputStream( + CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) { + + if (ENCODING_AUTO.equals(sourceEncoding)) { + CharsetDetector detector = new CharsetDetector(); + html = IOUtils.toString(detector.getReader(is, null)); + } + else { + html = IOUtils.toString(is, sourceEncoding); + } + } + + Document doc = Jsoup.parse(html); + + StringBuilder builder = new StringBuilder(); + Deque events = new ArrayDeque<>(); + + NodeVisitor visitor = new NodeVisitor() + { + @Override + public void head(Node node, int depth) + { + if (node instanceof TextNode) { + TextNode textNode = (TextNode) node; + appendNormalisedText(builder, textNode); + } + else if (node instanceof Element) { + Element element = (Element) node; + if (builder.length() > 0 + && (element.isBlock() || element.nodeName().equals("br")) + && !lastCharIsWhitespace(builder)) { + builder.append(" "); + } + + // Build a stack of the open elements, recording their start offsets + // and whether we created annotations for them or not. + events.push(new Event(node, builder.length())); + } + } + + @Override + public void tail(Node node, int depth) + { + if (node instanceof TextNode) { + // Nothing to do + } + else if (node instanceof Element) { + Event event = events.pop(); + Integer type = mappings.get(node.nodeName()); + if (type != null) { + int[] span = { event.begin, builder.length() }; + trim(builder, span); + Div div = (Div) cas.createAnnotation(aJCas.getCasType(type), span[0], + span[1]); + div.setDivType(node.nodeName()); + div.addToIndexes(); + } + } + } + }; + + NodeTraversor.traverse(visitor, doc); + + aJCas.setDocumentText(builder.toString()); + } + + private static class Event + { + int begin; + + public Event(Node aNode, int aBegin) + { + super(); + begin = aBegin; + } + } +} diff --git a/dkpro-core-io-html-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/html/internal/JSoupUtil.java b/dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/internal/JSoupUtil.java similarity index 80% rename from dkpro-core-io-html-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/html/internal/JSoupUtil.java rename to dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/internal/JSoupUtil.java index 660866266e..e7a55435c3 100644 --- a/dkpro-core-io-html-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/html/internal/JSoupUtil.java +++ b/dkpro-core-io-html-asl/src/main/java/org/dkpro/core/io/html/internal/JSoupUtil.java @@ -20,9 +20,10 @@ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package de.tudarmstadt.ukp.dkpro.core.io.html.internal; +package org.dkpro.core.io.html.internal; + +import static org.jsoup.internal.StringUtil.appendNormalisedWhitespace; -import org.jsoup.helper.StringUtil; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; @@ -35,33 +36,37 @@ public final class JSoupUtil /* * org.jsoup.nodes.TextNode.lastCharIsWhitespace(StringBuilder) */ - public static boolean lastCharIsWhitespace(StringBuilder sb) { + public static boolean lastCharIsWhitespace(CharSequence sb) + { return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; } - + /* * org.jsoup.nodes.Element.appendNormalisedText(StringBuilder, TextNode) */ - public static void appendNormalisedText(StringBuilder accum, TextNode textNode) { + public static void appendNormalisedText(StringBuilder accum, TextNode textNode) + { String text = textNode.getWholeText(); if (preserveWhitespace(textNode.parentNode())) { accum.append(text); } else { - StringUtil.appendNormalisedWhitespace(accum, text, lastCharIsWhitespace(accum)); + appendNormalisedWhitespace(accum, text, lastCharIsWhitespace(accum)); } } - + /* * org.jsoup.nodes.Element.preserveWhitespace(Node) */ - public static boolean preserveWhitespace(Node node) { - // looks only at this element and one level up, to prevent recursion & needless stack searches + public static boolean preserveWhitespace(Node node) + { + // looks only at this element and one level up, to prevent recursion & needless stack + // searches if (node != null && node instanceof Element) { Element element = (Element) node; - return element.tag().preserveWhitespace() || - element.parent() != null && element.parent().tag().preserveWhitespace(); + return element.tag().preserveWhitespace() + || element.parent() != null && element.parent().tag().preserveWhitespace(); } return false; } diff --git a/dkpro-core-io-html-asl/src/test/java/org/dkpro/core/io/html/HtmlDocumentReaderTest.java b/dkpro-core-io-html-asl/src/test/java/org/dkpro/core/io/html/HtmlDocumentReaderTest.java new file mode 100644 index 0000000000..87650b6b3e --- /dev/null +++ b/dkpro-core-io-html-asl/src/test/java/org/dkpro/core/io/html/HtmlDocumentReaderTest.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.html; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.assertj.core.api.Assertions.assertThat; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; + +import java.io.File; +import java.io.IOException; + +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.custommonkey.xmlunit.XMLAssert; +import org.dkpro.core.io.xmi.XmiWriter; +import org.dkpro.core.io.xml.XmlDocumentWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestOptions; +import org.junit.Rule; +import org.junit.Test; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; + +public class HtmlDocumentReaderTest +{ + @Test + public void testReadFileWithOnlyBody() + throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + CollectionReader reader = createReader(HtmlDocumentReader.class, + HtmlDocumentReader.PARAM_SOURCE_LOCATION, "src/test/resources/html/test.html", + HtmlDocumentReader.PARAM_LANGUAGE, "en"); + + reader.getNext(jcas.getCas()); + + assertThat(jcas.getDocumentText()) + .isEqualTo(" Heading This is the first paragraph. This is the second paragraph. "); + + assertThat(select(jcas, Heading.class)) + .extracting(Heading::getCoveredText) + .containsExactly("Heading"); + + assertThat(select(jcas, Paragraph.class)) + .extracting(Paragraph::getCoveredText) + .containsExactly("This is the first paragraph.", "This is the second paragraph."); + } + + @Test + public void testReadFileWithOnlyBodyAndWriteAsXml() + throws Exception + { + testOneWay( + createReaderDescription(HtmlDocumentReader.class, + HtmlDocumentReader.PARAM_LANGUAGE, "en", + HtmlDocumentReader.PARAM_NORMALIZE_WHITESPACE, false), + createEngineDescription(XmlDocumentWriter.class), + "html/test-document.xml", + "html/test.html", + new TestOptions().resultAssertor(this::assertXmlEquals)); + } + + @Test + public void testReadFileWithOnlyBodyAndWriteAsXmi() + throws Exception + { + testOneWay( + createReaderDescription(HtmlDocumentReader.class, + HtmlDocumentReader.PARAM_LANGUAGE, "en"), + createEngineDescription(XmiWriter.class), + "html/test-document.xmi", + "html/test.html"); + } + + @Test + public void testReadFileWithHead() + throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + CollectionReader reader = createReader(HtmlDocumentReader.class, + HtmlDocumentReader.PARAM_SOURCE_LOCATION, "src/test/resources/html/test-with-head.html", + HtmlDocumentReader.PARAM_LANGUAGE, "en"); + + reader.getNext(jcas.getCas()); + + assertThat(jcas.getDocumentText()) + .isEqualTo(" Heading This is the first paragraph. This is the second paragraph. "); + + assertThat(select(jcas, Heading.class)) + .extracting(Heading::getCoveredText) + .containsExactly("Heading"); + + assertThat(select(jcas, Paragraph.class)) + .extracting(Paragraph::getCoveredText) + .containsExactly("This is the first paragraph.", "This is the second paragraph."); + } + + @Test + public void testReadFileWithHeadAndWriteAsXmi() + throws Exception + { + testOneWay( + createReaderDescription(HtmlDocumentReader.class, + HtmlDocumentReader.PARAM_LANGUAGE, "en"), + createEngineDescription(XmiWriter.class), + "html/test-with-head.xmi", + "html/test-with-head.html"); + } + + private void assertXmlEquals(File expected, File actual) + { + try { + XMLAssert.assertXMLEqual( + new InputSource(expected.getPath()), + new InputSource(actual.getPath())); + } + catch (SAXException | IOException e) { + throw new RuntimeException(e); + } + } + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-html-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/html/HtmlReaderTest.java b/dkpro-core-io-html-asl/src/test/java/org/dkpro/core/io/html/HtmlReaderTest.java similarity index 80% rename from dkpro-core-io-html-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/html/HtmlReaderTest.java rename to dkpro-core-io-html-asl/src/test/java/org/dkpro/core/io/html/HtmlReaderTest.java index 1106a3ccdc..c171237ad0 100644 --- a/dkpro-core-io-html-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/html/HtmlReaderTest.java +++ b/dkpro-core-io-html-asl/src/test/java/org/dkpro/core/io/html/HtmlReaderTest.java @@ -1,86 +1,87 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.html; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.CasUtil.select; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.net.URL; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.dumper.CasDumpWriter; - -public class HtmlReaderTest -{ - @Test - public void wwwReaderTest() - throws Exception - { - File targetDir = testContext.getTestOutputFolder(); - - CollectionReaderDescription reader = createReaderDescription( - HtmlReader.class, - HtmlReader.PARAM_SOURCE_LOCATION, new URL("http://www.google.de") - ); - - AnalysisEngineDescription dumpWriter = createEngineDescription(CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, new File(targetDir, "google.html.dump")); - - for (JCas jcas : new JCasIterable(reader, dumpWriter)) { - dumpMetaData(DocumentMetaData.get(jcas)); - assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); - - assertTrue(jcas.getDocumentText().startsWith("Google")); - } - } - - @Test - public void testReadFile() - throws Exception - { - testOneWay( - createReaderDescription(HtmlReader.class, - HtmlReader.PARAM_LANGUAGE, "en"), - "html/test.html.dump", - "html/test.html"); - } - - private void dumpMetaData(final DocumentMetaData aMetaData) - { - System.out.println("Collection ID: "+aMetaData.getCollectionId()); - System.out.println("ID : "+aMetaData.getDocumentId()); - System.out.println("Base URI : "+aMetaData.getDocumentBaseUri()); - System.out.println("URI : "+aMetaData.getDocumentUri()); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} \ No newline at end of file +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.html; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.CasUtil.select; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.net.URL; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.html.HtmlReader; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.dumper.CasDumpWriter; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class HtmlReaderTest +{ + @Test + public void wwwReaderTest() + throws Exception + { + File targetDir = testContext.getTestOutputFolder(); + + CollectionReaderDescription reader = createReaderDescription( + HtmlReader.class, + HtmlReader.PARAM_SOURCE_LOCATION, new URL("http://www.google.de") + ); + + AnalysisEngineDescription dumpWriter = createEngineDescription(CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, new File(targetDir, "google.html.dump")); + + for (JCas jcas : new JCasIterable(reader, dumpWriter)) { + dumpMetaData(DocumentMetaData.get(jcas)); + assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); + + assertTrue(jcas.getDocumentText().startsWith("Google")); + } + } + + @Test + public void testReadFile() + throws Exception + { + testOneWay( + createReaderDescription(HtmlReader.class, + HtmlReader.PARAM_LANGUAGE, "en"), + "html/test.html.dump", + "html/test.html"); + } + + private void dumpMetaData(final DocumentMetaData aMetaData) + { + System.out.println("Collection ID: " + aMetaData.getCollectionId()); + System.out.println("ID : " + aMetaData.getDocumentId()); + System.out.println("Base URI : " + aMetaData.getDocumentBaseUri()); + System.out.println("URI : " + aMetaData.getDocumentUri()); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-html-asl/src/test/resources/html/test-document.xmi b/dkpro-core-io-html-asl/src/test/resources/html/test-document.xmi new file mode 100644 index 0000000000..587c8c404c --- /dev/null +++ b/dkpro-core-io-html-asl/src/test/resources/html/test-document.xmi @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dkpro-core-io-html-asl/src/test/resources/html/test-document.xml b/dkpro-core-io-html-asl/src/test/resources/html/test-document.xml new file mode 100644 index 0000000000..9e98a0aee0 --- /dev/null +++ b/dkpro-core-io-html-asl/src/test/resources/html/test-document.xml @@ -0,0 +1,11 @@ + + +

Heading

+

+ This is the first paragraph. +

+

+ This is the second paragraph. +

+ + \ No newline at end of file diff --git a/dkpro-core-io-html-asl/src/test/resources/html/test-with-head.html b/dkpro-core-io-html-asl/src/test/resources/html/test-with-head.html new file mode 100644 index 0000000000..22ed39c647 --- /dev/null +++ b/dkpro-core-io-html-asl/src/test/resources/html/test-with-head.html @@ -0,0 +1,14 @@ + + + Page Title + + +

Heading

+

+ This is the first paragraph. +

+

+ This is the second paragraph. +

+ + \ No newline at end of file diff --git a/dkpro-core-io-html-asl/src/test/resources/html/test-with-head.xmi b/dkpro-core-io-html-asl/src/test/resources/html/test-with-head.xmi new file mode 100644 index 0000000000..b91dc59cc0 --- /dev/null +++ b/dkpro-core-io-html-asl/src/test/resources/html/test-with-head.xmi @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dkpro-core-io-html-asl/src/test/resources/log4j2.xml b/dkpro-core-io-html-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-html-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-imscwb-asl/pom.xml b/dkpro-core-io-imscwb-asl/pom.xml index 8a33d6eb21..0dace9c010 100644 --- a/dkpro-core-io-imscwb-asl/pom.xml +++ b/dkpro-core-io-imscwb-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.imscwb-asl + dkpro-core-io-imscwb-asl jar DKPro Core ASL - IO - IMS Corpus Workbench + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,43 +45,42 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl - test + eu.openminted.share.annotations + omtd-share-annotations-api - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.negra-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.bnc-asl + org.dkpro.core + dkpro-core-io-negra-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl junit @@ -88,13 +88,18 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.snowball-asl + org.assertj + assertj-core test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl + org.dkpro.core + dkpro-core-snowball-asl + test + + + org.dkpro.core + dkpro-core-opennlp-asl test @@ -106,9 +111,9 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT pom import diff --git a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReader.java b/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReader.java deleted file mode 100644 index b400acd861..0000000000 --- a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReader.java +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright 2011 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universit√§t Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.imscwb; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.internal.util.XMLUtils; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.imscwb.util.CorpusSentence; -import de.tudarmstadt.ukp.dkpro.core.io.imscwb.util.CorpusText; -import de.tudarmstadt.ukp.dkpro.core.io.imscwb.util.TextIterable; - -/** - * Reads a tab-separated format including pseudo-XML tags. - */ -@ResourceMetaData(name="IMS CWB Reader") -@MimeTypeCapability({MimeTypes.TEXT_X_IMSCWB}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) -public class ImsCwbReader - extends ResourceCollectionReaderBase -{ - /** - * Character encoding of the output. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name=PARAM_SOURCE_ENCODING, mandatory=true, defaultValue="UTF-8") - private String encoding; - - /** - * Location of the mapping file for part-of-speech tags to UIMA types. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String mappingPosLocation; - - /** - * Specify which tag set should be used to locate the mapping file. - */ - public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; - @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) - protected String posTagset; - - /** - * Read tokens and generate {@link Token} annotations. - * - * Default: {@code true} - */ - public static final String PARAM_READ_TOKEN = ComponentParameters.PARAM_READ_TOKEN; - @ConfigurationParameter(name = PARAM_READ_TOKEN, mandatory = true, defaultValue = "true") - private boolean readTokens; - - /** - * Read part-of-speech tags and generate {@link POS} annotations or subclasses if a - * {@link #PARAM_POS_TAG_SET tag set} or {@link #PARAM_POS_MAPPING_LOCATION mapping file} is - * used. - * - * Default: {@code true} - */ - public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; - @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") - private boolean readPos; - - /** - * Read sentences. - * - * Default: {@code true} - */ - public static final String PARAM_READ_SENTENCES = ComponentParameters.PARAM_READ_SENTENCE; - @ConfigurationParameter(name = PARAM_READ_SENTENCES, mandatory = true, defaultValue = "true") - private boolean readSentences; - - /** - * Read lemmas. - * - * Default: {@code true} - */ - public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; - @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") - private boolean readLemmas; - - /** - * If true, the unit IDs are used only to detect if a new document (CAS) needs to be created, - * but for the purpose of setting the document ID, a new ID is generated. (Default: false) - */ - public static final String PARAM_GENERATE_NEW_IDS = "generateNewIds"; - @ConfigurationParameter(name = PARAM_GENERATE_NEW_IDS, mandatory = true, defaultValue = "false") - private boolean generateNewIds; - - /** - * If true, the unit text ID encoded in the corpus file is stored as the URI in the document - * meta data. This setting has is not affected by {@link #PARAM_GENERATE_NEW_IDS} - * (Default: false) - */ - public static final String PARAM_ID_IS_URL = "idIsUrl"; - @ConfigurationParameter(name = PARAM_ID_IS_URL, mandatory = true, defaultValue = "false") - private boolean idIsUrl; - - /** - * Replace non-XML characters with spaces. - * (Default: true) - */ - public static final String PARAM_REPLACE_NON_XML = "replaceNonXml"; - @ConfigurationParameter(name = PARAM_REPLACE_NON_XML, mandatory = true, defaultValue = "true") - private boolean replaceNonXml; - - private Type tokenType; - private Type lemmaType; - private Type sentenceType; - - private TextIterable wackyIterator; - - private int completed; - - private MappingProvider posMappingProvider; - - private int documentCount; - private int qualifier; - private Resource lastResource; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - wackyIterator = new TextIterable(getResources(), encoding); - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation, - posTagset, getLanguage()); - - documentCount = 0; - qualifier = 0; - lastResource = null; - } - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - return wackyIterator.hasNext(); - } - - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - Resource res = wackyIterator.getCurrentResource(); - CorpusText text = wackyIterator.next(); - - // Reset counter when a new file is read. - if (!res.equals(lastResource)) { - qualifier = 0; - lastResource = res; - } - - String documentId; - if (generateNewIds) { - documentId = String.valueOf(documentCount); - } - else { - documentId = text.getDocumentTitle(); - } - - initCas(aCAS, res, String.valueOf(qualifier)); - DocumentMetaData meta = DocumentMetaData.get(aCAS); - meta.setDocumentTitle(text.getDocumentTitle()); - meta.setDocumentId(documentId); - - if (idIsUrl) { - meta.setDocumentBaseUri(null); - meta.setDocumentUri(text.getDocumentTitle()); - } - - try { - posMappingProvider.configure(aCAS); - } - catch (AnalysisEngineProcessException e) { - throw new IOException(e); - } - - List tokenAnnotations = new ArrayList(); - List lemmaAnnotations = new ArrayList(); - List posAnnotations = new ArrayList(); - List sentenceAnnotations = new ArrayList(); - - TypeSystem typeSystem = aCAS.getTypeSystem(); - tokenType = typeSystem.getType(Token.class.getName()); - lemmaType = typeSystem.getType(Lemma.class.getName()); - sentenceType = typeSystem.getType(Sentence.class.getName()); - - StringBuilder sb = new StringBuilder(); - int offset = 0; - - for (CorpusSentence sentence : text.getSentences()) { - int savedOffset = offset; - for (int i=0; i 0) { - bw.write(" "); - for (int i = 0; i < aAttributes.length; i += 2) { - bw.write(aAttributes[i]); - bw.write("=\""); - bw.write(escapeXml(aAttributes[i + 1])); - bw.write('"'); - } - } - bw.write('>'); - bw.write(LS); - } - - private void endElement(String aElement) - throws IOException - { - bw.write("'); - bw.write(LS); - - } - - private void field(String aValue) - throws IOException - { - bw.write(TAB); - bw.write(escapeXml(aValue)); - } - - private Writer getWriter() - throws IOException - { - if (cqpHome != null) { - dataDirectory = new File(outputFile, "data"); - registryDirectory = new File(outputFile, "registry"); - forceMkdir(dataDirectory); - forceMkdir(registryDirectory); - - List cmd = new ArrayList(); - cmd.add(new File(cqpHome, "cwb-encode").getAbsolutePath()); - - cmd.add("-c"); - cmd.add(getCwbCharset(encoding)); - // -x XML-aware (replace XML entities and ignore directory for data files created by ./cwb-encode - cmd.add("-d"); - cmd.add(dataDirectory.getPath()); - // -R create registry entry (named ) listing all encoded attributes - cmd.add("-R"); - cmd.add(new File(registryDirectory, corpusName).getPath()); - - // -P declare additional p-attribute - if (writePOS) { - cmd.add("-P"); - cmd.add(ATTR_POS); - } - - if (writeCPOS) { - cmd.add("-P"); - cmd.add(ATTR_CPOS); - } - - if (writeLemma) { - cmd.add("-P"); - cmd.add(ATTR_LEMMA); - } - - if (writeDocId) { - cmd.add("-P"); - cmd.add(ATTR_URI); - } - - if (writeOffsets) { - cmd.add("-P"); - cmd.add(ATTR_BEGIN); - cmd.add("-P"); - cmd.add(ATTR_END); - } - - if (additionalFeatures != null) { - for (String featurePath : additionalFeatures) { - String[] segments = featurePath.split("/", 2); - if (segments.length != 2) { - throw new IllegalArgumentException("Given feature path is malformed: [" - + featurePath + "] (exactly one \"/\" (slash) must exist)."); - } - String typeName = segments[0]; - String featureName = segments.length > 1 ? segments[1] : ""; - String name = (substringAfterLast(typeName, ".") + "_" + featureName) - .toLowerCase(); - cmd.add("-P"); - cmd.add(name); - } - } - - if (writeDocumentTag) { - cmd.add("-S"); - cmd.add(E_DOCUMENT + ":0+" + ATTR_URI); - } - - if (writeTextTag) { - cmd.add("-S"); - cmd.add(E_TEXT + ":0+" + ATTR_ID); - } - - { - cmd.add("-S"); - cmd.add(sentenceTag + ":0"); - } - - getLogger().info("Spawning cwb-encode: " + join(cmd, " ")); - - final ProcessBuilder pb = new ProcessBuilder(); - pb.command(cmd); - childProcess = pb.start(); - return new OutputStreamWriter(childProcess.getOutputStream(), encoding); - } - else { - return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), - encoding)); - } - } - - private void attendChildProceess() - { - if (childProcess != null) { - try { - InputStream stdout = childProcess.getInputStream(); - if (stdout.available() > 0) { - byte[] data = new byte[stdout.available()]; - stdout.read(data); - getLogger().info(new String(data, "UTF-8")); - } - InputStream stderr = childProcess.getErrorStream(); - if (stderr.available() > 0) { - byte[] data = new byte[stderr.available()]; - stderr.read(data); - getLogger().error(new String(data, "UTF-8")); - } - } - catch (IOException e) { - getLogger().error("Unable to communicate with child process"); - } - } - } - - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - IOUtils.closeQuietly(bw); - if (childProcess != null) { - try { - childProcess.waitFor(); - attendChildProceess(); - childProcess = null; - } - catch (InterruptedException e) { - throw new AnalysisEngineProcessException(e); - } - - runCwbCommand("cwb-makeall", "-r", registryDirectory.getPath(), "-V", - corpusName.toUpperCase()); - - if (cqpCompress) { - // Compress the token sequence of a positional attribute. Creates .huf, .hcd, - // and .huf.syn files, which replace the corresponding .corpus files. After - // running this tool successfully, the .corpus files can be deleted. - runCwbCommand("cwb-huffcode", "-r", registryDirectory.getPath(), "-A", - corpusName.toUpperCase()); - for (File f : listFiles(dataDirectory, new String[] { "huf" }, false)) { - deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus")); - } - - // Compress the index of a positional attribute. Creates .crc and .crx files - // which replace the corresponding .corpus.rev and .corpus.rdx files. After - // running this tool successfully, the latter files can be deleted. - runCwbCommand("cwb-compress-rdx", "-r", registryDirectory.getPath(), "-A", - corpusName.toUpperCase()); - for (File f : listFiles(dataDirectory, new String[] { "crc" }, false)) { - deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus.rev")); - deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus.rdx")); - } - } - } - } - - private void runCwbCommand(String aCommand, String... aArguments) - throws AnalysisEngineProcessException - { - try { - List args = new ArrayList(aArguments.length + 1); - args.add(new File(cqpHome, aCommand).getAbsolutePath()); - for (String arg : aArguments) { - args.add(arg); - } - - ProcessBuilder pb = new ProcessBuilder(args); - getLogger().info("Spawning " + aCommand + ": " + join(args, " ")); - childProcess = pb.start(); - childProcess.waitFor(); - } - catch (InterruptedException e) { - throw new AnalysisEngineProcessException(e); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - finally { - attendChildProceess(); - childProcess = null; - } - } - - private static Map CHARSET_MAPPING = new HashMap(); - static { - CHARSET_MAPPING.put("ISO-8859-1", "latin1"); - CHARSET_MAPPING.put("UTF-8", "utf8"); - } - - private static String getCwbCharset(String aEncoding) - { - String enc = CHARSET_MAPPING.get(aEncoding); - if (enc == null) { - throw new IllegalArgumentException("Encoding [" + enc + "] not supported by CWB."); - } - return enc; - } - - private static String escapeXml(String aString) - { - return aString.replaceAll("&", "&").replaceAll("<", "<").replaceAll(">", ">") - .replaceAll("\"", """).replaceAll("'", "'"); - } - - /** - * Get the feature value of an annotation which is covered by another annotation. - * - * @param aFeaturePath - * The fully qualified feature path of the feature in question: - * your.package.and.annotation.class.name/featureName - * @param aCoveringAnnotation - * The annotation that covers the annotation for which the feature value should be - * extracted. - * @return the feature value if a feature name is given; coveredText if only the annotation is - * given - */ - public String getCoveredAnnotationFeatureValue(String aFeaturePath, - AnnotationFS aCoveringAnnotation) - { - String[] segments = aFeaturePath.split("/", 2); - if (segments.length != 2) { - throw new IllegalArgumentException("Given feature path is malformed: [" + aFeaturePath - + "] (exactly one \"/\" (slash) must exist)."); - } - String typeName = segments[0]; - String featureName = segments[1]; - Type type = CasUtil.getAnnotationType(aCoveringAnnotation.getCAS(), typeName); - Feature feature = type.getFeatureByBaseName(featureName); - if (feature == null) { - throw new IllegalArgumentException("Feature [" + featureName - + "] is not defined for type [" + type + "] (check lower/uppercase spelling)."); - } - - List covered = CasUtil.selectCovered(type, aCoveringAnnotation); - switch (covered.size()) { - case 0: - if (getLogger().isWarnEnabled()) { - getLogger().warn( - "There is no annotation of type [" + typeName - + "] available which is covered by [" + aCoveringAnnotation - + "], returning empty string."); - } - return ""; - case 1: - return covered.get(0).getFeatureValueAsString(feature); - default: - if (getLogger().isWarnEnabled()) { - getLogger().warn( - "There are multiple annotations of type [" + typeName - + "] available which are covered by [" + aCoveringAnnotation - + "], returning the first."); - } - return covered.get(0).getFeatureValueAsString(feature); - } - } -} diff --git a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/package-info.java b/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/package-info.java deleted file mode 100644 index 591e3b5f3d..0000000000 --- a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2010-2011 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for
IMS Corpus Workbench tab-separated format - * also used by WaCky. - * - * @since 1.2.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.imscwb; diff --git a/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/ImsCwbReader.java b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/ImsCwbReader.java new file mode 100644 index 0000000000..edaca4b055 --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/ImsCwbReader.java @@ -0,0 +1,345 @@ +/* + * Copyright 2011 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universit√§t Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.imscwb; + +import static org.dkpro.core.api.parameter.ComponentParameters.DEFAULT_ENCODING; +import static org.dkpro.core.api.parameter.ComponentParameters.DEFAULT_MAPPING_ENABLED; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.internal.util.XMLUtils; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.imscwb.util.CorpusSentence; +import org.dkpro.core.io.imscwb.util.CorpusText; +import org.dkpro.core.io.imscwb.util.TextIterable; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reads a tab-separated format including pseudo-XML tags. + */ +@ResourceMetaData(name = "IMS CWB Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_X_IMSCWB}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) +public class ImsCwbReader + extends ResourceCollectionReaderBase +{ + /** + * Character encoding of the output. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = DEFAULT_ENCODING) + private String encoding; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String mappingPosLocation; + + /** + * Specify which tag set should be used to locate the mapping file. + */ + public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; + @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) + protected String posTagset; + + /** + * Read tokens and generate {@link Token} annotations. + */ + public static final String PARAM_READ_TOKEN = ComponentParameters.PARAM_READ_TOKEN; + @ConfigurationParameter(name = PARAM_READ_TOKEN, mandatory = true, defaultValue = "true") + private boolean readTokens; + + /** + * Read part-of-speech tags and generate {@link POS} annotations or subclasses if a + * {@link #PARAM_POS_TAG_SET tag set} or {@link #PARAM_POS_MAPPING_LOCATION mapping file} is + * used. + */ + public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; + @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") + private boolean readPos; + + /** + * Read sentences. + */ + public static final String PARAM_READ_SENTENCES = ComponentParameters.PARAM_READ_SENTENCE; + @ConfigurationParameter(name = PARAM_READ_SENTENCES, mandatory = true, defaultValue = "true") + private boolean readSentences; + + /** + * Read lemmas. + */ + public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; + @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") + private boolean readLemmas; + + /** + * If true, the unit IDs are used only to detect if a new document (CAS) needs to be created, + * but for the purpose of setting the document ID, a new ID is generated. + */ + public static final String PARAM_GENERATE_NEW_IDS = "generateNewIds"; + @ConfigurationParameter(name = PARAM_GENERATE_NEW_IDS, mandatory = true, defaultValue = "false") + private boolean generateNewIds; + + /** + * If true, the unit text ID encoded in the corpus file is stored as the URI in the document + * meta data. This setting has is not affected by {@link #PARAM_GENERATE_NEW_IDS} + */ + public static final String PARAM_ID_IS_URL = "idIsUrl"; + @ConfigurationParameter(name = PARAM_ID_IS_URL, mandatory = true, defaultValue = "false") + private boolean idIsUrl; + + /** + * Replace non-XML characters with spaces. + */ + public static final String PARAM_REPLACE_NON_XML = "replaceNonXml"; + @ConfigurationParameter(name = PARAM_REPLACE_NON_XML, mandatory = true, defaultValue = "true") + private boolean replaceNonXml; + + private Type tokenType; + private Type lemmaType; + private Type sentenceType; + + private TextIterable wackyIterator; + + private int completed; + + private MappingProvider posMappingProvider; + + private int documentCount; + private int qualifier; + private Resource lastResource; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + wackyIterator = new TextIterable(getResources(), encoding); + + posMappingProvider = createPosMappingProvider(this, mappingPosLocation, posTagset, + getLanguage()); + + documentCount = 0; + qualifier = 0; + lastResource = null; + } + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + return wackyIterator.hasNext(); + } + + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + Resource res = wackyIterator.getCurrentResource(); + CorpusText text = wackyIterator.next(); + + // Reset counter when a new file is read. + if (!res.equals(lastResource)) { + qualifier = 0; + lastResource = res; + } + + String documentId; + if (generateNewIds) { + documentId = String.valueOf(documentCount); + } + else { + documentId = text.getDocumentTitle(); + } + + initCas(aCAS, res, String.valueOf(qualifier)); + DocumentMetaData meta = DocumentMetaData.get(aCAS); + meta.setDocumentTitle(text.getDocumentTitle()); + meta.setDocumentId(documentId); + + if (idIsUrl) { + meta.setDocumentBaseUri(null); + meta.setDocumentUri(text.getDocumentTitle()); + } + + try { + posMappingProvider.configure(aCAS); + } + catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } + + List tokenAnnotations = new ArrayList(); + List lemmaAnnotations = new ArrayList(); + List posAnnotations = new ArrayList(); + List sentenceAnnotations = new ArrayList(); + + TypeSystem typeSystem = aCAS.getTypeSystem(); + tokenType = typeSystem.getType(Token.class.getName()); + lemmaType = typeSystem.getType(Lemma.class.getName()); + sentenceType = typeSystem.getType(Sentence.class.getName()); + + StringBuilder sb = new StringBuilder(); + int offset = 0; + + for (CorpusSentence sentence : text.getSentences()) { + int savedOffset = offset; + for (int i = 0; i < sentence.getTokens().size(); i++) { + String token = doReplaceNonXml(sentence.getTokens().get(i)); + String lemma = doReplaceNonXml(sentence.getLemmas().get(i)); + String pos = doReplaceNonXml(sentence.getPOS().get(i)); + int len = token.length(); + + if (readPos) { + Type posType = posMappingProvider.getTagType(pos); + AnnotationFS posAnno = aCAS.createAnnotation(posType, offset, offset + len); + posAnno.setStringValue(posType.getFeatureByBaseName("PosValue"), pos); + posAnnotations.add(posAnno); + } + + if (readLemmas) { + AnnotationFS lemmaAnno = aCAS.createAnnotation( + lemmaType, offset, offset + len); + lemmaAnno.setStringValue(lemmaType.getFeatureByBaseName("value"), lemma); + lemmaAnnotations.add(lemmaAnno); + } + + if (readTokens) { + AnnotationFS tokenAnno = aCAS.createAnnotation( + tokenType, offset, offset + len); + if (readPos) { + tokenAnno.setFeatureValue( + tokenType.getFeatureByBaseName("pos"), + posAnnotations.get(posAnnotations.size() - 1)); + } + if (readLemmas) { + tokenAnno.setFeatureValue( + tokenType.getFeatureByBaseName("lemma"), + lemmaAnnotations.get(lemmaAnnotations.size() - 1)); + } + tokenAnnotations.add(tokenAnno); + } + + sb.append(token); + sb.append(" "); + + // increase offset by size of token + 1 for the space + offset += len + 1; + } + + if (readSentences) { + AnnotationFS sentenceAnno = aCAS.createAnnotation( + sentenceType, savedOffset, offset); + sentenceAnnotations.add(sentenceAnno); + } + } + + String sText = sb.toString(); + + aCAS.setDocumentText(sText); + + // finally add the annotations to the CAS + for (AnnotationFS t : tokenAnnotations) { + aCAS.addFsToIndexes(t); + } + for (AnnotationFS l : lemmaAnnotations) { + aCAS.addFsToIndexes(l); + } + for (AnnotationFS p : posAnnotations) { + aCAS.addFsToIndexes(p); + } + for (AnnotationFS s : sentenceAnnotations) { + aCAS.addFsToIndexes(s); + } + + completed++; + documentCount++; + qualifier++; + } + + @Override + public Progress[] getProgress() + { + return new Progress[] { new ProgressImpl(completed, 0, "text") }; + } + + private String doReplaceNonXml(String aString) + { + if (!replaceNonXml) { + return aString; + } + + char[] buf = aString.toCharArray(); + int pos = XMLUtils.checkForNonXmlCharacters(buf, 0, buf.length, false); + + if (pos == -1) { + return aString; + } + + while (pos != -1) { + buf[pos] = ' '; + pos = XMLUtils.checkForNonXmlCharacters(buf, pos, buf.length - pos, false); + } + return String.valueOf(buf); + } +} diff --git a/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/ImsCwbWriter.java b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/ImsCwbWriter.java new file mode 100644 index 0000000000..953a21cf72 --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/ImsCwbWriter.java @@ -0,0 +1,643 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.imscwb; + +import static org.apache.commons.io.FileUtils.deleteQuietly; +import static org.apache.commons.io.FileUtils.forceMkdir; +import static org.apache.commons.io.FileUtils.listFiles; +import static org.apache.commons.io.FilenameUtils.removeExtension; +import static org.apache.commons.lang3.StringUtils.defaultString; +import static org.apache.commons.lang3.StringUtils.join; +import static org.apache.commons.lang3.StringUtils.substringAfterLast; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.api.parameter.ComponentParameters.DEFAULT_ENCODING; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.output.CloseShieldOutputStream; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; + +/** + * Writes in the IMS Open Corpus Workbench verticalized XML format. + *

+ * This writer produces a text file which needs to be converted to the binary IMS CWB index files + * using the command line tools that come with the CWB. + *

+ * It is possible to set the parameter {@link #PARAM_CQP_HOME} to directly create output in the + * native binary CQP format via the original CWB command line tools. + *

+ * When not configured to write directly to a CQP process, then the writer will produce one file per + * CAS. In order to write all data to the same file, use + * {@link JCasFileWriter_ImplBase#PARAM_SINGULAR_TARGET}. + */ +@ResourceMetaData(name = "IMS CWB Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + ImsCwbWriter.PARAM_TARGET_LOCATION }) +@MimeTypeCapability({MimeTypes.TEXT_X_IMSCWB}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) +public class ImsCwbWriter + extends JCasFileWriter_ImplBase +{ + public static final String E_SENTENCE = "s"; + public static final String E_TEXT = "text"; + public static final String E_DOCUMENT = "document"; + public static final String ATTR_BEGIN = "begin"; + public static final String ATTR_END = "end"; + public static final String ATTR_POS = "pos"; + public static final String ATTR_CPOS = "cpos"; + public static final String ATTR_LEMMA = "lemma"; + public static final String ATTR_ID = "id"; + public static final String ATTR_URI = "uri"; + + /** + * Specify the suffix of output files. Default value .vrt. If the suffix is not + * needed, provide an empty string as value. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".vrt") + private String filenameSuffix; + + /** + * Character encoding of the output data. + */ + public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = DEFAULT_ENCODING) + private String encoding; + + /** + * Write the document ID for each token. It is usually a better idea to generate a + * {@link #PARAM_WRITE_DOCUMENT_TAG document tag} or a {@link #PARAM_WRITE_TEXT_TAG text tag} + * which also contain the document ID that can be queried in CQP. + */ + public static final String PARAM_WRITE_DOC_ID = "writeDocId"; + @ConfigurationParameter(name = PARAM_WRITE_DOC_ID, mandatory = true, defaultValue = "false") + private boolean writeDocId; + + /** + * Write part-of-speech tags. + */ + public static final String PARAM_WRITE_POS = "writePOS"; + @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") + private boolean writePOS; + + /** + * Write coarse-grained part-of-speech tags. These are the simple names of the UIMA types used + * to represent the part-of-speech tag. + */ + public static final String PARAM_WRITE_CPOS = "writeCPOS"; + @ConfigurationParameter(name = PARAM_WRITE_CPOS, mandatory = true, defaultValue = "false") + private boolean writeCPOS; + + /** + * Write lemmata. + */ + public static final String PARAM_WRITE_LEMMA = "writeLemma"; + @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") + private boolean writeLemma; + + /** + * Write a pseudo-XML tag with the name {@code document} to mark the start and end of a + * document. + */ + public static final String PARAM_WRITE_DOCUMENT_TAG = "writeDocumentTag"; + @ConfigurationParameter(name = PARAM_WRITE_DOCUMENT_TAG, mandatory = true, defaultValue = "false") + private boolean writeDocumentTag; + + /** + * Write a pseudo-XML tag with the name {@code text} to mark the start and end of a document. + * This is used by CQPweb. + */ + public static final String PARAM_WRITE_TEXT_TAG = "writeTextTag"; + @ConfigurationParameter(name = PARAM_WRITE_TEXT_TAG, mandatory = true, defaultValue = "true") + private boolean writeTextTag; + + /** + * Write the start and end position of each token. + */ + public static final String PARAM_WRITE_OFFSETS = "writeOffsets"; + @ConfigurationParameter(name = PARAM_WRITE_OFFSETS, mandatory = true, defaultValue = "false") + private boolean writeOffsets; + + /** + * Write additional token-level annotation features. These have to be given as an array of fully + * qualified feature paths (fully.qualified.classname/featureName). The names for these + * annotations in CQP are their lowercase shortnames. + */ + public static final String PARAM_ADDITIONAL_FEATURES = "additionalFeatures"; + @ConfigurationParameter(name = PARAM_ADDITIONAL_FEATURES, mandatory = false) + private String[] additionalFeatures; + + /** + * Make document IDs compatible with CQPweb. CQPweb demands an id consisting of only letters, + * numbers and underscore. + */ + public static final String PARAM_CQPWEB_COMPATIBILITY = "cqpwebCompatibility"; + @ConfigurationParameter(name = PARAM_CQPWEB_COMPATIBILITY, mandatory = true, defaultValue = "false") + private boolean cqpwebCompatibility; + + /** + * Set this parameter to the directory containing the cwb-encode and cwb-makeall commands if you + * want the write to directly encode into the CQP binary format. + */ + public static final String PARAM_CQP_HOME = "cqpHome"; + @ConfigurationParameter(name = PARAM_CQP_HOME, mandatory = false) + private File cqpHome; + + /** + * Set this parameter to compress the token streams and the indexes using cwb-huffcode and + * cwb-compress-rdx. With modern hardware, this may actually slow down queries, so we turn it + * off by default. If you have large data sets, you best try yourself what works best for you. + * (default: false) + */ + public static final String PARAM_CQP_COMPRESS = "cqpCompress"; + @ConfigurationParameter(name = PARAM_CQP_COMPRESS, mandatory = true, defaultValue = "false") + private boolean cqpCompress; + + /** + * The name of the generated corpus. + */ + public static final String PARAM_CORPUS_NAME = "corpusName"; + @ConfigurationParameter(name = PARAM_CORPUS_NAME, mandatory = true, defaultValue = "corpus") + private String corpusName; + + /** + * The pseudo-XML tag used to mark sentence boundaries. + */ + public static final String PARAM_SENTENCE_TAG = "sentenceTag"; + @ConfigurationParameter(name = PARAM_SENTENCE_TAG, mandatory = true, defaultValue = E_SENTENCE) + private String sentenceTag; + + private static final String LS = "\n"; + private static final String TAB = "\t"; + private int currentId; + + private Process childProcess; + private File dataDirectory; + private File registryDirectory; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + currentId = 0; + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + String documentId = DocumentMetaData.get(jcas).getDocumentId(); + String documentUri = DocumentMetaData.get(jcas).getDocumentUri(); + + // CQPweb demands an id consisting of only letters, numbers and underscore + if (cqpwebCompatibility) { + // if the documentTag is written as well keep the id, else use the uri instead + if (writeDocumentTag) { + if (documentId == null || documentId.length() == 0) { + documentId = Integer.toString(currentId); + } + documentId = documentId.replaceAll("[^\\d\\w_]", "_"); + } + else { + if (documentUri == null || documentUri.length() == 0) { + documentUri = Integer.toString(currentId); + } + documentId = documentUri.replaceAll("[^\\d\\w_]", "_"); + } + } + + try (BufferedWriter out = new BufferedWriter( + new OutputStreamWriter(getOutputStream(jcas, filenameSuffix), encoding))) { + if (writeTextTag) { + startElement(out, E_TEXT, ATTR_ID, documentId); + } + if (writeDocumentTag) { + startElement(out, E_DOCUMENT, ATTR_URI, documentUri); + } + for (Sentence sentence : select(jcas, Sentence.class)) { + attendChildProceess(); + startElement(out, sentenceTag); + for (Token token : selectCovered(jcas, Token.class, sentence)) { + // write token + out.write(escapeXml(token.getCoveredText())); + + // write pos tag + if (writePOS) { + field(out, defaultString(token.getPosValue(), "-")); + } + + // write coarse grained pos tag + if (writeCPOS) { + field(out, + token.getPos() != null + ? defaultString(token.getPos().getCoarseValue(), "-") + : "-"); + } + + // write lemma + if (writeLemma) { + field(out, defaultString(token.getLemmaValue(), "-")); + } + + // write doc-id + if (writeDocId) { + field(out, documentId); + } + + // write offsets + if (writeOffsets) { + field(out, String.valueOf(token.getBegin())); + field(out, String.valueOf(token.getEnd())); + } + + // write additional tags + if (additionalFeatures != null) { + for (String featurePath : additionalFeatures) { + String val = getCoveredAnnotationFeatureValue(featurePath, token); + field(out, val); + } + } + + out.write(LS); + } + endElement(out, sentenceTag); + } + if (writeDocumentTag) { + endElement(out, E_DOCUMENT); + } + if (writeTextTag) { + endElement(out, E_TEXT); + } + + currentId++; + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + private void startElement(Writer aOut, String aElement, String... aAttributes) + throws IOException + { + aOut.write('<'); + aOut.write(aElement); + if (aAttributes != null && aAttributes.length > 0) { + aOut.write(" "); + for (int i = 0; i < aAttributes.length; i += 2) { + aOut.write(aAttributes[i]); + aOut.write("=\""); + aOut.write(escapeXml(aAttributes[i + 1])); + aOut.write('"'); + } + } + aOut.write('>'); + aOut.write(LS); + } + + private void endElement(Writer aOut, String aElement) + throws IOException + { + aOut.write("'); + aOut.write(LS); + + } + + private void field(Writer aOut, String aValue) + throws IOException + { + aOut.write(TAB); + aOut.write(escapeXml(aValue)); + } + + @Override + protected NamedOutputStream getOutputStream(JCas aJCas, String aExtension) throws IOException + { + // Write directly to CQP if asked to + if (cqpHome != null) { + // If CQP is not running yet, start it. + if (childProcess == null) { + startCqpProcess(); + } + return new NamedOutputStream(null, + new CloseShieldOutputStream(childProcess.getOutputStream())); + } + + return super.getOutputStream(aJCas, aExtension); + } + + private void startCqpProcess() throws IOException + { + dataDirectory = new File(getTargetLocation(), "data"); + registryDirectory = new File(getTargetLocation(), "registry"); + forceMkdir(dataDirectory); + forceMkdir(registryDirectory); + + List cmd = new ArrayList(); + cmd.add(new File(cqpHome, "cwb-encode").getAbsolutePath()); + + cmd.add("-c"); + cmd.add(getCwbCharset(encoding)); + // -x XML-aware (replace XML entities and ignore directory for data files created by ./cwb-encode + cmd.add("-d"); + cmd.add(dataDirectory.getPath()); + // -R create registry entry (named ) listing all encoded attributes + cmd.add("-R"); + cmd.add(new File(registryDirectory, corpusName).getPath()); + + // -P declare additional p-attribute + if (writePOS) { + cmd.add("-P"); + cmd.add(ATTR_POS); + } + + if (writeCPOS) { + cmd.add("-P"); + cmd.add(ATTR_CPOS); + } + + if (writeLemma) { + cmd.add("-P"); + cmd.add(ATTR_LEMMA); + } + + if (writeDocId) { + cmd.add("-P"); + cmd.add(ATTR_URI); + } + + if (writeOffsets) { + cmd.add("-P"); + cmd.add(ATTR_BEGIN); + cmd.add("-P"); + cmd.add(ATTR_END); + } + + if (additionalFeatures != null) { + for (String featurePath : additionalFeatures) { + String[] segments = featurePath.split("/", 2); + if (segments.length != 2) { + throw new IllegalArgumentException("Given feature path is malformed: [" + + featurePath + "] (exactly one \"/\" (slash) must exist)."); + } + String typeName = segments[0]; + String featureName = segments.length > 1 ? segments[1] : ""; + String name = (substringAfterLast(typeName, ".") + "_" + featureName) + .toLowerCase(); + cmd.add("-P"); + cmd.add(name); + } + } + + if (writeDocumentTag) { + cmd.add("-S"); + cmd.add(E_DOCUMENT + ":0+" + ATTR_URI); + } + + if (writeTextTag) { + cmd.add("-S"); + cmd.add(E_TEXT + ":0+" + ATTR_ID); + } + + { + cmd.add("-S"); + cmd.add(sentenceTag + ":0"); + } + + getLogger().info("Spawning cwb-encode: " + join(cmd, " ")); + + final ProcessBuilder pb = new ProcessBuilder(); + pb.command(cmd); + childProcess = pb.start(); + } + + private void attendChildProceess() + { + if (childProcess != null) { + try { + InputStream stdout = childProcess.getInputStream(); + if (stdout.available() > 0) { + byte[] data = new byte[stdout.available()]; + stdout.read(data); + getLogger().info(new String(data, "UTF-8")); + } + InputStream stderr = childProcess.getErrorStream(); + if (stderr.available() > 0) { + byte[] data = new byte[stderr.available()]; + stderr.read(data); + getLogger().error(new String(data, "UTF-8")); + } + } + catch (IOException e) { + getLogger().error("Unable to communicate with child process"); + } + } + } + + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + if (childProcess != null) { + IOUtils.closeQuietly(childProcess.getOutputStream()); + + try { + childProcess.waitFor(); + attendChildProceess(); + childProcess = null; + } + catch (InterruptedException e) { + throw new AnalysisEngineProcessException(e); + } + + runCwbCommand("cwb-makeall", "-r", registryDirectory.getPath(), "-V", + corpusName.toUpperCase()); + + if (cqpCompress) { + // Compress the token sequence of a positional attribute. Creates .huf, .hcd, + // and .huf.syn files, which replace the corresponding .corpus files. After + // running this tool successfully, the .corpus files can be deleted. + runCwbCommand("cwb-huffcode", "-r", registryDirectory.getPath(), "-A", + corpusName.toUpperCase()); + for (File f : listFiles(dataDirectory, new String[] { "huf" }, false)) { + deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus")); + } + + // Compress the index of a positional attribute. Creates .crc and .crx files + // which replace the corresponding .corpus.rev and .corpus.rdx files. After + // running this tool successfully, the latter files can be deleted. + runCwbCommand("cwb-compress-rdx", "-r", registryDirectory.getPath(), "-A", + corpusName.toUpperCase()); + for (File f : listFiles(dataDirectory, new String[] { "crc" }, false)) { + deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus.rev")); + deleteQuietly(new File(removeExtension(f.getPath()) + ".corpus.rdx")); + } + } + } + } + + private void runCwbCommand(String aCommand, String... aArguments) + throws AnalysisEngineProcessException + { + try { + List args = new ArrayList(aArguments.length + 1); + args.add(new File(cqpHome, aCommand).getAbsolutePath()); + for (String arg : aArguments) { + args.add(arg); + } + + ProcessBuilder pb = new ProcessBuilder(args); + getLogger().info("Spawning " + aCommand + ": " + join(args, " ")); + childProcess = pb.start(); + childProcess.waitFor(); + } + catch (InterruptedException e) { + throw new AnalysisEngineProcessException(e); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + finally { + attendChildProceess(); + childProcess = null; + } + } + + private static Map CHARSET_MAPPING = new HashMap(); + static { + CHARSET_MAPPING.put("ISO-8859-1", "latin1"); + CHARSET_MAPPING.put("UTF-8", "utf8"); + } + + private static String getCwbCharset(String aEncoding) + { + String enc = CHARSET_MAPPING.get(aEncoding); + if (enc == null) { + throw new IllegalArgumentException("Encoding [" + enc + "] not supported by CWB."); + } + return enc; + } + + private static String escapeXml(String aString) + { + return aString.replaceAll("&", "&").replaceAll("<", "<").replaceAll(">", ">") + .replaceAll("\"", """).replaceAll("'", "'"); + } + + /** + * Get the feature value of an annotation which is covered by another annotation. + * + * @param aFeaturePath + * The fully qualified feature path of the feature in question: + * your.package.and.annotation.class.name/featureName + * @param aCoveringAnnotation + * The annotation that covers the annotation for which the feature value should be + * extracted. + * @return the feature value if a feature name is given; coveredText if only the annotation is + * given + */ + public String getCoveredAnnotationFeatureValue(String aFeaturePath, + AnnotationFS aCoveringAnnotation) + { + String[] segments = aFeaturePath.split("/", 2); + if (segments.length != 2) { + throw new IllegalArgumentException("Given feature path is malformed: [" + aFeaturePath + + "] (exactly one \"/\" (slash) must exist)."); + } + String typeName = segments[0]; + String featureName = segments[1]; + Type type = CasUtil.getAnnotationType(aCoveringAnnotation.getCAS(), typeName); + Feature feature = type.getFeatureByBaseName(featureName); + if (feature == null) { + throw new IllegalArgumentException("Feature [" + featureName + + "] is not defined for type [" + type + "] (check lower/uppercase spelling)."); + } + + List covered = CasUtil.selectCovered(type, aCoveringAnnotation); + switch (covered.size()) { + case 0: + if (getLogger().isWarnEnabled()) { + getLogger().warn( + "There is no annotation of type [" + typeName + + "] available which is covered by [" + aCoveringAnnotation + + "], returning empty string."); + } + return ""; + case 1: + return covered.get(0).getFeatureValueAsString(feature); + default: + if (getLogger().isWarnEnabled()) { + getLogger().warn( + "There are multiple annotations of type [" + typeName + + "] available which are covered by [" + aCoveringAnnotation + + "], returning the first."); + } + return covered.get(0).getFeatureValueAsString(feature); + } + } +} diff --git a/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/package-info.java b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/package-info.java new file mode 100644 index 0000000000..4816370372 --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2010-2011 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for IMS Corpus Workbench tab-separated format + * also used by WaCky. + * + * @since 1.2.0 + */ +package org.dkpro.core.io.imscwb; diff --git a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/CorpusSentence.java b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/CorpusSentence.java similarity index 97% rename from dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/CorpusSentence.java rename to dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/CorpusSentence.java index 5536691016..53489c9dce 100644 --- a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/CorpusSentence.java +++ b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/CorpusSentence.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.imscwb.util; +package org.dkpro.core.io.imscwb.util; import java.util.ArrayList; import java.util.List; diff --git a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/CorpusText.java b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/CorpusText.java similarity index 90% rename from dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/CorpusText.java rename to dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/CorpusText.java index 0835009e29..989236ea09 100644 --- a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/CorpusText.java +++ b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/CorpusText.java @@ -15,23 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.imscwb.util; +package org.dkpro.core.io.imscwb.util; import java.util.ArrayList; import java.util.List; /** * Used for temporary storing extracted texts before adding to the CAS. - * - * */ public class CorpusText { private final List sentences; private String documentTitle; - public CorpusText() { - this(""); + public CorpusText() { + this(""); } public CorpusText(String title) @@ -55,8 +53,8 @@ public String getDocumentTitle() } public void setDocumentTitle(String documentTitle) { - this.documentTitle = documentTitle; - } + this.documentTitle = documentTitle; + } } diff --git a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/TabTokenizer.java b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/TabTokenizer.java similarity index 96% rename from dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/TabTokenizer.java rename to dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/TabTokenizer.java index 6bc92d8e56..db87b57f91 100644 --- a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/TabTokenizer.java +++ b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/TabTokenizer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.imscwb.util; +package org.dkpro.core.io.imscwb.util; import java.util.Iterator; @@ -77,4 +77,4 @@ public void remove() { throw new UnsupportedOperationException(); } -} \ No newline at end of file +} diff --git a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/TextIterable.java b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/TextIterable.java similarity index 88% rename from dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/TextIterable.java rename to dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/TextIterable.java index 9f7827cdfb..77cb0a4466 100644 --- a/dkpro-core-io-imscwb-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/util/TextIterable.java +++ b/dkpro-core-io-imscwb-asl/src/main/java/org/dkpro/core/io/imscwb/util/TextIterable.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.imscwb.util; +package org.dkpro.core.io.imscwb.util; import java.io.BufferedReader; import java.io.FileNotFoundException; @@ -29,8 +29,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; +import org.dkpro.core.api.resources.CompressionUtils; public class TextIterable implements Iterable, Iterator @@ -82,7 +82,8 @@ public boolean hasNext() { } @Override - public CorpusText next(){ + public CorpusText next() + { return texts.poll(); } @@ -133,14 +134,20 @@ private void fillTextQueue(int bufferSize) throws IOException { if (insideSentence && currentSentence != null) { TabTokenizer tokenizer = new TabTokenizer(line); - for (int i=0; i<3; i++) { + for (int i = 0; i < 3; i++) { if (!tokenizer.hasNext()) { throw new IOException("Ill-formed line: " + line); } switch (i) { - case 0 : currentSentence.addToken(tokenizer.next()); break; - case 1 : currentSentence.addPOS(tokenizer.next()); break; - case 2 : currentSentence.addLemma(tokenizer.next()); break; + case 0: + currentSentence.addToken(tokenizer.next()); + break; + case 1: + currentSentence.addPOS(tokenizer.next()); + break; + case 2: + currentSentence.addLemma(tokenizer.next()); + break; } } } @@ -174,4 +181,4 @@ private BufferedReader getReader() throws FileNotFoundException, IOException { } return r; } -} \ No newline at end of file +} diff --git a/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReaderTest.java b/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReaderTest.java deleted file mode 100644 index 8554581a84..0000000000 --- a/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReaderTest.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.imscwb; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.iteratePipeline; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class ImsCwbReaderTest -{ - @Test - public void wackyTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - ImsCwbReader.class, - ImsCwbReader.PARAM_SOURCE_LOCATION, "src/test/resources/wacky/", - ImsCwbReader.PARAM_LANGUAGE, "de", - ImsCwbReader.PARAM_SOURCE_ENCODING, "ISO-8859-15", - ResourceCollectionReaderBase.PARAM_PATTERNS, "[+]test.txt"); - - String firstSentence = "Nikita ( La Femme Nikita ) Dieser Episodenf\u00FChrer wurde von " + - "September 1998 bis Mai 1999 von Konstantin C.W. Volkmann geschrieben und im Mai " + - "2000 von Stefan B\u00F6rzel \u00FCbernommen . "; - - int i = 0; - for (JCas jcas : iteratePipeline(reader)) { - // System.out.println(jcas.getDocumentText()); - if (i == 0) { - assertEquals(11406, select(jcas, Token.class).size()); - assertEquals(11406, select(jcas, Lemma.class).size()); - assertEquals(11406, select(jcas, POS.class).size()); - assertEquals(717, select(jcas, Sentence.class).size()); - - assertEquals(firstSentence, select(jcas, Sentence.class).iterator().next() - .getCoveredText()); - - assertEquals("http://www.epguides.de/nikita.htm", DocumentMetaData.get(jcas) - .getDocumentTitle()); - } - i++; - } - - assertEquals(4, i); - - } - - @Test - public void wackyTest_noAnnotations() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - ImsCwbReader.class, - ImsCwbReader.PARAM_SOURCE_LOCATION, "src/test/resources/wacky/", - ImsCwbReader.PARAM_PATTERNS, "[+]test.txt", - ImsCwbReader.PARAM_LANGUAGE, "de", - ImsCwbReader.PARAM_SOURCE_ENCODING, "ISO-8859-15", - ImsCwbReader.PARAM_READ_TOKEN, false, - ImsCwbReader.PARAM_READ_LEMMA, false, - ImsCwbReader.PARAM_READ_POS, false, - ImsCwbReader.PARAM_READ_SENTENCES, false); - - int i = 0; - for (JCas jcas : iteratePipeline(reader)) { - if (i == 0) { - assertEquals(0, select(jcas, Token.class).size()); - assertEquals(0, select(jcas, POS.class).size()); - assertEquals(0, select(jcas, Sentence.class).size()); - } - i++; - } - - assertEquals(4, i); - } - - @Test(expected = IllegalStateException.class) - public void wackyTest__expectedException() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - ImsCwbReader.class, - ImsCwbReader.PARAM_SOURCE_LOCATION, "src/test/resources/wacky", - ImsCwbReader.PARAM_LANGUAGE, "de", - ImsCwbReader.PARAM_SOURCE_ENCODING, "ISO-8859-15", - ImsCwbReader.PARAM_READ_TOKEN, false, - ImsCwbReader.PARAM_READ_LEMMA, true, - ImsCwbReader.PARAM_READ_POS, false, - ImsCwbReader.PARAM_READ_SENTENCES, false); - - for (JCas jcas : iteratePipeline(reader)) { - // should never get here - fail("no Exception!"); - } - fail("no Exception!"); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} \ No newline at end of file diff --git a/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReaderWriterTest.java b/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReaderWriterTest.java deleted file mode 100644 index 7401a3006a..0000000000 --- a/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbReaderWriterTest.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.imscwb; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay2; - -import java.io.File; - -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class ImsCwbReaderWriterTest -{ - @Test - public void testTuebadz() - throws Exception - { - testOneWay2(ImsCwbReader.class, ImsCwbWriter.class, "tuebadz/corpus-sample-ref.txt", - "corpus-sample-ref.txt", "tuebadz/corpus-sample-ref.txt", - ComponentParameters.PARAM_TARGET_LOCATION, - new File(testContext.getTestOutputFolder(), "corpus-sample-ref.txt"), - ImsCwbReader.PARAM_LANGUAGE, "de", - ImsCwbReader.PARAM_POS_TAG_SET, "stts"); - } - - @Test - public void testWacky() - throws Exception - { - testOneWay2(ImsCwbReader.class, ImsCwbWriter.class, "wacky/test-ref.txt", - "test.txt", "wacky/test.txt", - ComponentParameters.PARAM_TARGET_LOCATION, - new File(testContext.getTestOutputFolder(), "test.txt"), - ImsCwbReader.PARAM_SOURCE_ENCODING, "iso8859-1"); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbWriterTest.java b/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbWriterTest.java deleted file mode 100644 index 71848b0907..0000000000 --- a/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/ImsCwbWriterTest.java +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */package de.tudarmstadt.ukp.dkpro.core.io.imscwb; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; -import static org.junit.Assert.assertEquals; - -import java.io.File; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.component.CasDumpWriter; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.bnc.BncReader; -import de.tudarmstadt.ukp.dkpro.core.io.negra.NegraExportReader; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.snowball.SnowballStemmer; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.EOLUtils; - -/** - * - * - */ -public class ImsCwbWriterTest -{ - private static final String outputFile = "target/corpus-sample.ims"; - - @Test - public void test1() - throws Exception - { - File dump = new File(testContext.getTestOutputFolder(), "/dump.txt"); - File output = new File(testContext.getTestOutputFolder(), "/output.txt"); - - CollectionReader ner = createReader( - NegraExportReader.class, - NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/tuebadz/corpus-sample.export", - NegraExportReader.PARAM_LANGUAGE, "de", - NegraExportReader.PARAM_SOURCE_ENCODING, "UTF-8"); - - AnalysisEngineDescription tag = createEngineDescription( - OpenNlpPosTagger.class); - - AnalysisEngineDescription tw = createEngineDescription( - ImsCwbWriter.class, - ImsCwbWriter.PARAM_TARGET_LOCATION, output, - ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8"); - - AnalysisEngineDescription cdw = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, dump); - - runPipeline(ner, tag, tw, cdw); - - String reference = FileUtils.readFileToString( - new File("src/test/resources/tuebadz/corpus-sample-ref.txt"), "UTF-8"); - String actual = FileUtils.readFileToString( - output, "UTF-8"); - reference = EOLUtils.normalizeLineEndings(reference); - actual = EOLUtils.normalizeLineEndings(actual); - assertEquals(reference, actual); - } - - @Test - public void testAdditionalFeatures() - throws Exception - { - CollectionReader ner = createReader( - NegraExportReader.class, - NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/tuebadz/corpus-sample.export", - NegraExportReader.PARAM_LANGUAGE, "de", - NegraExportReader.PARAM_SOURCE_ENCODING, "UTF-8"); - - AnalysisEngineDescription tag = createEngineDescription( - OpenNlpPosTagger.class); - - AnalysisEngineDescription stem = createEngineDescription( - SnowballStemmer.class); - - AnalysisEngineDescription tw = createEngineDescription( - ImsCwbWriter.class, - ImsCwbWriter.PARAM_TARGET_LOCATION, outputFile, - ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8", - ImsCwbWriter.PARAM_WRITE_CPOS, true, - ImsCwbWriter.PARAM_ADDITIONAL_FEATURES, new String[] { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem/value" }); - - AnalysisEngineDescription cdw = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/dump.txt"); - - runPipeline(ner, tag, stem, tw, cdw); - - String reference = FileUtils.readFileToString( - new File("src/test/resources/tuebadz/corpus-sample-addfeat-ref.txt"), "UTF-8"); - String actual = FileUtils.readFileToString( - new File(outputFile), "UTF-8"); - reference = EOLUtils.normalizeLineEndings(reference); - actual = EOLUtils.normalizeLineEndings(actual); - assertEquals(reference, actual); - } - @Ignore("FX8 is a file from the BNC. While available online for download, we currently do not " - + "ship it due to licensing issues.") - @Test - public void test1a() - throws Exception - { - CollectionReader ner = createReader( - BncReader.class, - BncReader.PARAM_SOURCE_LOCATION, "src/test/resources", - BncReader.PARAM_PATTERNS, new String[] { "[+]FX8.xml" }, - BncReader.PARAM_LANGUAGE, "en"); - - AnalysisEngineDescription tw = createEngineDescription( - ImsCwbWriter.class, - ImsCwbWriter.PARAM_TARGET_LOCATION, outputFile, - ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8"); - - AnalysisEngineDescription cdw = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/dump.txt"); - - runPipeline(ner, tw, cdw); - - String reference = FileUtils.readFileToString( - new File("src/test/resources/reference/bnc-sample.ims"), "UTF-8"); - String actual = FileUtils.readFileToString( - new File(outputFile), "UTF-8"); - assertEquals(reference, actual); - } - - @Ignore("This test cannot work (yet) because we do not ship the cwb-encode and cwb-makeall binaries") - @Test - public void test2() - throws Exception - { - CollectionReader ner = createReader( - NegraExportReader.class, - NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/corpus-sample.export", - NegraExportReader.PARAM_LANGUAGE, "de", - NegraExportReader.PARAM_SOURCE_ENCODING, "UTF-8"); - - AnalysisEngineDescription tag = createEngineDescription( - OpenNlpPosTagger.class); - - AnalysisEngineDescription tw = createEngineDescription( - ImsCwbWriter.class, - ImsCwbWriter.PARAM_TARGET_LOCATION, "target/cqbformat", - ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8", - ImsCwbWriter.PARAM_CQP_HOME, "/Users/bluefire/bin/cwb-2.2.b99"); - - runPipeline(ner, tag, tw); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/TuebadzToImsCwbPipeline.java b/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/TuebadzToImsCwbPipeline.java deleted file mode 100644 index 33fb00729a..0000000000 --- a/dkpro-core-io-imscwb-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/imscwb/TuebadzToImsCwbPipeline.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */package de.tudarmstadt.ukp.dkpro.core.io.imscwb; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReader; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.negra.NegraExportReader; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; - -/** - * - */ -@Ignore("This is to convert the actual corpus!") -public class TuebadzToImsCwbPipeline -{ - private static final String inputFile = "src/main/resources/tuebadz.export"; - private static final String outputFile = "target/tuebadz.ims.xml"; - - @Test - public void convert() - throws Exception - { - CollectionReader ner = createReader( - NegraExportReader.class, - NegraExportReader.PARAM_SOURCE_LOCATION, inputFile, - NegraExportReader.PARAM_LANGUAGE, "de", - NegraExportReader.PARAM_SOURCE_ENCODING, "ISO-8859-1"); - - AnalysisEngineDescription tag = createEngineDescription( - OpenNlpPosTagger.class); - - AnalysisEngineDescription tw = createEngineDescription( - ImsCwbWriter.class, - ImsCwbWriter.PARAM_TARGET_LOCATION, outputFile, - ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8"); - - runPipeline(ner, tag, tw); - } -} diff --git a/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbReaderTest.java b/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbReaderTest.java new file mode 100644 index 0000000000..7052ed5378 --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbReaderTest.java @@ -0,0 +1,130 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.imscwb; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.iteratePipeline; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.io.imscwb.ImsCwbReader; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class ImsCwbReaderTest +{ + @Test + public void wackyTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + ImsCwbReader.class, + ImsCwbReader.PARAM_SOURCE_LOCATION, "src/test/resources/wacky/", + ImsCwbReader.PARAM_LANGUAGE, "de", + ImsCwbReader.PARAM_SOURCE_ENCODING, "ISO-8859-15", + ResourceCollectionReaderBase.PARAM_PATTERNS, "[+]test.txt"); + + String firstSentence = "Nikita ( La Femme Nikita ) Dieser Episodenf\u00FChrer wurde von " + + "September 1998 bis Mai 1999 von Konstantin C.W. Volkmann geschrieben und im Mai " + + "2000 von Stefan B\u00F6rzel \u00FCbernommen . "; + + int i = 0; + for (JCas jcas : iteratePipeline(reader)) { + // System.out.println(jcas.getDocumentText()); + if (i == 0) { + assertEquals(11406, select(jcas, Token.class).size()); + assertEquals(11406, select(jcas, Lemma.class).size()); + assertEquals(11406, select(jcas, POS.class).size()); + assertEquals(717, select(jcas, Sentence.class).size()); + + assertEquals(firstSentence, select(jcas, Sentence.class).iterator().next() + .getCoveredText()); + + assertEquals("http://www.epguides.de/nikita.htm", DocumentMetaData.get(jcas) + .getDocumentTitle()); + } + i++; + } + + assertEquals(4, i); + + } + + @Test + public void wackyTest_noAnnotations() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + ImsCwbReader.class, + ImsCwbReader.PARAM_SOURCE_LOCATION, "src/test/resources/wacky/", + ImsCwbReader.PARAM_PATTERNS, "[+]test.txt", + ImsCwbReader.PARAM_LANGUAGE, "de", + ImsCwbReader.PARAM_SOURCE_ENCODING, "ISO-8859-15", + ImsCwbReader.PARAM_READ_TOKEN, false, + ImsCwbReader.PARAM_READ_LEMMA, false, + ImsCwbReader.PARAM_READ_POS, false, + ImsCwbReader.PARAM_READ_SENTENCES, false); + + int i = 0; + for (JCas jcas : iteratePipeline(reader)) { + if (i == 0) { + assertEquals(0, select(jcas, Token.class).size()); + assertEquals(0, select(jcas, POS.class).size()); + assertEquals(0, select(jcas, Sentence.class).size()); + } + i++; + } + + assertEquals(4, i); + } + + @Test(expected = IllegalStateException.class) + public void wackyTest__expectedException() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + ImsCwbReader.class, + ImsCwbReader.PARAM_SOURCE_LOCATION, "src/test/resources/wacky", + ImsCwbReader.PARAM_LANGUAGE, "de", + ImsCwbReader.PARAM_SOURCE_ENCODING, "ISO-8859-15", + ImsCwbReader.PARAM_READ_TOKEN, false, + ImsCwbReader.PARAM_READ_LEMMA, true, + ImsCwbReader.PARAM_READ_POS, false, + ImsCwbReader.PARAM_READ_SENTENCES, false); + + for (JCas jcas : iteratePipeline(reader)) { + // should never get here + fail("no Exception!"); + } + fail("no Exception!"); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbReaderWriterTest.java b/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbReaderWriterTest.java new file mode 100644 index 0000000000..55708be82d --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbReaderWriterTest.java @@ -0,0 +1,87 @@ +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.imscwb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.util.Files.contentOf; + +import java.io.File; + +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.ReaderAssert; +import org.junit.Rule; +import org.junit.Test; + +public class ImsCwbReaderWriterTest +{ + @Test + public void thatRoundTripWithTuebaDzWorks() + throws Exception + { + ReaderAssert.assertThat( + ImsCwbReader.class, + ImsCwbReader.PARAM_SOURCE_LOCATION, + "src/test/resources/tuebadz/corpus-sample-ref.txt", + ImsCwbReader.PARAM_POS_TAG_SET, "stts", + ImsCwbReader.PARAM_LANGUAGE, "de") + .usingWriter( + ImsCwbWriter.class) + .writingToSingular("${TARGET}/corpus-sample-ref.txt") + .outputAsString() + .isEqualToNormalizingNewlines(contentOf( + new File("src/test/resources/tuebadz/corpus-sample-ref.txt"), UTF_8)); + } + + @Test + public void thatRoundTripWithMultipleInputsWorks() + throws Exception + { + ReaderAssert.assertThat( + ImsCwbReader.class, + ImsCwbReader.PARAM_SOURCE_LOCATION, + "src/test/resources/multiple/*.vrt") + .usingWriter( + ImsCwbWriter.class) + .keepOriginalExtension() + .asFiles() + .allSatisfy(file -> assertThat(contentOf(file, UTF_8)).isEqualToNormalizingNewlines( + contentOf(new File("src/test/resources/multiple", file.getName()), UTF_8))); + } + + @Test + public void thatOneWayWithWackyWorks() throws Exception + { + ReaderAssert.assertThat( + ImsCwbReader.class, + ImsCwbReader.PARAM_SOURCE_LOCATION, + "src/test/resources/wacky/test.txt", + ImsCwbReader.PARAM_POS_TAG_SET, "stts", + ImsCwbReader.PARAM_LANGUAGE, "de", + ImsCwbReader.PARAM_SOURCE_ENCODING, "iso8859-1") + .usingWriter( + ImsCwbWriter.class) + .writingToSingular("${TARGET}/test.txt") + .outputAsString() + .isEqualToNormalizingNewlines(contentOf( + new File("src/test/resources/wacky/test-ref.txt"), UTF_8)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbWriterTest.java b/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbWriterTest.java new file mode 100644 index 0000000000..84b5d8dd67 --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/ImsCwbWriterTest.java @@ -0,0 +1,113 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.imscwb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; +import static org.assertj.core.util.Files.contentOf; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReader; +import org.dkpro.core.io.negra.NegraExportReader; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.snowball.SnowballStemmer; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.ReaderAssert; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +public class ImsCwbWriterTest +{ + @Test + public void thatWritingTuebaDzSampleWorks() + throws Exception + { + ReaderAssert.assertThat( + NegraExportReader.class, + NegraExportReader.PARAM_SOURCE_LOCATION, + "src/test/resources/tuebadz/corpus-sample.export", + NegraExportReader.PARAM_LANGUAGE, "de", + NegraExportReader.PARAM_SOURCE_ENCODING, "UTF-8") + .usingEngines( + createEngineDescription(OpenNlpPosTagger.class)) + .usingWriter( + ImsCwbWriter.class, + ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8") + .writingToSingular("${TARGET}/corpus-sample.vrt") + .outputAsString() + .isEqualToNormalizingNewlines(contentOf( + new File("src/test/resources/tuebadz/corpus-sample-ref.txt"), UTF_8)); + } + + @Test + public void thatWritingTuebaDzSampleWithAdditionalFeaturesWorks() + throws Exception + { + ReaderAssert.assertThat( + NegraExportReader.class, + NegraExportReader.PARAM_SOURCE_LOCATION, + "src/test/resources/tuebadz/corpus-sample.export", + NegraExportReader.PARAM_LANGUAGE, "de", + NegraExportReader.PARAM_SOURCE_ENCODING, "UTF-8") + .usingEngines( + createEngineDescription(OpenNlpPosTagger.class), + createEngineDescription(SnowballStemmer.class)) + .usingWriter( + ImsCwbWriter.class, + ImsCwbWriter.PARAM_TARGET_LOCATION, "${TARGET}/corpus-sample-addfeat.vrt", + ImsCwbWriter.PARAM_SINGULAR_TARGET, true, + ImsCwbWriter.PARAM_WRITE_CPOS, true, + ImsCwbWriter.PARAM_ADDITIONAL_FEATURES, new String[] { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem/value" }) + .outputAsString() + .isEqualToNormalizingNewlines(contentOf( + new File("src/test/resources/tuebadz/corpus-sample-addfeat-ref.txt"), + UTF_8)); + } + + @Ignore("This test cannot work (yet) because we do not ship the cwb-encode and cwb-makeall binaries") + @Test + public void test2() + throws Exception + { + CollectionReader ner = createReader( + NegraExportReader.class, + NegraExportReader.PARAM_SOURCE_LOCATION, "src/test/resources/corpus-sample.export", + NegraExportReader.PARAM_LANGUAGE, "de", + NegraExportReader.PARAM_SOURCE_ENCODING, "UTF-8"); + + AnalysisEngineDescription tag = createEngineDescription( + OpenNlpPosTagger.class); + + AnalysisEngineDescription tw = createEngineDescription( + ImsCwbWriter.class, + ImsCwbWriter.PARAM_TARGET_LOCATION, "target/cqbformat", + ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8", + ImsCwbWriter.PARAM_CQP_HOME, "/Users/bluefire/bin/cwb-2.2.b99"); + + runPipeline(ner, tag, tw); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/TuebadzToImsCwbPipeline.java b/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/TuebadzToImsCwbPipeline.java new file mode 100644 index 0000000000..6db0ebbf3c --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/test/java/org/dkpro/core/io/imscwb/TuebadzToImsCwbPipeline.java @@ -0,0 +1,57 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.imscwb; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReader; +import org.dkpro.core.io.imscwb.ImsCwbWriter; +import org.dkpro.core.io.negra.NegraExportReader; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.junit.Ignore; +import org.junit.Test; + +@Ignore("This is to convert the actual corpus!") +public class TuebadzToImsCwbPipeline +{ + private static final String inputFile = "src/main/resources/tuebadz.export"; + private static final String outputFile = "target/tuebadz.ims.xml"; + + @Test + public void convert() + throws Exception + { + CollectionReader ner = createReader( + NegraExportReader.class, + NegraExportReader.PARAM_SOURCE_LOCATION, inputFile, + NegraExportReader.PARAM_LANGUAGE, "de", + NegraExportReader.PARAM_SOURCE_ENCODING, "ISO-8859-1"); + + AnalysisEngineDescription tag = createEngineDescription( + OpenNlpPosTagger.class); + + AnalysisEngineDescription tw = createEngineDescription( + ImsCwbWriter.class, + ImsCwbWriter.PARAM_TARGET_LOCATION, outputFile, + ImsCwbWriter.PARAM_TARGET_ENCODING, "UTF-8"); + + runPipeline(ner, tag, tw); + } +} diff --git a/dkpro-core-io-imscwb-asl/src/test/resources/log4j.properties b/dkpro-core-io-imscwb-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-imscwb-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-imscwb-asl/src/test/resources/log4j2.xml b/dkpro-core-io-imscwb-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-imscwb-asl/src/test/resources/multiple/T990507.2.vrt b/dkpro-core-io-imscwb-asl/src/test/resources/multiple/T990507.2.vrt new file mode 100644 index 0000000000..a878b6870c --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/test/resources/multiple/T990507.2.vrt @@ -0,0 +1,22 @@ + + +Seine PPOSAT - +Position NN - +bei APPR - +der ART - +Prüfgesellschaft NN - +mußte VVFIN - +er PPER - +damals ADV - +niederlegen VVPP - +, $, - +den ART - +AWO-Posten NN - +nicht PTKNEG - +. $. - + + +K. NE - +W. NE - + + diff --git a/dkpro-core-io-imscwb-asl/src/test/resources/multiple/T990507.3.vrt b/dkpro-core-io-imscwb-asl/src/test/resources/multiple/T990507.3.vrt new file mode 100644 index 0000000000..7e1a3c10f2 --- /dev/null +++ b/dkpro-core-io-imscwb-asl/src/test/resources/multiple/T990507.3.vrt @@ -0,0 +1,19 @@ + + +SPD NE - +/ $( - +CDU NE - +/ $( - +AfB NE - +für $( - +Daewoo-Millionen NN - + + +Aber KON - +Bremerhavens NE - +AfB NE - +fordert VVFIN - +jetzt ADV - +Untersuchungsausschuß NN - + + diff --git a/dkpro-core-io-jdbc-asl/pom.xml b/dkpro-core-io-jdbc-asl/pom.xml index 997cbdb115..efaf8518cd 100644 --- a/dkpro-core-io-jdbc-asl/pom.xml +++ b/dkpro-core-io-jdbc-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.jdbc-asl + dkpro-core-io-jdbc-asl jar DKPro Core ASL - IO - JDBC + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -36,8 +37,12 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -52,7 +57,7 @@ org.hsqldb hsqldb - 2.3.2 + 2.5.0 test diff --git a/dkpro-core-io-jdbc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/package-info.java b/dkpro-core-io-jdbc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/package-info.java deleted file mode 100644 index 7535b4929b..0000000000 --- a/dkpro-core-io-jdbc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for JDBC-capable SQL databases. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jdbc; diff --git a/dkpro-core-io-jdbc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/JdbcReader.java b/dkpro-core-io-jdbc-asl/src/main/java/org/dkpro/core/io/jdbc/JdbcReader.java similarity index 92% rename from dkpro-core-io-jdbc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/JdbcReader.java rename to dkpro-core-io-jdbc-asl/src/main/java/org/dkpro/core/io/jdbc/JdbcReader.java index e90e63c4fe..30fc7f23ec 100644 --- a/dkpro-core-io-jdbc-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/JdbcReader.java +++ b/dkpro-core-io-jdbc-asl/src/main/java/org/dkpro/core/io/jdbc/JdbcReader.java @@ -15,9 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.jdbc; +package org.dkpro.core.io.jdbc; + +import static java.util.Arrays.asList; + +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.HashSet; +import java.util.Set; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import org.apache.commons.dbutils.DbUtils; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; @@ -31,12 +42,10 @@ import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; -import java.io.IOException; -import java.sql.*; -import java.util.HashSet; -import java.util.Set; - -import static java.util.Arrays.asList; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Collection reader for JDBC database.The obtained data will be written into CAS DocumentText as @@ -50,7 +59,9 @@ * will create a CAS for each record, write the content of "text" column into CAS document text and * that of "title" column into the document title field of the {@link DocumentMetaData} annotation. */ -@ResourceMetaData(name="JDBC-based Database Reader") +@Component(value = OperationType.READER) +@ResourceMetaData(name = "JDBC-based Database Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) @@ -74,8 +85,8 @@ public class JdbcReader /** * Specify the class name of the JDBC driver. *

- * If used with uimaFIT and the value is not given, com.mysql.cj.jdbc.Driver will be - * taken. + * If used with uimaFIT and the value is not given, com.mysql.cj.jdbc.Driver will + * be taken. */ public static final String PARAM_DRIVER = "driver"; @ConfigurationParameter(name = PARAM_DRIVER, mandatory = true, defaultValue = "com.mysql.cj.jdbc.Driver") @@ -87,15 +98,16 @@ public class JdbcReader * If used with uimaFIT and the value is not given, jdbc:mysql://127.0.0.1/ will be * taken. *

- * Do not use this parameter to add additional parameters, but use {@link #PARAM_CONNECTION_PARAMS} - * instead. + * Do not use this parameter to add additional parameters, but use + * {@link #PARAM_CONNECTION_PARAMS} instead. */ public static final String PARAM_CONNECTION = "connection"; @ConfigurationParameter(name = PARAM_CONNECTION, mandatory = true, defaultValue = "jdbc:mysql://127.0.0.1/") private String connection; /** - * Add additional parameters for the connection URL here in a single string: {@code [&propertyName1=propertyValue1[&propertyName2=propertyValue2]...]}. + * Add additional parameters for the connection URL here in a single string: + * {@code [&propertyName1=propertyValue1[&propertyName2=propertyValue2]...]}. */ public static final String PARAM_CONNECTION_PARAMS = "connectionParams"; @ConfigurationParameter(name = PARAM_CONNECTION_PARAMS, mandatory = true, defaultValue = "") diff --git a/dkpro-core-io-jdbc-asl/src/main/java/org/dkpro/core/io/jdbc/package-info.java b/dkpro-core-io-jdbc-asl/src/main/java/org/dkpro/core/io/jdbc/package-info.java new file mode 100644 index 0000000000..8e75711dc2 --- /dev/null +++ b/dkpro-core-io-jdbc-asl/src/main/java/org/dkpro/core/io/jdbc/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for JDBC-capable SQL databases. + */ +package org.dkpro.core.io.jdbc; diff --git a/dkpro-core-io-jdbc-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/JdbcReaderExample.java b/dkpro-core-io-jdbc-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/JdbcReaderExample.java deleted file mode 100644 index 12419266d0..0000000000 --- a/dkpro-core-io-jdbc-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jdbc/JdbcReaderExample.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jdbc; - -import java.io.IOException; -import java.sql.Connection; -import java.sql.DriverManager; -import java.sql.SQLException; -import java.sql.Statement; - -import junit.framework.Assert; - -import org.apache.commons.dbutils.DbUtils; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.component.CasDumpWriter; -import org.apache.uima.fit.factory.AnalysisEngineFactory; -import org.apache.uima.fit.factory.CollectionReaderFactory; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; - -public class JdbcReaderExample -{ - public static final String DB_NAME = "test_db"; - public static final String DB_USER = "root"; - public static final String DB_PASS = ""; - public static final String TBL_NAME = "test_table"; - - String query = "SELECT title AS \"" + JdbcReader.CAS_METADATA_TITLE + "\", text AS \"" - + JdbcReader.CAS_TEXT + "\" FROM " + TBL_NAME + ";"; - - @Test - public void localhostMysqlExample() - throws UIMAException, IOException - { - // This is a dummy example. It only shows how to use JdbcReader and may not run on your - // system. - CollectionReader jdbcReader = CollectionReaderFactory.createReader( - JdbcReader.class, - JdbcReader.PARAM_DATABASE, DB_NAME, - JdbcReader.PARAM_USER, DB_USER, - JdbcReader.PARAM_PASSWORD, DB_PASS, - JdbcReader.PARAM_QUERY, query); - - AnalysisEngine extractor = AnalysisEngineFactory.createEngine(CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "-"); - - SimplePipeline.runPipeline(jdbcReader, extractor); - } - - @Test - public void hsqldbExampleTest() - throws SQLException, UIMAException, IOException - { - // Setup in-memory database. - Connection conn = null; - Statement stmnt = null; - try { - conn = DriverManager.getConnection("jdbc:hsqldb:mem:/" + DB_NAME, DB_USER, DB_PASS); - stmnt = conn.createStatement(); - stmnt.addBatch("CREATE TABLE " + TBL_NAME + " (title varchar(50), text varchar(100));"); - stmnt.addBatch("INSERT INTO " + TBL_NAME + " (title, text) VALUES ('title1', 'text...1');"); - stmnt.addBatch("INSERT INTO " + TBL_NAME + " (title, text) VALUES ('title2', 'text...2');"); - stmnt.addBatch("INSERT INTO " + TBL_NAME + " (title, text) VALUES ('title3', 'text...3');"); - stmnt.addBatch("INSERT INTO " + TBL_NAME + " (title, text) VALUES ('title4', 'text...4');"); - stmnt.executeBatch(); - } - finally { - DbUtils.closeQuietly(stmnt); - DbUtils.closeQuietly(conn); - } - // Read out with JdbcReader. - CollectionReader jdbcReader = CollectionReaderFactory.createReader( - JdbcReader.class, - JdbcReader.PARAM_DATABASE, "test_db", - JdbcReader.PARAM_USER, "root", - JdbcReader.PARAM_PASSWORD, "", - JdbcReader.PARAM_QUERY, query, - JdbcReader.PARAM_DRIVER, "org.hsqldb.jdbc.JDBCDriver", - JdbcReader.PARAM_CONNECTION, "jdbc:hsqldb:mem:"); - - int i = 1; - while (jdbcReader.hasNext()) { - // Does it still have a next row? - jdbcReader.hasNext(); - // Really? - jdbcReader.hasNext(); - - CAS cas = JCasFactory.createJCas().getCas(); - jdbcReader.getNext(cas); - Assert.assertEquals("title" + i, DocumentMetaData.get(cas).getDocumentTitle()); - Assert.assertEquals("text..." + i, cas.getDocumentText()); - i++; - } - } -} diff --git a/dkpro-core-io-jdbc-asl/src/test/java/org/dkpro/core/io/jdbc/JdbcReaderExample.java b/dkpro-core-io-jdbc-asl/src/test/java/org/dkpro/core/io/jdbc/JdbcReaderExample.java new file mode 100644 index 0000000000..9fbfccea2e --- /dev/null +++ b/dkpro-core-io-jdbc-asl/src/test/java/org/dkpro/core/io/jdbc/JdbcReaderExample.java @@ -0,0 +1,116 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jdbc; + +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.commons.dbutils.DbUtils; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.component.CasDumpWriter; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.jdbc.JdbcReader; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import junit.framework.Assert; + +public class JdbcReaderExample +{ + public static final String DB_NAME = "test_db"; + public static final String DB_USER = "root"; + public static final String DB_PASS = ""; + public static final String TBL_NAME = "test_table"; + + String query = "SELECT title AS \"" + JdbcReader.CAS_METADATA_TITLE + "\", text AS \"" + + JdbcReader.CAS_TEXT + "\" FROM " + TBL_NAME + ";"; + + @Test + public void localhostMysqlExample() + throws UIMAException, IOException + { + // This is a dummy example. It only shows how to use JdbcReader and may not run on your + // system. + CollectionReader jdbcReader = CollectionReaderFactory.createReader( + JdbcReader.class, + JdbcReader.PARAM_DATABASE, DB_NAME, + JdbcReader.PARAM_USER, DB_USER, + JdbcReader.PARAM_PASSWORD, DB_PASS, + JdbcReader.PARAM_QUERY, query); + + AnalysisEngine extractor = AnalysisEngineFactory.createEngine(CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "-"); + + SimplePipeline.runPipeline(jdbcReader, extractor); + } + + @Test + public void hsqldbExampleTest() + throws SQLException, UIMAException, IOException + { + // Setup in-memory database. + Connection conn = null; + Statement stmnt = null; + try { + conn = DriverManager.getConnection("jdbc:hsqldb:mem:/" + DB_NAME, DB_USER, DB_PASS); + stmnt = conn.createStatement(); + stmnt.addBatch("CREATE TABLE " + TBL_NAME + " (title varchar(50), text varchar(100));"); + stmnt.addBatch("INSERT INTO " + TBL_NAME + " (title, text) VALUES ('title1', 'text...1');"); + stmnt.addBatch("INSERT INTO " + TBL_NAME + " (title, text) VALUES ('title2', 'text...2');"); + stmnt.addBatch("INSERT INTO " + TBL_NAME + " (title, text) VALUES ('title3', 'text...3');"); + stmnt.addBatch("INSERT INTO " + TBL_NAME + " (title, text) VALUES ('title4', 'text...4');"); + stmnt.executeBatch(); + } + finally { + DbUtils.closeQuietly(stmnt); + DbUtils.closeQuietly(conn); + } + // Read out with JdbcReader. + CollectionReader jdbcReader = CollectionReaderFactory.createReader( + JdbcReader.class, + JdbcReader.PARAM_DATABASE, "test_db", + JdbcReader.PARAM_USER, "root", + JdbcReader.PARAM_PASSWORD, "", + JdbcReader.PARAM_QUERY, query, + JdbcReader.PARAM_DRIVER, "org.hsqldb.jdbc.JDBCDriver", + JdbcReader.PARAM_CONNECTION, "jdbc:hsqldb:mem:"); + + int i = 1; + while (jdbcReader.hasNext()) { + // Does it still have a next row? + jdbcReader.hasNext(); + // Really? + jdbcReader.hasNext(); + + CAS cas = JCasFactory.createJCas().getCas(); + jdbcReader.getNext(cas); + Assert.assertEquals("title" + i, DocumentMetaData.get(cas).getDocumentTitle()); + Assert.assertEquals("text..." + i, cas.getDocumentText()); + i++; + } + } +} diff --git a/dkpro-core-io-json-asl/pom.xml b/dkpro-core-io-json-asl/pom.xml index a03b4d6c4d..cf4ea7751f 100644 --- a/dkpro-core-io-json-asl/pom.xml +++ b/dkpro-core-io-json-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.json-asl + dkpro-core-io-json-asl jar DKPro Core ASL - IO - UIMA JSON + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,12 +45,16 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -57,13 +62,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.conll-asl + org.dkpro.core + dkpro-core-io-conll-asl test diff --git a/dkpro-core-io-json-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/json/JsonWriter.java b/dkpro-core-io-json-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/json/JsonWriter.java deleted file mode 100644 index 3100ea1c57..0000000000 --- a/dkpro-core-io-json-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/json/JsonWriter.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.json; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CASRuntimeException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.json.JsonCasSerializer; -import org.apache.uima.json.JsonCasSerializer.JsonContextFormat; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.TypeSystemUtil; -import org.xml.sax.SAXException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; - -/** - * UIMA JSON format writer. - */ -@ResourceMetaData(name="UIMA JSON CAS Writer") -@MimeTypeCapability({MimeTypes.APPLICATION_X_UIMA_JSON}) -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) -public class JsonWriter - extends JCasFileWriter_ImplBase -{ - /** - * Location to write the type system to. If this is not set, a file called typesystem.xml will - * be written to the XMI output path. If this is set, it is expected to be a file relative - * to the current work directory or an absolute file. - *
- * If this parameter is set, the {@link #PARAM_COMPRESSION} parameter has no effect on the - * type system. Instead, if the file name ends in ".gz", the file will be compressed, - * otherwise not. - */ - public static final String PARAM_TYPE_SYSTEM_FILE = "typeSystemFile"; - @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_FILE, mandatory = false) - private File typeSystemFile; - - public static final String PARAM_PRETTY_PRINT = "prettyPrint"; - @ConfigurationParameter(name = PARAM_PRETTY_PRINT, mandatory = true, defaultValue = "true") - private boolean prettyPrint; - - public static final String PARAM_OMIT_DEFAULT_VALUES = "omitDefaultValues"; - @ConfigurationParameter(name = PARAM_OMIT_DEFAULT_VALUES, mandatory = true, defaultValue = "true") - private boolean omitDefaultValues; - - public static final String PARAM_JSON_CONTEXT_FORMAT = "jsonContextFormat"; - @ConfigurationParameter(name = PARAM_JSON_CONTEXT_FORMAT, mandatory = true, defaultValue = "omitExpandedTypeNames") - private String jsonContextFormat; - - private boolean typeSystemWritten; - - private JsonCasSerializer jcs; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - typeSystemWritten = false; - jcs = new JsonCasSerializer(); - jcs.setPrettyPrint(prettyPrint); - jcs.setOmit0Values(omitDefaultValues); - jcs.setJsonContext(JsonContextFormat.valueOf(jsonContextFormat)); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - try (OutputStream docOS = getOutputStream(aJCas, ".json")) { - jcs.serialize(aJCas.getCas(), docOS); - - if (!typeSystemWritten) { - writeTypeSystem(aJCas); - typeSystemWritten = true; - } - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } - - private void writeTypeSystem(JCas aJCas) - throws IOException, CASRuntimeException, SAXException - { - @SuppressWarnings("resource") - OutputStream typeOS = null; - - try { - if (typeSystemFile != null) { - typeOS = CompressionUtils.getOutputStream(typeSystemFile); - } - else { - typeOS = getOutputStream("TypeSystem", ".xml"); - } - - TypeSystemUtil.typeSystem2TypeSystemDescription(aJCas.getTypeSystem()).toXML(typeOS); - } - finally { - closeQuietly(typeOS); - } - } -} diff --git a/dkpro-core-io-json-asl/src/main/java/org/dkpro/core/io/json/JsonWriter.java b/dkpro-core-io-json-asl/src/main/java/org/dkpro/core/io/json/JsonWriter.java new file mode 100644 index 0000000000..736a83b111 --- /dev/null +++ b/dkpro-core-io-json-asl/src/main/java/org/dkpro/core/io/json/JsonWriter.java @@ -0,0 +1,145 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.json; + +import static org.apache.commons.io.IOUtils.closeQuietly; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CASRuntimeException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.json.JsonCasSerializer; +import org.apache.uima.json.JsonCasSerializer.JsonContextFormat; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.TypeSystemUtil; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.xml.sax.SAXException; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * UIMA JSON format writer. + */ +@ResourceMetaData(name = "UIMA JSON CAS Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_X_UIMA_JSON}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) +public class JsonWriter + extends JCasFileWriter_ImplBase +{ + /** + * Location to write the type system to. If this is not set, a file called typesystem.xml will + * be written to the XMI output path. If this is set, it is expected to be a file relative + * to the current work directory or an absolute file. + *
+ * If this parameter is set, the {@link #PARAM_COMPRESSION} parameter has no effect on the + * type system. Instead, if the file name ends in ".gz", the file will be compressed, + * otherwise not. + */ + public static final String PARAM_TYPE_SYSTEM_FILE = "typeSystemFile"; + @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_FILE, mandatory = false) + private File typeSystemFile; + + /** + * Whether to pretty-print the JSON output. + */ + public static final String PARAM_PRETTY_PRINT = "prettyPrint"; + @ConfigurationParameter(name = PARAM_PRETTY_PRINT, mandatory = true, defaultValue = "true") + private boolean prettyPrint; + + /** + * Whether to fields that have their default values from the JSON output. + */ + public static final String PARAM_OMIT_DEFAULT_VALUES = "omitDefaultValues"; + @ConfigurationParameter(name = PARAM_OMIT_DEFAULT_VALUES, mandatory = true, defaultValue = "true") + private boolean omitDefaultValues; + + /** + * The level of detail to use for the context (i.e. type system) information. + */ + public static final String PARAM_JSON_CONTEXT_FORMAT = "jsonContextFormat"; + @ConfigurationParameter(name = PARAM_JSON_CONTEXT_FORMAT, mandatory = true, defaultValue = "omitExpandedTypeNames") + private String jsonContextFormat; + + private boolean typeSystemWritten; + + private JsonCasSerializer jcs; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + typeSystemWritten = false; + jcs = new JsonCasSerializer(); + jcs.setPrettyPrint(prettyPrint); + jcs.setOmit0Values(omitDefaultValues); + jcs.setJsonContext(JsonContextFormat.valueOf(jsonContextFormat)); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + try (OutputStream docOS = getOutputStream(aJCas, ".json")) { + jcs.serialize(aJCas.getCas(), docOS); + + if (!typeSystemWritten) { + writeTypeSystem(aJCas); + typeSystemWritten = true; + } + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + private void writeTypeSystem(JCas aJCas) + throws IOException, CASRuntimeException, SAXException + { + @SuppressWarnings("resource") + OutputStream typeOS = null; + + try { + if (typeSystemFile != null) { + typeOS = CompressionUtils.getOutputStream(typeSystemFile); + } + else { + typeOS = getOutputStream("TypeSystem", ".xml"); + } + + TypeSystemUtil.typeSystem2TypeSystemDescription(aJCas.getTypeSystem()).toXML(typeOS); + } + finally { + closeQuietly(typeOS); + } + } +} diff --git a/dkpro-core-io-json-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/json/JsonWriterTest.java b/dkpro-core-io-json-asl/src/test/java/org/dkpro/core/io/json/JsonWriterTest.java similarity index 77% rename from dkpro-core-io-json-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/json/JsonWriterTest.java rename to dkpro-core-io-json-asl/src/test/java/org/dkpro/core/io/json/JsonWriterTest.java index 37bc760fbf..542c2d1c21 100644 --- a/dkpro-core-io-json-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/json/JsonWriterTest.java +++ b/dkpro-core-io-json-asl/src/test/java/org/dkpro/core/io/json/JsonWriterTest.java @@ -15,17 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.json; +package org.dkpro.core.io.json; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.io.conll.Conll2000Reader; +import org.dkpro.core.io.json.JsonWriter; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2000Reader; -import de.tudarmstadt.ukp.dkpro.core.io.json.JsonWriter; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class JsonWriterTest { @Test diff --git a/dkpro-core-io-json-asl/src/test/resources/conll/2000/chunk2000_ref.json b/dkpro-core-io-json-asl/src/test/resources/conll/2000/chunk2000_ref.json index 7f3742440a..adfdff1f6f 100644 --- a/dkpro-core-io-json-asl/src/test/resources/conll/2000/chunk2000_ref.json +++ b/dkpro-core-io-json-asl/src/test/resources/conll/2000/chunk2000_ref.json @@ -25,244 +25,244 @@ "_subtypes" : ["DocumentMetaData" ] } } }, "_views" : { "_InitialView" : { - "POS" : [36, 54, 72, 90, 108, 126, 144, 162, 180, 198, 216, 234, 252, 270, 288, 306, 324, 342, 360, 378, 396, 414, 432, 450, 468, 486, 504, 522, 540, 558, 576, 594, 612, 630, 648, 666, 684, 797, 815, 833, 851, 869, 887, 905, 923, 941, 959, 977, 995, 1013, 1031, 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1247, 1265, 1348, 1366, 1384, 1402, 1420, 1438, 1456, 1474, 1492, 1510, 1528, 1546, 1564, 1582, 1600, 1618, 1636, 1654, 1672, 1690, 1708, 1726, 1744, 1762, 1780, 1798, 1816, 1834, 1852 ], + "POS" : [5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221 ], "DocumentMetaData" : [ - {"sofa" : 6, "end" : 552, "language" : "x-unspecified", "documentTitle" : "chunk2000_test.conll", "documentId" : "chunk2000_test.conll", "isLastSegment" : false } ], + {"sofa" : 1, "end" : 552, "language" : "x-unspecified", "documentTitle" : "chunk2000_test.conll", "documentId" : "chunk2000_test.conll", "isLastSegment" : false } ], "Sentence" : [ - {"sofa" : 6, "end" : 215 }, - {"sofa" : 6, "begin" : 217, "end" : 372 }, - {"sofa" : 6, "begin" : 374, "end" : 550 } ], + {"sofa" : 1, "end" : 215 }, + {"sofa" : 1, "begin" : 217, "end" : 372 }, + {"sofa" : 1, "begin" : 374, "end" : 550 } ], "Token" : [ - {"sofa" : 6, "end" : 10, "pos" : 36 }, - {"sofa" : 6, "begin" : 11, "end" : 13, "pos" : 54 }, - {"sofa" : 6, "begin" : 14, "end" : 17, "pos" : 72 }, - {"sofa" : 6, "begin" : 18, "end" : 23, "pos" : 90 }, - {"sofa" : 6, "begin" : 24, "end" : 26, "pos" : 108 }, - {"sofa" : 6, "begin" : 27, "end" : 33, "pos" : 126 }, - {"sofa" : 6, "begin" : 34, "end" : 42, "pos" : 144 }, - {"sofa" : 6, "begin" : 43, "end" : 45, "pos" : 162 }, - {"sofa" : 6, "begin" : 46, "end" : 50, "pos" : 180 }, - {"sofa" : 6, "begin" : 51, "end" : 58, "pos" : 198 }, - {"sofa" : 6, "begin" : 59, "end" : 64, "pos" : 216 }, - {"sofa" : 6, "begin" : 65, "end" : 69, "pos" : 234 }, - {"sofa" : 6, "begin" : 70, "end" : 72, "pos" : 252 }, - {"sofa" : 6, "begin" : 73, "end" : 78, "pos" : 270 }, - {"sofa" : 6, "begin" : 79, "end" : 86, "pos" : 288 }, - {"sofa" : 6, "begin" : 87, "end" : 90, "pos" : 306 }, - {"sofa" : 6, "begin" : 91, "end" : 100, "pos" : 324 }, - {"sofa" : 6, "begin" : 101, "end" : 102, "pos" : 342 }, - {"sofa" : 6, "begin" : 103, "end" : 106, "pos" : 360 }, - {"sofa" : 6, "begin" : 107, "end" : 110, "pos" : 378 }, - {"sofa" : 6, "begin" : 111, "end" : 118, "pos" : 396 }, - {"sofa" : 6, "begin" : 119, "end" : 127, "pos" : 414 }, - {"sofa" : 6, "begin" : 128, "end" : 129, "pos" : 432 }, - {"sofa" : 6, "begin" : 130, "end" : 134, "pos" : 450 }, - {"sofa" : 6, "begin" : 135, "end" : 137, "pos" : 468 }, - {"sofa" : 6, "begin" : 138, "end" : 142, "pos" : 486 }, - {"sofa" : 6, "begin" : 143, "end" : 144, "pos" : 504 }, - {"sofa" : 6, "begin" : 145, "end" : 156, "pos" : 522 }, - {"sofa" : 6, "begin" : 157, "end" : 168, "pos" : 540 }, - {"sofa" : 6, "begin" : 169, "end" : 173, "pos" : 558 }, - {"sofa" : 6, "begin" : 174, "end" : 178, "pos" : 576 }, - {"sofa" : 6, "begin" : 179, "end" : 182, "pos" : 594 }, - {"sofa" : 6, "begin" : 183, "end" : 189, "pos" : 612 }, - {"sofa" : 6, "begin" : 190, "end" : 192, "pos" : 630 }, - {"sofa" : 6, "begin" : 193, "end" : 204, "pos" : 648 }, - {"sofa" : 6, "begin" : 205, "end" : 213, "pos" : 666 }, - {"sofa" : 6, "begin" : 214, "end" : 215, "pos" : 684 }, - {"sofa" : 6, "begin" : 217, "end" : 227, "pos" : 797 }, - {"sofa" : 6, "begin" : 228, "end" : 230, "pos" : 815 }, - {"sofa" : 6, "begin" : 231, "end" : 234, "pos" : 833 }, - {"sofa" : 6, "begin" : 235, "end" : 244, "pos" : 851 }, - {"sofa" : 6, "begin" : 245, "end" : 250, "pos" : 869 }, - {"sofa" : 6, "begin" : 251, "end" : 257, "pos" : 887 }, - {"sofa" : 6, "begin" : 258, "end" : 260, "pos" : 905 }, - {"sofa" : 6, "begin" : 261, "end" : 269, "pos" : 923 }, - {"sofa" : 6, "begin" : 270, "end" : 280, "pos" : 941 }, - {"sofa" : 6, "begin" : 281, "end" : 283, "pos" : 959 }, - {"sofa" : 6, "begin" : 284, "end" : 285, "pos" : 977 }, - {"sofa" : 6, "begin" : 286, "end" : 290, "pos" : 995 }, - {"sofa" : 6, "begin" : 291, "end" : 299, "pos" : 1013 }, - {"sofa" : 6, "begin" : 300, "end" : 306, "pos" : 1031 }, - {"sofa" : 6, "begin" : 307, "end" : 310, "pos" : 1049 }, - {"sofa" : 6, "begin" : 311, "end" : 317, "pos" : 1067 }, - {"sofa" : 6, "begin" : 318, "end" : 320, "pos" : 1085 }, - {"sofa" : 6, "begin" : 321, "end" : 328, "pos" : 1103 }, - {"sofa" : 6, "begin" : 329, "end" : 330, "pos" : 1121 }, - {"sofa" : 6, "begin" : 331, "end" : 339, "pos" : 1139 }, - {"sofa" : 6, "begin" : 340, "end" : 342, "pos" : 1157 }, - {"sofa" : 6, "begin" : 343, "end" : 351, "pos" : 1175 }, - {"sofa" : 6, "begin" : 352, "end" : 356, "pos" : 1193 }, - {"sofa" : 6, "begin" : 357, "end" : 360, "pos" : 1211 }, - {"sofa" : 6, "begin" : 361, "end" : 365, "pos" : 1229 }, - {"sofa" : 6, "begin" : 366, "end" : 370, "pos" : 1247 }, - {"sofa" : 6, "begin" : 371, "end" : 372, "pos" : 1265 }, - {"sofa" : 6, "begin" : 374, "end" : 377, "pos" : 1348 }, - {"sofa" : 6, "begin" : 378, "end" : 386, "pos" : 1366 }, - {"sofa" : 6, "begin" : 387, "end" : 393, "pos" : 1384 }, - {"sofa" : 6, "begin" : 394, "end" : 404, "pos" : 1402 }, - {"sofa" : 6, "begin" : 405, "end" : 412, "pos" : 1420 }, - {"sofa" : 6, "begin" : 413, "end" : 416, "pos" : 1438 }, - {"sofa" : 6, "begin" : 417, "end" : 425, "pos" : 1456 }, - {"sofa" : 6, "begin" : 426, "end" : 429, "pos" : 1474 }, - {"sofa" : 6, "begin" : 430, "end" : 434, "pos" : 1492 }, - {"sofa" : 6, "begin" : 435, "end" : 441, "pos" : 1510 }, - {"sofa" : 6, "begin" : 442, "end" : 444, "pos" : 1528 }, - {"sofa" : 6, "begin" : 445, "end" : 448, "pos" : 1546 }, - {"sofa" : 6, "begin" : 449, "end" : 459, "pos" : 1564 }, - {"sofa" : 6, "begin" : 460, "end" : 462, "pos" : 1582 }, - {"sofa" : 6, "begin" : 463, "end" : 470, "pos" : 1600 }, - {"sofa" : 6, "begin" : 471, "end" : 473, "pos" : 1618 }, - {"sofa" : 6, "begin" : 474, "end" : 482, "pos" : 1636 }, - {"sofa" : 6, "begin" : 483, "end" : 486, "pos" : 1654 }, - {"sofa" : 6, "begin" : 487, "end" : 490, "pos" : 1672 }, - {"sofa" : 6, "begin" : 491, "end" : 497, "pos" : 1690 }, - {"sofa" : 6, "begin" : 498, "end" : 506, "pos" : 1708 }, - {"sofa" : 6, "begin" : 507, "end" : 509, "pos" : 1726 }, - {"sofa" : 6, "begin" : 510, "end" : 513, "pos" : 1744 }, - {"sofa" : 6, "begin" : 514, "end" : 521, "pos" : 1762 }, - {"sofa" : 6, "begin" : 522, "end" : 527, "pos" : 1780 }, - {"sofa" : 6, "begin" : 528, "end" : 534, "pos" : 1798 }, - {"sofa" : 6, "begin" : 535, "end" : 539, "pos" : 1816 }, - {"sofa" : 6, "begin" : 540, "end" : 548, "pos" : 1834 }, - {"sofa" : 6, "begin" : 549, "end" : 550, "pos" : 1852 } ], + {"sofa" : 1, "end" : 10, "pos" : 5 }, + {"sofa" : 1, "begin" : 11, "end" : 13, "pos" : 7 }, + {"sofa" : 1, "begin" : 14, "end" : 17, "pos" : 9 }, + {"sofa" : 1, "begin" : 18, "end" : 23, "pos" : 11 }, + {"sofa" : 1, "begin" : 24, "end" : 26, "pos" : 13 }, + {"sofa" : 1, "begin" : 27, "end" : 33, "pos" : 15 }, + {"sofa" : 1, "begin" : 34, "end" : 42, "pos" : 17 }, + {"sofa" : 1, "begin" : 43, "end" : 45, "pos" : 19 }, + {"sofa" : 1, "begin" : 46, "end" : 50, "pos" : 21 }, + {"sofa" : 1, "begin" : 51, "end" : 58, "pos" : 23 }, + {"sofa" : 1, "begin" : 59, "end" : 64, "pos" : 25 }, + {"sofa" : 1, "begin" : 65, "end" : 69, "pos" : 27 }, + {"sofa" : 1, "begin" : 70, "end" : 72, "pos" : 29 }, + {"sofa" : 1, "begin" : 73, "end" : 78, "pos" : 31 }, + {"sofa" : 1, "begin" : 79, "end" : 86, "pos" : 33 }, + {"sofa" : 1, "begin" : 87, "end" : 90, "pos" : 35 }, + {"sofa" : 1, "begin" : 91, "end" : 100, "pos" : 37 }, + {"sofa" : 1, "begin" : 101, "end" : 102, "pos" : 39 }, + {"sofa" : 1, "begin" : 103, "end" : 106, "pos" : 41 }, + {"sofa" : 1, "begin" : 107, "end" : 110, "pos" : 43 }, + {"sofa" : 1, "begin" : 111, "end" : 118, "pos" : 45 }, + {"sofa" : 1, "begin" : 119, "end" : 127, "pos" : 47 }, + {"sofa" : 1, "begin" : 128, "end" : 129, "pos" : 49 }, + {"sofa" : 1, "begin" : 130, "end" : 134, "pos" : 51 }, + {"sofa" : 1, "begin" : 135, "end" : 137, "pos" : 53 }, + {"sofa" : 1, "begin" : 138, "end" : 142, "pos" : 55 }, + {"sofa" : 1, "begin" : 143, "end" : 144, "pos" : 57 }, + {"sofa" : 1, "begin" : 145, "end" : 156, "pos" : 59 }, + {"sofa" : 1, "begin" : 157, "end" : 168, "pos" : 61 }, + {"sofa" : 1, "begin" : 169, "end" : 173, "pos" : 63 }, + {"sofa" : 1, "begin" : 174, "end" : 178, "pos" : 65 }, + {"sofa" : 1, "begin" : 179, "end" : 182, "pos" : 67 }, + {"sofa" : 1, "begin" : 183, "end" : 189, "pos" : 69 }, + {"sofa" : 1, "begin" : 190, "end" : 192, "pos" : 71 }, + {"sofa" : 1, "begin" : 193, "end" : 204, "pos" : 73 }, + {"sofa" : 1, "begin" : 205, "end" : 213, "pos" : 75 }, + {"sofa" : 1, "begin" : 214, "end" : 215, "pos" : 77 }, + {"sofa" : 1, "begin" : 217, "end" : 227, "pos" : 98 }, + {"sofa" : 1, "begin" : 228, "end" : 230, "pos" : 100 }, + {"sofa" : 1, "begin" : 231, "end" : 234, "pos" : 102 }, + {"sofa" : 1, "begin" : 235, "end" : 244, "pos" : 104 }, + {"sofa" : 1, "begin" : 245, "end" : 250, "pos" : 106 }, + {"sofa" : 1, "begin" : 251, "end" : 257, "pos" : 108 }, + {"sofa" : 1, "begin" : 258, "end" : 260, "pos" : 110 }, + {"sofa" : 1, "begin" : 261, "end" : 269, "pos" : 112 }, + {"sofa" : 1, "begin" : 270, "end" : 280, "pos" : 114 }, + {"sofa" : 1, "begin" : 281, "end" : 283, "pos" : 116 }, + {"sofa" : 1, "begin" : 284, "end" : 285, "pos" : 118 }, + {"sofa" : 1, "begin" : 286, "end" : 290, "pos" : 120 }, + {"sofa" : 1, "begin" : 291, "end" : 299, "pos" : 122 }, + {"sofa" : 1, "begin" : 300, "end" : 306, "pos" : 124 }, + {"sofa" : 1, "begin" : 307, "end" : 310, "pos" : 126 }, + {"sofa" : 1, "begin" : 311, "end" : 317, "pos" : 128 }, + {"sofa" : 1, "begin" : 318, "end" : 320, "pos" : 130 }, + {"sofa" : 1, "begin" : 321, "end" : 328, "pos" : 132 }, + {"sofa" : 1, "begin" : 329, "end" : 330, "pos" : 134 }, + {"sofa" : 1, "begin" : 331, "end" : 339, "pos" : 136 }, + {"sofa" : 1, "begin" : 340, "end" : 342, "pos" : 138 }, + {"sofa" : 1, "begin" : 343, "end" : 351, "pos" : 140 }, + {"sofa" : 1, "begin" : 352, "end" : 356, "pos" : 142 }, + {"sofa" : 1, "begin" : 357, "end" : 360, "pos" : 144 }, + {"sofa" : 1, "begin" : 361, "end" : 365, "pos" : 146 }, + {"sofa" : 1, "begin" : 366, "end" : 370, "pos" : 148 }, + {"sofa" : 1, "begin" : 371, "end" : 372, "pos" : 150 }, + {"sofa" : 1, "begin" : 374, "end" : 377, "pos" : 165 }, + {"sofa" : 1, "begin" : 378, "end" : 386, "pos" : 167 }, + {"sofa" : 1, "begin" : 387, "end" : 393, "pos" : 169 }, + {"sofa" : 1, "begin" : 394, "end" : 404, "pos" : 171 }, + {"sofa" : 1, "begin" : 405, "end" : 412, "pos" : 173 }, + {"sofa" : 1, "begin" : 413, "end" : 416, "pos" : 175 }, + {"sofa" : 1, "begin" : 417, "end" : 425, "pos" : 177 }, + {"sofa" : 1, "begin" : 426, "end" : 429, "pos" : 179 }, + {"sofa" : 1, "begin" : 430, "end" : 434, "pos" : 181 }, + {"sofa" : 1, "begin" : 435, "end" : 441, "pos" : 183 }, + {"sofa" : 1, "begin" : 442, "end" : 444, "pos" : 185 }, + {"sofa" : 1, "begin" : 445, "end" : 448, "pos" : 187 }, + {"sofa" : 1, "begin" : 449, "end" : 459, "pos" : 189 }, + {"sofa" : 1, "begin" : 460, "end" : 462, "pos" : 191 }, + {"sofa" : 1, "begin" : 463, "end" : 470, "pos" : 193 }, + {"sofa" : 1, "begin" : 471, "end" : 473, "pos" : 195 }, + {"sofa" : 1, "begin" : 474, "end" : 482, "pos" : 197 }, + {"sofa" : 1, "begin" : 483, "end" : 486, "pos" : 199 }, + {"sofa" : 1, "begin" : 487, "end" : 490, "pos" : 201 }, + {"sofa" : 1, "begin" : 491, "end" : 497, "pos" : 203 }, + {"sofa" : 1, "begin" : 498, "end" : 506, "pos" : 205 }, + {"sofa" : 1, "begin" : 507, "end" : 509, "pos" : 207 }, + {"sofa" : 1, "begin" : 510, "end" : 513, "pos" : 209 }, + {"sofa" : 1, "begin" : 514, "end" : 521, "pos" : 211 }, + {"sofa" : 1, "begin" : 522, "end" : 527, "pos" : 213 }, + {"sofa" : 1, "begin" : 528, "end" : 534, "pos" : 215 }, + {"sofa" : 1, "begin" : 535, "end" : 539, "pos" : 217 }, + {"sofa" : 1, "begin" : 540, "end" : 548, "pos" : 219 }, + {"sofa" : 1, "begin" : 549, "end" : 550, "pos" : 221 } ], "Chunk" : [ - {"sofa" : 6, "end" : 10, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 11, "end" : 13, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 14, "end" : 23, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 24, "end" : 50, "chunkValue" : "VP" }, - {"sofa" : 6, "begin" : 51, "end" : 69, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 70, "end" : 72, "chunkValue" : "SBAR" }, - {"sofa" : 6, "begin" : 73, "end" : 86, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 87, "end" : 90, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 91, "end" : 100, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 103, "end" : 106, "chunkValue" : "ADJP" }, - {"sofa" : 6, "begin" : 107, "end" : 110, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 111, "end" : 118, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 119, "end" : 127, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 130, "end" : 142, "chunkValue" : "VP" }, - {"sofa" : 6, "begin" : 143, "end" : 168, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 169, "end" : 173, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 174, "end" : 189, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 190, "end" : 213, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 228, "end" : 230, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 231, "end" : 244, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 245, "end" : 257, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 258, "end" : 280, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 281, "end" : 283, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 284, "end" : 306, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 307, "end" : 328, "chunkValue" : "VP" }, - {"sofa" : 6, "begin" : 329, "end" : 339, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 340, "end" : 342, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 343, "end" : 351, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 352, "end" : 356, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 357, "end" : 370, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 378, "end" : 386, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 387, "end" : 393, "chunkValue" : "VP" }, - {"sofa" : 6, "begin" : 394, "end" : 412, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 413, "end" : 416, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 417, "end" : 425, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 426, "end" : 441, "chunkValue" : "VP" }, - {"sofa" : 6, "begin" : 442, "end" : 444, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 445, "end" : 459, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 460, "end" : 470, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 471, "end" : 482, "chunkValue" : "VP" }, - {"sofa" : 6, "begin" : 483, "end" : 506, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 507, "end" : 509, "chunkValue" : "PP" }, - {"sofa" : 6, "begin" : 510, "end" : 534, "chunkValue" : "NP" }, - {"sofa" : 6, "begin" : 535, "end" : 548, "chunkValue" : "NP" } ] } }, + {"sofa" : 1, "end" : 10, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 11, "end" : 13, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 14, "end" : 23, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 24, "end" : 50, "chunkValue" : "VP" }, + {"sofa" : 1, "begin" : 51, "end" : 69, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 70, "end" : 72, "chunkValue" : "SBAR" }, + {"sofa" : 1, "begin" : 73, "end" : 86, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 87, "end" : 90, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 91, "end" : 100, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 103, "end" : 106, "chunkValue" : "ADJP" }, + {"sofa" : 1, "begin" : 107, "end" : 110, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 111, "end" : 118, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 119, "end" : 127, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 130, "end" : 142, "chunkValue" : "VP" }, + {"sofa" : 1, "begin" : 143, "end" : 168, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 169, "end" : 173, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 174, "end" : 189, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 190, "end" : 213, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 228, "end" : 230, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 231, "end" : 244, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 245, "end" : 257, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 258, "end" : 280, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 281, "end" : 283, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 284, "end" : 306, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 307, "end" : 328, "chunkValue" : "VP" }, + {"sofa" : 1, "begin" : 329, "end" : 339, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 340, "end" : 342, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 343, "end" : 351, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 352, "end" : 356, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 357, "end" : 370, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 378, "end" : 386, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 387, "end" : 393, "chunkValue" : "VP" }, + {"sofa" : 1, "begin" : 394, "end" : 412, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 413, "end" : 416, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 417, "end" : 425, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 426, "end" : 441, "chunkValue" : "VP" }, + {"sofa" : 1, "begin" : 442, "end" : 444, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 445, "end" : 459, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 460, "end" : 470, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 471, "end" : 482, "chunkValue" : "VP" }, + {"sofa" : 1, "begin" : 483, "end" : 506, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 507, "end" : 509, "chunkValue" : "PP" }, + {"sofa" : 1, "begin" : 510, "end" : 534, "chunkValue" : "NP" }, + {"sofa" : 1, "begin" : 535, "end" : 548, "chunkValue" : "NP" } ] } }, "_referenced_fss" : { - "6" : {"_type" : "Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text", "sofaString" : "Confidence in the pound is widely expected to take another sharp dive if trade figures for September , due for release tomorrow , fail to show a substantial improvement from July and August 's near-record deficits . \nChancellor of the Exchequer Nigel Lawson 's restated commitment to a firm monetary policy has helped to prevent a freefall in sterling over the past week . \nBut analysts reckon underlying support for sterling has been eroded by the chancellor 's failure to announce any new policy measures in his Mansion House speech last Thursday . \n" }, - "36" : {"_type" : "POS", "sofa" : 6, "end" : 10, "PosValue" : "NN" }, - "54" : {"_type" : "POS", "sofa" : 6, "begin" : 11, "end" : 13, "PosValue" : "IN" }, - "72" : {"_type" : "POS", "sofa" : 6, "begin" : 14, "end" : 17, "PosValue" : "DT" }, - "90" : {"_type" : "POS", "sofa" : 6, "begin" : 18, "end" : 23, "PosValue" : "NN" }, - "108" : {"_type" : "POS", "sofa" : 6, "begin" : 24, "end" : 26, "PosValue" : "VBZ" }, - "126" : {"_type" : "POS", "sofa" : 6, "begin" : 27, "end" : 33, "PosValue" : "RB" }, - "144" : {"_type" : "POS", "sofa" : 6, "begin" : 34, "end" : 42, "PosValue" : "VBN" }, - "162" : {"_type" : "POS", "sofa" : 6, "begin" : 43, "end" : 45, "PosValue" : "TO" }, - "180" : {"_type" : "POS", "sofa" : 6, "begin" : 46, "end" : 50, "PosValue" : "VB" }, - "198" : {"_type" : "POS", "sofa" : 6, "begin" : 51, "end" : 58, "PosValue" : "DT" }, - "216" : {"_type" : "POS", "sofa" : 6, "begin" : 59, "end" : 64, "PosValue" : "JJ" }, - "234" : {"_type" : "POS", "sofa" : 6, "begin" : 65, "end" : 69, "PosValue" : "NN" }, - "252" : {"_type" : "POS", "sofa" : 6, "begin" : 70, "end" : 72, "PosValue" : "IN" }, - "270" : {"_type" : "POS", "sofa" : 6, "begin" : 73, "end" : 78, "PosValue" : "NN" }, - "288" : {"_type" : "POS", "sofa" : 6, "begin" : 79, "end" : 86, "PosValue" : "NNS" }, - "306" : {"_type" : "POS", "sofa" : 6, "begin" : 87, "end" : 90, "PosValue" : "IN" }, - "324" : {"_type" : "POS", "sofa" : 6, "begin" : 91, "end" : 100, "PosValue" : "NNP" }, - "342" : {"_type" : "POS", "sofa" : 6, "begin" : 101, "end" : 102, "PosValue" : "," }, - "360" : {"_type" : "POS", "sofa" : 6, "begin" : 103, "end" : 106, "PosValue" : "JJ" }, - "378" : {"_type" : "POS", "sofa" : 6, "begin" : 107, "end" : 110, "PosValue" : "IN" }, - "396" : {"_type" : "POS", "sofa" : 6, "begin" : 111, "end" : 118, "PosValue" : "NN" }, - "414" : {"_type" : "POS", "sofa" : 6, "begin" : 119, "end" : 127, "PosValue" : "NN" }, - "432" : {"_type" : "POS", "sofa" : 6, "begin" : 128, "end" : 129, "PosValue" : "," }, - "450" : {"_type" : "POS", "sofa" : 6, "begin" : 130, "end" : 134, "PosValue" : "VB" }, - "468" : {"_type" : "POS", "sofa" : 6, "begin" : 135, "end" : 137, "PosValue" : "TO" }, - "486" : {"_type" : "POS", "sofa" : 6, "begin" : 138, "end" : 142, "PosValue" : "VB" }, - "504" : {"_type" : "POS", "sofa" : 6, "begin" : 143, "end" : 144, "PosValue" : "DT" }, - "522" : {"_type" : "POS", "sofa" : 6, "begin" : 145, "end" : 156, "PosValue" : "JJ" }, - "540" : {"_type" : "POS", "sofa" : 6, "begin" : 157, "end" : 168, "PosValue" : "NN" }, - "558" : {"_type" : "POS", "sofa" : 6, "begin" : 169, "end" : 173, "PosValue" : "IN" }, - "576" : {"_type" : "POS", "sofa" : 6, "begin" : 174, "end" : 178, "PosValue" : "NNP" }, - "594" : {"_type" : "POS", "sofa" : 6, "begin" : 179, "end" : 182, "PosValue" : "CC" }, - "612" : {"_type" : "POS", "sofa" : 6, "begin" : 183, "end" : 189, "PosValue" : "NNP" }, - "630" : {"_type" : "POS", "sofa" : 6, "begin" : 190, "end" : 192, "PosValue" : "POS" }, - "648" : {"_type" : "POS", "sofa" : 6, "begin" : 193, "end" : 204, "PosValue" : "JJ" }, - "666" : {"_type" : "POS", "sofa" : 6, "begin" : 205, "end" : 213, "PosValue" : "NNS" }, - "684" : {"_type" : "POS", "sofa" : 6, "begin" : 214, "end" : 215, "PosValue" : "." }, - "797" : {"_type" : "POS", "sofa" : 6, "begin" : 217, "end" : 227, "PosValue" : "NNP" }, - "815" : {"_type" : "POS", "sofa" : 6, "begin" : 228, "end" : 230, "PosValue" : "IN" }, - "833" : {"_type" : "POS", "sofa" : 6, "begin" : 231, "end" : 234, "PosValue" : "DT" }, - "851" : {"_type" : "POS", "sofa" : 6, "begin" : 235, "end" : 244, "PosValue" : "NNP" }, - "869" : {"_type" : "POS", "sofa" : 6, "begin" : 245, "end" : 250, "PosValue" : "NNP" }, - "887" : {"_type" : "POS", "sofa" : 6, "begin" : 251, "end" : 257, "PosValue" : "NNP" }, - "905" : {"_type" : "POS", "sofa" : 6, "begin" : 258, "end" : 260, "PosValue" : "POS" }, - "923" : {"_type" : "POS", "sofa" : 6, "begin" : 261, "end" : 269, "PosValue" : "VBN" }, - "941" : {"_type" : "POS", "sofa" : 6, "begin" : 270, "end" : 280, "PosValue" : "NN" }, - "959" : {"_type" : "POS", "sofa" : 6, "begin" : 281, "end" : 283, "PosValue" : "TO" }, - "977" : {"_type" : "POS", "sofa" : 6, "begin" : 284, "end" : 285, "PosValue" : "DT" }, - "995" : {"_type" : "POS", "sofa" : 6, "begin" : 286, "end" : 290, "PosValue" : "NN" }, - "1013" : {"_type" : "POS", "sofa" : 6, "begin" : 291, "end" : 299, "PosValue" : "JJ" }, - "1031" : {"_type" : "POS", "sofa" : 6, "begin" : 300, "end" : 306, "PosValue" : "NN" }, - "1049" : {"_type" : "POS", "sofa" : 6, "begin" : 307, "end" : 310, "PosValue" : "VBZ" }, - "1067" : {"_type" : "POS", "sofa" : 6, "begin" : 311, "end" : 317, "PosValue" : "VBN" }, - "1085" : {"_type" : "POS", "sofa" : 6, "begin" : 318, "end" : 320, "PosValue" : "TO" }, - "1103" : {"_type" : "POS", "sofa" : 6, "begin" : 321, "end" : 328, "PosValue" : "VB" }, - "1121" : {"_type" : "POS", "sofa" : 6, "begin" : 329, "end" : 330, "PosValue" : "DT" }, - "1139" : {"_type" : "POS", "sofa" : 6, "begin" : 331, "end" : 339, "PosValue" : "NN" }, - "1157" : {"_type" : "POS", "sofa" : 6, "begin" : 340, "end" : 342, "PosValue" : "IN" }, - "1175" : {"_type" : "POS", "sofa" : 6, "begin" : 343, "end" : 351, "PosValue" : "NN" }, - "1193" : {"_type" : "POS", "sofa" : 6, "begin" : 352, "end" : 356, "PosValue" : "IN" }, - "1211" : {"_type" : "POS", "sofa" : 6, "begin" : 357, "end" : 360, "PosValue" : "DT" }, - "1229" : {"_type" : "POS", "sofa" : 6, "begin" : 361, "end" : 365, "PosValue" : "JJ" }, - "1247" : {"_type" : "POS", "sofa" : 6, "begin" : 366, "end" : 370, "PosValue" : "NN" }, - "1265" : {"_type" : "POS", "sofa" : 6, "begin" : 371, "end" : 372, "PosValue" : "." }, - "1348" : {"_type" : "POS", "sofa" : 6, "begin" : 374, "end" : 377, "PosValue" : "CC" }, - "1366" : {"_type" : "POS", "sofa" : 6, "begin" : 378, "end" : 386, "PosValue" : "NNS" }, - "1384" : {"_type" : "POS", "sofa" : 6, "begin" : 387, "end" : 393, "PosValue" : "VBP" }, - "1402" : {"_type" : "POS", "sofa" : 6, "begin" : 394, "end" : 404, "PosValue" : "VBG" }, - "1420" : {"_type" : "POS", "sofa" : 6, "begin" : 405, "end" : 412, "PosValue" : "NN" }, - "1438" : {"_type" : "POS", "sofa" : 6, "begin" : 413, "end" : 416, "PosValue" : "IN" }, - "1456" : {"_type" : "POS", "sofa" : 6, "begin" : 417, "end" : 425, "PosValue" : "NN" }, - "1474" : {"_type" : "POS", "sofa" : 6, "begin" : 426, "end" : 429, "PosValue" : "VBZ" }, - "1492" : {"_type" : "POS", "sofa" : 6, "begin" : 430, "end" : 434, "PosValue" : "VBN" }, - "1510" : {"_type" : "POS", "sofa" : 6, "begin" : 435, "end" : 441, "PosValue" : "VBN" }, - "1528" : {"_type" : "POS", "sofa" : 6, "begin" : 442, "end" : 444, "PosValue" : "IN" }, - "1546" : {"_type" : "POS", "sofa" : 6, "begin" : 445, "end" : 448, "PosValue" : "DT" }, - "1564" : {"_type" : "POS", "sofa" : 6, "begin" : 449, "end" : 459, "PosValue" : "NN" }, - "1582" : {"_type" : "POS", "sofa" : 6, "begin" : 460, "end" : 462, "PosValue" : "POS" }, - "1600" : {"_type" : "POS", "sofa" : 6, "begin" : 463, "end" : 470, "PosValue" : "NN" }, - "1618" : {"_type" : "POS", "sofa" : 6, "begin" : 471, "end" : 473, "PosValue" : "TO" }, - "1636" : {"_type" : "POS", "sofa" : 6, "begin" : 474, "end" : 482, "PosValue" : "VB" }, - "1654" : {"_type" : "POS", "sofa" : 6, "begin" : 483, "end" : 486, "PosValue" : "DT" }, - "1672" : {"_type" : "POS", "sofa" : 6, "begin" : 487, "end" : 490, "PosValue" : "JJ" }, - "1690" : {"_type" : "POS", "sofa" : 6, "begin" : 491, "end" : 497, "PosValue" : "NN" }, - "1708" : {"_type" : "POS", "sofa" : 6, "begin" : 498, "end" : 506, "PosValue" : "NNS" }, - "1726" : {"_type" : "POS", "sofa" : 6, "begin" : 507, "end" : 509, "PosValue" : "IN" }, - "1744" : {"_type" : "POS", "sofa" : 6, "begin" : 510, "end" : 513, "PosValue" : "PRP$" }, - "1762" : {"_type" : "POS", "sofa" : 6, "begin" : 514, "end" : 521, "PosValue" : "NNP" }, - "1780" : {"_type" : "POS", "sofa" : 6, "begin" : 522, "end" : 527, "PosValue" : "NNP" }, - "1798" : {"_type" : "POS", "sofa" : 6, "begin" : 528, "end" : 534, "PosValue" : "NN" }, - "1816" : {"_type" : "POS", "sofa" : 6, "begin" : 535, "end" : 539, "PosValue" : "JJ" }, - "1834" : {"_type" : "POS", "sofa" : 6, "begin" : 540, "end" : 548, "PosValue" : "NNP" }, - "1852" : {"_type" : "POS", "sofa" : 6, "begin" : 549, "end" : 550, "PosValue" : "." } } } \ No newline at end of file + "1" : {"_type" : "Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text", "sofaString" : "Confidence in the pound is widely expected to take another sharp dive if trade figures for September , due for release tomorrow , fail to show a substantial improvement from July and August 's near-record deficits . \nChancellor of the Exchequer Nigel Lawson 's restated commitment to a firm monetary policy has helped to prevent a freefall in sterling over the past week . \nBut analysts reckon underlying support for sterling has been eroded by the chancellor 's failure to announce any new policy measures in his Mansion House speech last Thursday . \n" }, + "5" : {"_type" : "POS", "sofa" : 1, "end" : 10, "PosValue" : "NN" }, + "7" : {"_type" : "POS", "sofa" : 1, "begin" : 11, "end" : 13, "PosValue" : "IN" }, + "9" : {"_type" : "POS", "sofa" : 1, "begin" : 14, "end" : 17, "PosValue" : "DT" }, + "11" : {"_type" : "POS", "sofa" : 1, "begin" : 18, "end" : 23, "PosValue" : "NN" }, + "13" : {"_type" : "POS", "sofa" : 1, "begin" : 24, "end" : 26, "PosValue" : "VBZ" }, + "15" : {"_type" : "POS", "sofa" : 1, "begin" : 27, "end" : 33, "PosValue" : "RB" }, + "17" : {"_type" : "POS", "sofa" : 1, "begin" : 34, "end" : 42, "PosValue" : "VBN" }, + "19" : {"_type" : "POS", "sofa" : 1, "begin" : 43, "end" : 45, "PosValue" : "TO" }, + "21" : {"_type" : "POS", "sofa" : 1, "begin" : 46, "end" : 50, "PosValue" : "VB" }, + "23" : {"_type" : "POS", "sofa" : 1, "begin" : 51, "end" : 58, "PosValue" : "DT" }, + "25" : {"_type" : "POS", "sofa" : 1, "begin" : 59, "end" : 64, "PosValue" : "JJ" }, + "27" : {"_type" : "POS", "sofa" : 1, "begin" : 65, "end" : 69, "PosValue" : "NN" }, + "29" : {"_type" : "POS", "sofa" : 1, "begin" : 70, "end" : 72, "PosValue" : "IN" }, + "31" : {"_type" : "POS", "sofa" : 1, "begin" : 73, "end" : 78, "PosValue" : "NN" }, + "33" : {"_type" : "POS", "sofa" : 1, "begin" : 79, "end" : 86, "PosValue" : "NNS" }, + "35" : {"_type" : "POS", "sofa" : 1, "begin" : 87, "end" : 90, "PosValue" : "IN" }, + "37" : {"_type" : "POS", "sofa" : 1, "begin" : 91, "end" : 100, "PosValue" : "NNP" }, + "39" : {"_type" : "POS", "sofa" : 1, "begin" : 101, "end" : 102, "PosValue" : "," }, + "41" : {"_type" : "POS", "sofa" : 1, "begin" : 103, "end" : 106, "PosValue" : "JJ" }, + "43" : {"_type" : "POS", "sofa" : 1, "begin" : 107, "end" : 110, "PosValue" : "IN" }, + "45" : {"_type" : "POS", "sofa" : 1, "begin" : 111, "end" : 118, "PosValue" : "NN" }, + "47" : {"_type" : "POS", "sofa" : 1, "begin" : 119, "end" : 127, "PosValue" : "NN" }, + "49" : {"_type" : "POS", "sofa" : 1, "begin" : 128, "end" : 129, "PosValue" : "," }, + "51" : {"_type" : "POS", "sofa" : 1, "begin" : 130, "end" : 134, "PosValue" : "VB" }, + "53" : {"_type" : "POS", "sofa" : 1, "begin" : 135, "end" : 137, "PosValue" : "TO" }, + "55" : {"_type" : "POS", "sofa" : 1, "begin" : 138, "end" : 142, "PosValue" : "VB" }, + "57" : {"_type" : "POS", "sofa" : 1, "begin" : 143, "end" : 144, "PosValue" : "DT" }, + "59" : {"_type" : "POS", "sofa" : 1, "begin" : 145, "end" : 156, "PosValue" : "JJ" }, + "61" : {"_type" : "POS", "sofa" : 1, "begin" : 157, "end" : 168, "PosValue" : "NN" }, + "63" : {"_type" : "POS", "sofa" : 1, "begin" : 169, "end" : 173, "PosValue" : "IN" }, + "65" : {"_type" : "POS", "sofa" : 1, "begin" : 174, "end" : 178, "PosValue" : "NNP" }, + "67" : {"_type" : "POS", "sofa" : 1, "begin" : 179, "end" : 182, "PosValue" : "CC" }, + "69" : {"_type" : "POS", "sofa" : 1, "begin" : 183, "end" : 189, "PosValue" : "NNP" }, + "71" : {"_type" : "POS", "sofa" : 1, "begin" : 190, "end" : 192, "PosValue" : "POS" }, + "73" : {"_type" : "POS", "sofa" : 1, "begin" : 193, "end" : 204, "PosValue" : "JJ" }, + "75" : {"_type" : "POS", "sofa" : 1, "begin" : 205, "end" : 213, "PosValue" : "NNS" }, + "77" : {"_type" : "POS", "sofa" : 1, "begin" : 214, "end" : 215, "PosValue" : "." }, + "98" : {"_type" : "POS", "sofa" : 1, "begin" : 217, "end" : 227, "PosValue" : "NNP" }, + "100" : {"_type" : "POS", "sofa" : 1, "begin" : 228, "end" : 230, "PosValue" : "IN" }, + "102" : {"_type" : "POS", "sofa" : 1, "begin" : 231, "end" : 234, "PosValue" : "DT" }, + "104" : {"_type" : "POS", "sofa" : 1, "begin" : 235, "end" : 244, "PosValue" : "NNP" }, + "106" : {"_type" : "POS", "sofa" : 1, "begin" : 245, "end" : 250, "PosValue" : "NNP" }, + "108" : {"_type" : "POS", "sofa" : 1, "begin" : 251, "end" : 257, "PosValue" : "NNP" }, + "110" : {"_type" : "POS", "sofa" : 1, "begin" : 258, "end" : 260, "PosValue" : "POS" }, + "112" : {"_type" : "POS", "sofa" : 1, "begin" : 261, "end" : 269, "PosValue" : "VBN" }, + "114" : {"_type" : "POS", "sofa" : 1, "begin" : 270, "end" : 280, "PosValue" : "NN" }, + "116" : {"_type" : "POS", "sofa" : 1, "begin" : 281, "end" : 283, "PosValue" : "TO" }, + "118" : {"_type" : "POS", "sofa" : 1, "begin" : 284, "end" : 285, "PosValue" : "DT" }, + "120" : {"_type" : "POS", "sofa" : 1, "begin" : 286, "end" : 290, "PosValue" : "NN" }, + "122" : {"_type" : "POS", "sofa" : 1, "begin" : 291, "end" : 299, "PosValue" : "JJ" }, + "124" : {"_type" : "POS", "sofa" : 1, "begin" : 300, "end" : 306, "PosValue" : "NN" }, + "126" : {"_type" : "POS", "sofa" : 1, "begin" : 307, "end" : 310, "PosValue" : "VBZ" }, + "128" : {"_type" : "POS", "sofa" : 1, "begin" : 311, "end" : 317, "PosValue" : "VBN" }, + "130" : {"_type" : "POS", "sofa" : 1, "begin" : 318, "end" : 320, "PosValue" : "TO" }, + "132" : {"_type" : "POS", "sofa" : 1, "begin" : 321, "end" : 328, "PosValue" : "VB" }, + "134" : {"_type" : "POS", "sofa" : 1, "begin" : 329, "end" : 330, "PosValue" : "DT" }, + "136" : {"_type" : "POS", "sofa" : 1, "begin" : 331, "end" : 339, "PosValue" : "NN" }, + "138" : {"_type" : "POS", "sofa" : 1, "begin" : 340, "end" : 342, "PosValue" : "IN" }, + "140" : {"_type" : "POS", "sofa" : 1, "begin" : 343, "end" : 351, "PosValue" : "NN" }, + "142" : {"_type" : "POS", "sofa" : 1, "begin" : 352, "end" : 356, "PosValue" : "IN" }, + "144" : {"_type" : "POS", "sofa" : 1, "begin" : 357, "end" : 360, "PosValue" : "DT" }, + "146" : {"_type" : "POS", "sofa" : 1, "begin" : 361, "end" : 365, "PosValue" : "JJ" }, + "148" : {"_type" : "POS", "sofa" : 1, "begin" : 366, "end" : 370, "PosValue" : "NN" }, + "150" : {"_type" : "POS", "sofa" : 1, "begin" : 371, "end" : 372, "PosValue" : "." }, + "165" : {"_type" : "POS", "sofa" : 1, "begin" : 374, "end" : 377, "PosValue" : "CC" }, + "167" : {"_type" : "POS", "sofa" : 1, "begin" : 378, "end" : 386, "PosValue" : "NNS" }, + "169" : {"_type" : "POS", "sofa" : 1, "begin" : 387, "end" : 393, "PosValue" : "VBP" }, + "171" : {"_type" : "POS", "sofa" : 1, "begin" : 394, "end" : 404, "PosValue" : "VBG" }, + "173" : {"_type" : "POS", "sofa" : 1, "begin" : 405, "end" : 412, "PosValue" : "NN" }, + "175" : {"_type" : "POS", "sofa" : 1, "begin" : 413, "end" : 416, "PosValue" : "IN" }, + "177" : {"_type" : "POS", "sofa" : 1, "begin" : 417, "end" : 425, "PosValue" : "NN" }, + "179" : {"_type" : "POS", "sofa" : 1, "begin" : 426, "end" : 429, "PosValue" : "VBZ" }, + "181" : {"_type" : "POS", "sofa" : 1, "begin" : 430, "end" : 434, "PosValue" : "VBN" }, + "183" : {"_type" : "POS", "sofa" : 1, "begin" : 435, "end" : 441, "PosValue" : "VBN" }, + "185" : {"_type" : "POS", "sofa" : 1, "begin" : 442, "end" : 444, "PosValue" : "IN" }, + "187" : {"_type" : "POS", "sofa" : 1, "begin" : 445, "end" : 448, "PosValue" : "DT" }, + "189" : {"_type" : "POS", "sofa" : 1, "begin" : 449, "end" : 459, "PosValue" : "NN" }, + "191" : {"_type" : "POS", "sofa" : 1, "begin" : 460, "end" : 462, "PosValue" : "POS" }, + "193" : {"_type" : "POS", "sofa" : 1, "begin" : 463, "end" : 470, "PosValue" : "NN" }, + "195" : {"_type" : "POS", "sofa" : 1, "begin" : 471, "end" : 473, "PosValue" : "TO" }, + "197" : {"_type" : "POS", "sofa" : 1, "begin" : 474, "end" : 482, "PosValue" : "VB" }, + "199" : {"_type" : "POS", "sofa" : 1, "begin" : 483, "end" : 486, "PosValue" : "DT" }, + "201" : {"_type" : "POS", "sofa" : 1, "begin" : 487, "end" : 490, "PosValue" : "JJ" }, + "203" : {"_type" : "POS", "sofa" : 1, "begin" : 491, "end" : 497, "PosValue" : "NN" }, + "205" : {"_type" : "POS", "sofa" : 1, "begin" : 498, "end" : 506, "PosValue" : "NNS" }, + "207" : {"_type" : "POS", "sofa" : 1, "begin" : 507, "end" : 509, "PosValue" : "IN" }, + "209" : {"_type" : "POS", "sofa" : 1, "begin" : 510, "end" : 513, "PosValue" : "PRP$" }, + "211" : {"_type" : "POS", "sofa" : 1, "begin" : 514, "end" : 521, "PosValue" : "NNP" }, + "213" : {"_type" : "POS", "sofa" : 1, "begin" : 522, "end" : 527, "PosValue" : "NNP" }, + "215" : {"_type" : "POS", "sofa" : 1, "begin" : 528, "end" : 534, "PosValue" : "NN" }, + "217" : {"_type" : "POS", "sofa" : 1, "begin" : 535, "end" : 539, "PosValue" : "JJ" }, + "219" : {"_type" : "POS", "sofa" : 1, "begin" : 540, "end" : 548, "PosValue" : "NNP" }, + "221" : {"_type" : "POS", "sofa" : 1, "begin" : 549, "end" : 550, "PosValue" : "." } } } \ No newline at end of file diff --git a/dkpro-core-io-json-asl/src/test/resources/log4j.properties b/dkpro-core-io-json-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-json-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-json-asl/src/test/resources/log4j2.xml b/dkpro-core-io-json-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-json-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-jwpl-asl/pom.xml b/dkpro-core-io-jwpl-asl/pom.xml index 2f33a5cee9..422fdd12ab 100644 --- a/dkpro-core-io-jwpl-asl/pom.xml +++ b/dkpro-core-io-jwpl-asl/pom.xml @@ -1,97 +1,102 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.io.jwpl-asl - jar - DKPro Core ASL - IO - Wikipedia via JWPL - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimafit-core - - - org.apache.commons - commons-lang3 - - - de.tudarmstadt.ukp.wikipedia - de.tudarmstadt.ukp.wikipedia.revisionmachine - - - de.tudarmstadt.ukp.wikipedia - de.tudarmstadt.ukp.wikipedia.util - - - de.tudarmstadt.ukp.wikipedia - de.tudarmstadt.ukp.wikipedia.api - - - de.tudarmstadt.ukp.wikipedia - de.tudarmstadt.ukp.wikipedia.parser - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl - - - junit - junit - test - - - - - - de.tudarmstadt.ukp.wikipedia - de.tudarmstadt.ukp.wikipedia - 0.9.2 - pom - import - - - - - - - false - src/main/resources - - desc/type/**/* - - - - true - src/main/resources - - desc/type/**/* - - - - + 4.0.0 + + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-io-jwpl-asl + jar + DKPro Core ASL - IO - Wikipedia via JWPL + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.apache.commons + commons-lang3 + + + de.tudarmstadt.ukp.wikipedia + de.tudarmstadt.ukp.wikipedia.revisionmachine + + + de.tudarmstadt.ukp.wikipedia + de.tudarmstadt.ukp.wikipedia.util + + + de.tudarmstadt.ukp.wikipedia + de.tudarmstadt.ukp.wikipedia.api + + + de.tudarmstadt.ukp.wikipedia + de.tudarmstadt.ukp.wikipedia.parser + + + org.dkpro.core + dkpro-core-api-metadata-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + junit + junit + test + + + + + + de.tudarmstadt.ukp.wikipedia + de.tudarmstadt.ukp.wikipedia + 0.9.2 + pom + import + + + + + + + false + src/main/resources + + desc/type/**/* + + + + true + src/main/resources + + desc/type/**/* + + + + diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleInfoReader.java b/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleInfoReader.java deleted file mode 100644 index 4f927d453c..0000000000 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleInfoReader.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import java.io.IOException; -import java.util.Iterator; - -import org.apache.uima.UimaContext; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.ArticleInfo; -import de.tudarmstadt.ukp.wikipedia.api.MetaData; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; -import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.RevisionAPIConfiguration; -import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.RevisionApi; - -/** - * Reads all general article infos without retrieving the whole Page objects - * - * - */ - -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.ArticleInfo"}) - - -public class WikipediaArticleInfoReader extends WikipediaReaderBase -{ - protected long currentArticleIndex; - protected long nrOfArticles; - - protected Iterator idIter; - protected RevisionApi revApi; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException{ - super.initialize(context); - - MetaData md = wiki.getMetaData(); - this.nrOfArticles = md.getNumberOfPages() - md.getNumberOfDisambiguationPages() - md.getNumberOfRedirectPages(); - this.currentArticleIndex = 0; - - RevisionAPIConfiguration revConfig = new RevisionAPIConfiguration(dbconfig); - - try { - revApi = new RevisionApi(revConfig); - } - catch (WikiApiException e) { - throw new ResourceInitializationException(e); - } - - idIter = wiki.getPageIds().iterator(); - } - - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - return idIter.hasNext(); - } - - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - super.getNext(aJCas); - - int id = idIter.next(); - currentArticleIndex++; - - try - { - addDocumentMetaData(aJCas, id); - - ArticleInfo info = new ArticleInfo(aJCas); - info.setAuthors(revApi.getNumberOfUniqueContributors(id)); - info.setRevisions(revApi.getNumberOfRevisions(id)); - info.setFirstAppearance(revApi.getFirstDateOfAppearance(id).getTime()); - info.setLastAppearance(revApi.getLastDateOfAppearance(id).getTime()); - info.addToIndexes(); - } - catch (WikiApiException e) { - //could e.g. happen if no revision is available for this page - getLogger().warn("Unable to fetch next article", e); - } - } - - - @Override - public Progress[] getProgress() - { - return new Progress[] { - new ProgressImpl( - Long.valueOf(currentArticleIndex).intValue(), - Long.valueOf(nrOfArticles).intValue(), - Progress.ENTITIES - ) - }; - } - - private void addDocumentMetaData(JCas jcas, int id) throws WikiApiException { - DocumentMetaData metaData = DocumentMetaData.create(jcas); - metaData.setDocumentTitle(wiki.getTitle(id).toString()); - metaData.setCollectionId(Integer.valueOf(id).toString()); - metaData.setLanguage(dbconfig.getLanguage().toString()); - - } -} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaLinkReader.java b/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaLinkReader.java deleted file mode 100644 index 03da78df00..0000000000 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaLinkReader.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink; -import de.tudarmstadt.ukp.wikipedia.api.Page; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException; -import de.tudarmstadt.ukp.wikipedia.parser.Link; -import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage; - -/** - * Read links from Wikipedia. - */ -@TypeCapability(outputs={ - "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig", - "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink"}) -public class WikipediaLinkReader extends WikipediaStandardReaderBase { - - /** - * Which types of links are allowed? - */ - public static final String PARAM_ALLOWED_LINK_TYPES = "AllowedLinkTypes"; - @ConfigurationParameter(name = PARAM_ALLOWED_LINK_TYPES, mandatory = true) - private String[] allowedLinkTypes; - - @Override - protected boolean isValidPage(Page page) throws WikiTitleParsingException - { - return !page.isDisambiguation() && !page.isDiscussion() && !page.isRedirect(); - } - - @Override - protected String getPlainDocumentText(Page page) { - String text = ""; - ParsedPage pp = parser.parse(page.getText()); - if (pp != null ) { - text = pp.getText(); - } - return text; - } - - @Override - public void getNext(JCas jcas) - throws IOException, CollectionException { - super.getNext(jcas); - - ParsedPage pp = parser.parse(getPage().getText()); - - //Don't do anything if there is no document text - if(jcas.getDocumentText().length()==0){ - return; - } - - //add link annotations - List allowedLinkTypeList = Arrays.asList(this.allowedLinkTypes); - WikipediaLink wikipediaLink; - int begin = 0; - int end = 0; - for(Link link : pp.getLinks()){ - if(allowedLinkTypeList.contains(link.getType().name())){ - //TODO: The begin and end of a link is defined with an absolute position in the raw text. - //But, Wikipedia guidelines claim that the first mention has to be marked - begin = 0; - end = 0; - begin = jcas.getDocumentText().indexOf(link.getText(), begin); - if(begin == -1){ - begin = jcas.getDocumentText().indexOf(link.getText()); - } - if(begin == -1){ - begin = 0; - } - end = begin + link.getText().length(); - if(end >= jcas.getDocumentText().length()){ - end = begin; - } - wikipediaLink = new WikipediaLink(jcas); - wikipediaLink.setBegin(0); - wikipediaLink.setEnd(1); - wikipediaLink.setLinkType(link.getType().name()); - wikipediaLink.setTarget(link.getTarget()); - wikipediaLink.setAnchor(link.getText()); - wikipediaLink.addToIndexes(); - } - } - } -} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaQueryReader.java b/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaQueryReader.java deleted file mode 100644 index f8448f9ba4..0000000000 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaQueryReader.java +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import org.apache.uima.UimaContext; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Level; - -import de.tudarmstadt.ukp.wikipedia.api.PageQuery; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; - -/** - * Reads all article pages that match a query created by the numerous parameters of this class. - */ -public class WikipediaQueryReader - extends WikipediaArticleReader -{ - - /** - * Maximum number of categories. - * Articles with a higher number of categories will not be returned by the query. - */ - public static final String PARAM_MAX_CATEGORIES = "MaxCategories"; - @ConfigurationParameter(name = PARAM_MAX_CATEGORIES, mandatory=false, defaultValue="-1") - private int maxCategories; - - /** - * Minimum number of categories. - * Articles with a lower number of categories will not be returned by the query. - */ - public static final String PARAM_MIN_CATEGORIES = "MinCategories"; - @ConfigurationParameter(name = PARAM_MIN_CATEGORIES, mandatory=false, defaultValue="-1") - private int minCategories; - - /** - * Maximum number of incoming links. - * Articles with a higher number of incoming links will not be returned by the query. - */ - public static final String PARAM_MAX_INLINKS = "MaxInlinks"; - @ConfigurationParameter(name = PARAM_MAX_INLINKS, mandatory=false, defaultValue="-1") - private int maxInlinks; - - /** - * Minimum number of incoming links. - * Articles with a lower number of incoming links will not be returned by the query. - */ - public static final String PARAM_MIN_INLINKS = "MinInlinks"; - @ConfigurationParameter(name = PARAM_MIN_INLINKS, mandatory=false, defaultValue="-1") - private int minInlinks; - - /** - * Maximum number of outgoing links. - * Articles with a higher number of outgoing links will not be returned by the query. - */ - public static final String PARAM_MAX_OUTLINKS = "MaxOutlinks"; - @ConfigurationParameter(name = PARAM_MAX_OUTLINKS, mandatory=false, defaultValue="-1") - private int maxOutlinks; - - /** - * Minimum number of outgoing links. - * Articles with a lower number of outgoing links will not be returned by the query. - */ - public static final String PARAM_MIN_OUTLINKS = "MinOutlinks"; - @ConfigurationParameter(name = PARAM_MIN_OUTLINKS, mandatory=false, defaultValue="-1") - private int minOutlinks; - - /** - * Maximum number of redirects. - * Articles with a higher number of redirects will not be returned by the query. - */ - public static final String PARAM_MAX_REDIRECTS = "MaxRedirects"; - @ConfigurationParameter(name = PARAM_MAX_REDIRECTS, mandatory=false, defaultValue="-1") - private int maxRedirects; - - /** - * Minimum number of redirects. - * Articles with a lower number of redirects will not be returned by the query. - */ - public static final String PARAM_MIN_REDIRECTS = "MinRedirects"; - @ConfigurationParameter(name = PARAM_MIN_REDIRECTS, mandatory=false, defaultValue="-1") - private int minRedirects; - - /** - * Maximum number of tokens. - * Articles with a higher number of tokens will not be returned by the query. - */ - public static final String PARAM_MAX_TOKENS = "MaxTokens"; - @ConfigurationParameter(name = PARAM_MAX_TOKENS, mandatory=false, defaultValue="-1") - private int maxTokens; - - /** - * Minimum number of tokens. - * Articles with a lower number of tokens will not be returned by the query. - */ - public static final String PARAM_MIN_TOKENS = "MinTokens"; - @ConfigurationParameter(name = PARAM_MIN_TOKENS, mandatory=false, defaultValue="-1") - private int minTokens; - - /** - * SQL-style title pattern. - * Only articles that match the pattern will be returned by the query. - */ - public static final String PARAM_TITLE_PATTERN = "TitlePattern"; - @ConfigurationParameter(name = PARAM_TITLE_PATTERN, mandatory=false, defaultValue="") - private String titlePattern; - - - protected boolean queryInitialized = false; // indicates whether a query parameter was used - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - PageQuery query = new PageQuery(); - - if (maxCategories != -1) { - query.setMaxCategories(maxCategories); - queryInitialized = true; - } - - if (minCategories != -1) { - query.setMinCategories(minCategories); - queryInitialized = true; - } - - if (maxInlinks != -1) { - query.setMaxIndegree(maxInlinks); - queryInitialized = true; - } - - if (minInlinks != -1) { - query.setMinIndegree(minInlinks); - queryInitialized = true; - } - - if (maxOutlinks != -1) { - query.setMaxOutdegree(maxOutlinks); - queryInitialized = true; - } - - if (minOutlinks != -1) { - query.setMinOutdegree(minOutlinks); - queryInitialized = true; - } - - if (maxRedirects != -1) { - query.setMaxRedirects(maxRedirects); - queryInitialized = true; - } - - if (minRedirects != -1) { - query.setMinRedirects(minRedirects); - queryInitialized = true; - } - - if (maxTokens != -1) { - query.setMaxTokens(maxTokens); - queryInitialized = true; - } - - if (minTokens != -1) { - query.setMinTokens(minTokens); - queryInitialized = true; - } - - if (!titlePattern.equals("")) { - query.setTitlePattern(titlePattern); - queryInitialized = true; - } - - this.getLogger().log(Level.INFO, query.getQueryInfo()); - - // if a query was initialized, overwrite the page iterator - if (queryInitialized) { - try { - pageIter = wiki.getPages(query).iterator(); - } - catch (WikiApiException e) { - throw new ResourceInitializationException(e); - } - - } - } -} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaReaderBase.java b/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaReaderBase.java deleted file mode 100644 index d0c933ac10..0000000000 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaReaderBase.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import java.io.IOException; - -import org.apache.uima.UimaContext; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; - -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig; -import de.tudarmstadt.ukp.wikipedia.api.DatabaseConfiguration; -import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; -import de.tudarmstadt.ukp.wikipedia.api.Wikipedia; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiInitializationException; - -/** - * Abstract base class for all Wikipedia readers. - */ -public abstract class WikipediaReaderBase extends JCasCollectionReader_ImplBase -{ - - /** The host server. */ - public static final String PARAM_HOST = "Host"; - @ConfigurationParameter(name = PARAM_HOST, mandatory=true) - private String host; - - /** The name of the database. */ - public static final String PARAM_DB = "Database"; - @ConfigurationParameter(name = PARAM_DB, mandatory=true) - private String db; - - /** The username of the database account. */ - public static final String PARAM_USER = "User"; - @ConfigurationParameter(name = PARAM_USER, mandatory=true) - private String user; - - /** The password of the database account. */ - public static final String PARAM_PASSWORD = "Password"; - @ConfigurationParameter(name = PARAM_PASSWORD, mandatory=true) - private String password; - - /** The language of the Wikipedia that should be connected to. */ - public static final String PARAM_LANGUAGE = "Language"; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory=true) - private Language language; - - /** Sets whether the database configuration should be stored in the CAS, - * so that annotators down the pipeline can access additional data. */ - public static final String PARAM_CREATE_DATABASE_CONFIG_ANNOTATION = "CreateDBAnno"; - @ConfigurationParameter(name = PARAM_CREATE_DATABASE_CONFIG_ANNOTATION, mandatory=true, defaultValue="false") - private boolean createDbAnno; - - protected DatabaseConfiguration dbconfig; - - protected Wikipedia wiki; - - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - dbconfig = new DatabaseConfiguration( - host, - db, - user, - password, - language - ); - - try { - this.wiki = new Wikipedia(dbconfig); - } - catch (WikiInitializationException e) { - throw new ResourceInitializationException(e); - } - } - - @Override - public void getNext(JCas jcas) throws IOException, CollectionException - { - if(createDbAnno){ - DBConfig dbconfiganno = new DBConfig(jcas); - dbconfiganno.setHost(host); - dbconfiganno.setPassword(password); - dbconfiganno.setDB(db); - dbconfiganno.setUser(user); - dbconfiganno.setLanguage(language.toString()); - dbconfiganno.addToIndexes(); - } - } - - @Override - public abstract Progress[] getProgress(); - -} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReader.java b/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReader.java deleted file mode 100644 index d86367b8fb..0000000000 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReader.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import java.io.IOException; -import java.sql.SQLException; - -import org.apache.commons.lang3.StringEscapeUtils; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; -import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.Revision; - -/** - * Reads Wikipedia page revisions. - */ -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig", - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaRevision"}) - -public class WikipediaRevisionReader extends WikipediaRevisionReaderBase -{ - - @Override - public void getNext(JCas jcas) - throws IOException, CollectionException - { - super.getNext(jcas); - - try { - Revision revision = null; - if(!revisionIds.isEmpty()){ - //in case we iterate over a given list of revisions - String nextId = revIdIterator.next(); - try{ - revision = this.revisionApi.getRevision(Integer.parseInt(nextId)); - }catch(Exception e){ - //in case of lost connection - //TODO should be handled in RevisionAPI - revisionApi.reconnect(); - revision = this.revisionApi.getRevision(Integer.parseInt(nextId)); - } - }else{ - //in case we iterate over ALL revisions - try{ - revision = this.revisionApi.getRevision(currentArticle.getPageId(), timestampIter.next()); - }catch(Exception e){ - //in case of lost connection - //TODO should be handled in RevisionAPI - revisionApi.reconnect(); - revision = this.revisionApi.getRevision(currentArticle.getPageId(), timestampIter.next()); - } - } - - String text = ""; - if (outputPlainText) { - text = WikiUtils.cleanText( - StringEscapeUtils.unescapeHtml4(revision.getRevisionText()) - ); - } - else { - text = revision.getRevisionText(); - } - jcas.setDocumentText(text); - - addDocumentMetaData(jcas, revision.getArticleID(), revision.getRevisionID()); - addRevisionAnnotation(jcas, revision); - } - catch (WikiApiException e) { - throw new CollectionException(e); - } - catch (SQLException e) { - throw new CollectionException(e); - } - } -} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaStandardReaderBase.java b/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaStandardReaderBase.java deleted file mode 100644 index e306c91eba..0000000000 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaStandardReaderBase.java +++ /dev/null @@ -1,284 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import java.io.BufferedReader; -import java.io.DataInputStream; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; - -import org.apache.uima.UimaContext; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils; -import de.tudarmstadt.ukp.wikipedia.api.MetaData; -import de.tudarmstadt.ukp.wikipedia.api.Page; -import de.tudarmstadt.ukp.wikipedia.api.PageIterator; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException; -import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates; -import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; -import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory; - -/** - * Abstract base class for standard Wikipedia readers reading single articles - * instead of revision pairs. - * - * - */ -public abstract class WikipediaStandardReaderBase - extends WikipediaReaderBase -{ - - /** Whether the reader outputs plain text or wiki markup. */ - public static final String PARAM_OUTPUT_PLAIN_TEXT = "OutputPlainText"; - @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true") - protected boolean outputPlainText; - - /** The page buffer size (#pages) of the page iterator. */ - public static final String PARAM_PAGE_BUFFER = "PageBuffer"; - @ConfigurationParameter(name = PARAM_PAGE_BUFFER, mandatory = true, defaultValue = "1000") - protected int pageBuffer; - - /** - * Defines the path to a file containing a line-separated list of - * page ids of the pages that should be retrieved. (Optional) - */ - public static final String PARAM_PATH_TO_PAGE_ID_LIST = "PageIdsFromFile"; - @ConfigurationParameter(name = PARAM_PATH_TO_PAGE_ID_LIST, mandatory = false) - protected String pageIdFile; - - /** - * Defines the path to a file containing a line-separated list of - * page titles of the pages that should be retrieved. (Optional) - */ - public static final String PARAM_PATH_TO_PAGE_TITLE_LIST = "PageTitleFromFile"; - @ConfigurationParameter(name = PARAM_PATH_TO_PAGE_TITLE_LIST, mandatory = false) - protected String pageNameFile; - - /** - * Defines an array of - * page ids of the pages that should be retrieved. (Optional) - */ - public static final String PARAM_PAGE_ID_LIST = "PageIdFromArray"; - @ConfigurationParameter(name = PARAM_PAGE_ID_LIST, mandatory = false) - protected String[] pageIdParamArray; - - /** - * Defines an array of page titles of the pages that should be retrieved. - * (Optional) - */ - public static final String PARAM_PAGE_TITLE_LIST = "PageTitlesFromArray"; - @ConfigurationParameter(name = PARAM_PAGE_TITLE_LIST, mandatory = false) - protected String[] pageNameParamArray; - - private Set pageIds = null; - private Set pageTitles = null; - - protected long currentArticleIndex; - protected long nrOfArticles; - - protected Iterator pageIter; - - private Page page; - - protected MediaWikiParser parser; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - pageIds = new HashSet(); - pageTitles = new HashSet(); - - try { - if (pageIdFile != null) { - pageIds = loadFile(pageIdFile); - } - if (pageNameFile != null) { - pageTitles = loadFile(pageNameFile); - } - if (pageIdParamArray != null && pageIdParamArray.length > 0) { - for(String id: pageIdParamArray){ - pageIds.add(id); - } - } - if (pageNameParamArray != null && pageNameParamArray.length > 0) { - for(String id: pageNameParamArray){ - pageTitles.add(id); - } - } - } - catch (Exception e) { - throw new ResourceInitializationException(e); - } - - //Use one of the lists or iterate over all articles? - if(!pageIds.isEmpty()||!pageTitles.isEmpty()) - { - this.nrOfArticles = pageIds.size()+pageTitles.size(); - pageIter = new PageIterator(wiki, pageIds, pageTitles, pageBuffer); - } - else //use iterator over all pages in the db - { - MetaData md = wiki.getMetaData(); - this.nrOfArticles = md.getNumberOfPages() - - md.getNumberOfDisambiguationPages() - - md.getNumberOfRedirectPages(); - - pageIter = new PageIterator(wiki, true, pageBuffer); - } - - currentArticleIndex = 0; - - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - pf.setTemplateParserClass(FlushTemplates.class); - - parser = pf.createParser(); - } - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - return pageIter.hasNext(); - } - - @Override - public void getNext(JCas jcas) - throws IOException, CollectionException - { - super.getNext(jcas); - - page = pageIter.next(); - currentArticleIndex++; - - try { - getLogger().debug("title: " + page.getTitle()); - - addDocumentMetaData(jcas, page); - - if (!isValidPage(page)) { - jcas.setDocumentText(""); - return; - } - - if (outputPlainText) { - jcas.setDocumentText(WikiUtils - .cleanText(getPlainDocumentText(page))); - } - else { - jcas.setDocumentText(getDocumentText(page)); - } - - } - catch (WikiTitleParsingException e1) { - jcas.setDocumentText(""); - return; - } - } - - protected abstract boolean isValidPage(Page page) - throws WikiTitleParsingException; - - @Override - public Progress[] getProgress() - { - return new Progress[] { new ProgressImpl( - Long.valueOf(currentArticleIndex).intValue(), - Long.valueOf(nrOfArticles).intValue(), Progress.ENTITIES) }; - } - - protected String getDocumentText(Page page) - { - return page.getText(); - } - - protected abstract String getPlainDocumentText(Page page); - - private void addDocumentMetaData(JCas jcas, Page page) - throws WikiTitleParsingException - { - String language = WikiUtils.jwplLanguage2dkproLanguage(dbconfig.getLanguage()); - DocumentMetaData metaData = DocumentMetaData.create(jcas); - metaData.setDocumentTitle(page.getTitle().getWikiStyleTitle()); - metaData.setCollectionId(Integer.valueOf(page.getPageId()).toString()); - metaData.setDocumentId(Integer.valueOf(page.getPageId()).toString()); - metaData.setDocumentBaseUri("http://" + language + ".wikipedia.org"); - metaData.setDocumentUri("http://" + language + ".wikipedia.org/w/index.php?title=" + page.getTitle().getWikiStyleTitle()); - metaData.setLanguage(language); - } - - /** - * Loads a text file line-by-line into a Set of Strings. - * - * @param fileName - * path to the file - * @return a Set containing the individual lines of the text file - * @throws IOException - * if any error occurs while reading the file - */ - private Set loadFile(String fileName) - throws IOException - { - Set container = new HashSet(); - - FileInputStream fstream=null; - DataInputStream in=null; - BufferedReader br=null; - try{ - fstream = new FileInputStream(fileName); - in = new DataInputStream(fstream); - br = new BufferedReader(new InputStreamReader(in)); - - String strLine; - while ((strLine = br.readLine()) != null) { - container.add(strLine); - } - }finally{ - if(br!=null){ - br.close(); - } - if(in!=null){ - in.close(); - } - if(fstream!=null){ - fstream.close(); - } - } - - return container; - } - - public Page getPage() { - return page; - } - - -} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaTemplateFilteredArticleReader.java b/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaTemplateFilteredArticleReader.java deleted file mode 100644 index 9fc9bacf52..0000000000 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaTemplateFilteredArticleReader.java +++ /dev/null @@ -1,543 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import java.io.IOException; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; - -import org.apache.uima.UimaContext; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils; -import de.tudarmstadt.ukp.wikipedia.api.Page; -import de.tudarmstadt.ukp.wikipedia.api.WikiConstants; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiPageNotFoundException; -import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException; -import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage; -import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates; -import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; -import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory; -import de.tudarmstadt.ukp.wikipedia.util.templates.WikipediaTemplateInfo; -import de.tudarmstadt.ukp.wikipedia.util.templates.WikipediaTemplateInfoGenerator; - -/** - * Reads all pages that contain or do not contain the templates specified in the template whitelist - * and template blacklist. - * - *

- * It is possible to just define a whitelist OR a blacklist. If both whitelist and blacklist are - * provided, the articles are chosen that DO contain the templates from the whitelist and at the - * same time DO NOT contain the templates from the blacklist (= the intersection of the - * "whitelist page set" and the "blacklist page set") - *

- * - *

- * This reader only works if template tables have been generated for the JWPL database using the - * {@link WikipediaTemplateInfoGenerator}. - *

- * - *

- * NOTE: This reader directly extends the {@link WikipediaReaderBase} and not the - * {@link WikipediaStandardReaderBase} - *

- * - */ -@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig", - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) -public class WikipediaTemplateFilteredArticleReader - extends WikipediaReaderBase -{ - /** If set to true, only the first paragraph instead of the whole article is used. */ - public static final String PARAM_ONLY_FIRST_PARAGRAPH = "OnlyFirstParagraph"; - @ConfigurationParameter(name = PARAM_ONLY_FIRST_PARAGRAPH, mandatory=true, defaultValue="false") - private boolean onlyFirstParagraph; - - /** Whether the reader outputs plain text or wiki markup. */ - public static final String PARAM_OUTPUT_PLAIN_TEXT = "OutputPlainText"; - @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true") - private boolean outputPlainText; - - /** Whether the reader should read also include talk pages. */ - public static final String PARAM_INCLUDE_DISCUSSION_PAGES = "IncludeDiscussions"; - @ConfigurationParameter(name = PARAM_INCLUDE_DISCUSSION_PAGES, mandatory = true, defaultValue = "true") - private boolean inludeDiscussions; - - /** - * If this option is set, discussion pages are rejected that are associated with a blacklisted - * article. Analogously, articles are rejected that are associated with a blacklisted discussion - * page. - *

- * This check is rather expensive and could take a long time. This is option is not active if - * only a whitelist is used. - *

- *

- * Default Value: false - *

- */ - public static final String PARAM_DOUBLE_CHECK_ASSOCIATED_PAGES = "DoubleCheckAssociatedPages"; - @ConfigurationParameter(name = PARAM_DOUBLE_CHECK_ASSOCIATED_PAGES, mandatory = true, defaultValue = "false") - private boolean doubleCheckWhitelistedArticles; - - /** - * Optional parameter that allows to define the max number of articles that should be delivered - * by the reader. - *

- * This avoids unnecessary filtering if only a small number of articles is needed. - *

- */ - public static final String PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ = "LimitNUmberOfArticlesToRead"; - @ConfigurationParameter(name = PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ, mandatory = false) - private Integer articleLimit; - - /** - * Defines templates that the articles MUST contain. - *

- * If you also define a blacklist, the intersection of both sets is used. (= pages that DO - * contain templates from the whitelist, but DO NOT contain templates from the blacklist) - *

- */ - public static final String PARAM_TEMPLATE_WHITELIST = "TemplateWhitelist"; - @ConfigurationParameter(name = PARAM_TEMPLATE_WHITELIST, mandatory = false) - private String[] templateWhitelistArray; - - /** - * Defines templates that the articles MUST NOT contain. - *

- * If you also define a whitelist, the intersection of both sets is used. (= pages that DO - * contain templates from the whitelist, but DO NOT contain templates from the blacklist) - *

- */ - public static final String PARAM_TEMPLATE_BLACKLIST = "TemplateBlacklist"; - @ConfigurationParameter(name = PARAM_TEMPLATE_BLACKLIST, mandatory = false) - private String[] templateBlacklistArray; - - /** - * Defines whether to match the templates exactly or whether to match all - * templates that start with the String given in the respective parameter - * list. - *

Default Value: {@code true}

- */ - public static final String PARAM_EXACT_TEMPLATE_MATCHING = "ExactTemplateMatching"; - @ConfigurationParameter(name = PARAM_EXACT_TEMPLATE_MATCHING, mandatory = true, defaultValue="true") - private boolean exactTemplateMatching; - - /** The page buffer size (#pages) of the page iterator. */ - public static final String PARAM_PAGE_BUFFER = "PageBuffer"; - @ConfigurationParameter(name = PARAM_PAGE_BUFFER, mandatory = true, defaultValue = "1000") - private int pageBuffer; - - private List bufferedPages; - private List pageIds; - - List templateBlacklist; - List templateWhitelist; - - private long currentArticleIndex; - private long nrOfArticles; - - private MediaWikiParser parser; - private WikipediaTemplateInfo tplInfo; - - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - if(articleLimit!=null){ - getLogger().info("Article limit is set to " + articleLimit + " The reader won't " + - "deliver all pages that meet the requirements. Remove " + - "PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ if that is not what you want."); - } - - if (templateBlacklistArray == null && templateWhitelistArray == null) { - throw new ResourceInitializationException(); - } - - try { - bufferedPages = new LinkedList(); - pageIds = new LinkedList(); - tplInfo = new WikipediaTemplateInfo(wiki); - - Iterable filteredIds = null; - - // WHITELIST FILTER - Set wlSet = null; - if (templateWhitelistArray != null && templateWhitelistArray.length > 0) { - - //convert array to list - templateWhitelist = Arrays.asList(templateWhitelistArray); - wlSet = new HashSet(); - - if (exactTemplateMatching) { - filteredIds = tplInfo.getPageIdsContainingTemplateNames( - templateWhitelist); - } - else { - filteredIds = tplInfo.getPageIdsContainingTemplateFragments( - templateWhitelist); - } - - for (Integer id : filteredIds) { - wlSet.add(id); - } - getLogger().info("The whitelist contains "+templateWhitelist.size()+" templates"); - getLogger().info(wlSet.size()+" articles are whitelisted"); - }else{ - getLogger().info("No whitelist active"); - } - - // BLACKLIST FILTER - Set blSet = null; - if (templateBlacklistArray != null && templateBlacklistArray.length > 0) { - - //convert array to list - templateBlacklist =Arrays.asList(templateBlacklistArray); - blSet = new HashSet(); - - if(wlSet!=null){ - //if the whitelist is active, we can just treat the blacklist - //as another whitelist and remove all items from the whitelist - //that are also in the blacklist. - //This way, we don't have to perform the expensive - //getPageIdsNotContainingTemplateNames operation here - if (exactTemplateMatching) { - filteredIds = tplInfo.getPageIdsContainingTemplateNames( - templateBlacklist); - } - else { - filteredIds = tplInfo.getPageIdsContainingTemplateFragments( - templateBlacklist); - } - for (Integer id : filteredIds) { - blSet.add(id); - } - getLogger().info("The blacklist contains "+templateBlacklist.size()+" templates"); - getLogger().info(blSet.size()+" articles are blacklisted"); - }else{ - //if the whitelist is not active, we have to treat the - //the blacklist like a real blacklist and call the - //rather expensive getPageIdsNotContainingTemplateNames() - if (exactTemplateMatching) { - filteredIds = tplInfo.getPageIdsNotContainingTemplateNames( - templateBlacklist); - } - else { - filteredIds = tplInfo.getPageIdsNotContainingTemplateFragments( - templateBlacklist); - } - for (Integer id : filteredIds) { - blSet.add(id); - } - getLogger().info("The blacklist contains "+templateBlacklist.size()+" templates"); - getLogger().info(blSet.size()+" articles are NOT blacklisted"); - } - }else{ - getLogger().info("No blacklist active"); - } - - // GET FINAL ID LIST - if (blSet != null && wlSet != null) { - //here, blSet contains pages CONTAINING the blacklisted tpls - - //so, first remove blacklisted pages from the whitelist - wlSet.removeAll(blSet); - - if(articleLimit!=null){ - //limit number of articles, if necessary - Set tempWlSet = new HashSet(); - tempWlSet.addAll(wlSet); - wlSet.clear(); - Iterator ids = tempWlSet.iterator(); - for(int i=0;i tempWlSet = new HashSet(); - tempWlSet.addAll(wlSet); - wlSet.clear(); - Iterator ids = tempWlSet.iterator(); - for(int i=0;i tempBlSet = new HashSet(); - tempBlSet.addAll(blSet); - blSet.clear(); - Iterator ids = tempBlSet.iterator(); - for(int i=0;i blacklistedArticles=new HashSet(); - if (exactTemplateMatching) { - blacklistedArticles.addAll(tplInfo.getPageIdsNotContainingTemplateNames( - templateBlacklist)); - } - else { - blacklistedArticles.addAll(tplInfo.getPageIdsNotContainingTemplateFragments( - templateBlacklist)); - } - pageIds.addAll(doubleCheckAssociatedArticles(blSet, blacklistedArticles)); - }else{ - pageIds.addAll(blSet); - } - - } - - this.nrOfArticles = pageIds.size(); - - getLogger().info("Reading "+nrOfArticles+" pages"); - - } - catch (Exception e) { - throw new ResourceInitializationException(e); - } - - currentArticleIndex = 0; - - //TODO Use SWEBLE - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - pf.setTemplateParserClass(FlushTemplates.class); - - parser = pf.createParser(); - } - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - return !pageIds.isEmpty()||!bufferedPages.isEmpty(); - } - - @Override - public void getNext(JCas jcas) - throws IOException, CollectionException - { - super.getNext(jcas); - - Page page = null; - try { - //fill buffer if empty - if(bufferedPages.isEmpty()) { - getLogger().trace("Filling buffer"); - for (int i = 0; i < (pageIds.size() < pageBuffer ? pageIds.size() : pageBuffer); i++) { - bufferedPages.add(wiki.getPage(pageIds.remove(0))); - } - } - //get next page from buffer - page = bufferedPages.remove(0); - - getLogger().trace("Processing article: " + page.getTitle()); - - addDocumentMetaData(jcas, page); - - if (!isValidPage(page)) { - jcas.setDocumentText(""); - return; - } - - if (outputPlainText) { - jcas.setDocumentText(WikiUtils - .cleanText(getPlainDocumentText(page))); - } - else { - jcas.setDocumentText(getDocumentText(page)); - } - - } - catch (WikiApiException e) { - throw new CollectionException(e); - } - - currentArticleIndex++; - } - - /** - * Only accept article pages and (if includeDiscussions=true) talk pages - * - * @param page - * the page that should be checked for validity - * @return true, if page is valid. false, else - * @throws WikiTitleParsingException - * if the page title cannot be parsed. - */ - private boolean isValidPage(Page page) - throws WikiTitleParsingException - { - return !page.isDisambiguation() && !page.isRedirect() - && (inludeDiscussions || (!inludeDiscussions && !page.isDiscussion())); - } - - @Override - public Progress[] getProgress() - { - return new Progress[] { new ProgressImpl( - Long.valueOf(currentArticleIndex).intValue(), - Long.valueOf(nrOfArticles).intValue(), Progress.ENTITIES) }; - } - - private String getDocumentText(Page page) - { - return page.getText(); - } - - private String getPlainDocumentText(Page page) - { - String text = ""; - ParsedPage pp = parser.parse(page.getText()); - - if (onlyFirstParagraph) { - if (pp != null && pp.getParagraph(0) != null) { - text = pp.getParagraph(0).getText(); - } - } - else { - if (pp != null ) { - text = pp.getText(); - } - } - return text; - } - - /** - * Double checks a list of page ids and checks for each id that belongs to a discussion page the - * corresponding article if it is blacklisted
- *
- * This is an rather expensive operation! - * - * @param idsToDoubleCheck - * the set of ids that should be double checked - * @param blIds - * a set with ids of blacklisted articles - * @return a the list of articles after double checking - * @throws WikiApiException - * if the wiki data cannot be accessed. - */ - private Set doubleCheckAssociatedArticles(Set idsToDoubleCheck, - Set blIds) - throws WikiApiException - { - if (idsToDoubleCheck.size() > 20000) { - getLogger().info("You want to double check "+idsToDoubleCheck.size()+" articles in the whitelist. This can take a very long time."+System.getProperty("line.separator")+ - "If you do not need ALL pages that meet the specified requirements, you might speed things up by setting PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ."); - } - - Set doubleFilteredArticles = new HashSet(); - - //do the additional filtering - for(Integer id: idsToDoubleCheck){ - try{ - String curPageTitle = wiki.getTitle(id).getWikiStyleTitle(); - - //check associated discussion or article - if(curPageTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)){ - curPageTitle = curPageTitle.replaceAll(WikiConstants.DISCUSSION_PREFIX, ""); - - if(curPageTitle.contains("/")){ - //If we have a discussion archive - String[] parts = curPageTitle.split("/"); - if(parts!=null&&parts.length>0&&parts[0].length()>0){ - curPageTitle = parts[0]; - } - - } - - List curArticleIds = wiki.getPageIds(curPageTitle); - for(int curArtId:curArticleIds){ - if(blIds.contains(curArtId)){ - //select id of current page for removal - doubleFilteredArticles.add(id); - } - } - }else{ - List curDiscussionIds = wiki.getPageIds(WikiConstants.DISCUSSION_PREFIX+curPageTitle); - for(int curDiscId:curDiscussionIds){ - if(blIds.contains(curDiscId)){ - //select id of current page for removal - doubleFilteredArticles.add(id); - } - } - } - }catch(WikiPageNotFoundException e){ - //just go on with the next id - } - } - - idsToDoubleCheck.removeAll(doubleFilteredArticles); - return idsToDoubleCheck; - } - - private void addDocumentMetaData(JCas jcas, Page page) - throws WikiTitleParsingException - { - DocumentMetaData metaData = DocumentMetaData.create(jcas); - metaData.setDocumentTitle(page.getTitle().getWikiStyleTitle()); - metaData.setCollectionId(Integer.valueOf(page.getPageId()).toString()); - metaData.setDocumentId(Integer.valueOf(page.getPageId()).toString()); - metaData.setLanguage(dbconfig.getLanguage().toString()); - } -} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/package-info.java b/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/package-info.java deleted file mode 100644 index 684687833e..0000000000 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for the Wikipedia files using - * JWPL (read-only). - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; diff --git a/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaArticleInfoReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaArticleInfoReader.java new file mode 100644 index 0000000000..f630290a6a --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaArticleInfoReader.java @@ -0,0 +1,130 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.ArticleInfo; +import de.tudarmstadt.ukp.wikipedia.api.MetaData; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; +import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.RevisionAPIConfiguration; +import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.RevisionApi; + +/** + * Reads all general article infos without retrieving the whole Page objects + */ +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.ArticleInfo"}) +public class WikipediaArticleInfoReader extends WikipediaReaderBase +{ + protected long currentArticleIndex; + protected long nrOfArticles; + + protected Iterator idIter; + protected RevisionApi revApi; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException { + super.initialize(context); + + MetaData md = wiki.getMetaData(); + this.nrOfArticles = md.getNumberOfPages() - md.getNumberOfDisambiguationPages() + - md.getNumberOfRedirectPages(); + this.currentArticleIndex = 0; + + RevisionAPIConfiguration revConfig = new RevisionAPIConfiguration(dbconfig); + + try { + revApi = new RevisionApi(revConfig); + } + catch (WikiApiException e) { + throw new ResourceInitializationException(e); + } + + idIter = wiki.getPageIds().iterator(); + } + + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + return idIter.hasNext(); + } + + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + super.getNext(aJCas); + + int id = idIter.next(); + currentArticleIndex++; + + try + { + addDocumentMetaData(aJCas, id); + + ArticleInfo info = new ArticleInfo(aJCas); + info.setAuthors(revApi.getNumberOfUniqueContributors(id)); + info.setRevisions(revApi.getNumberOfRevisions(id)); + info.setFirstAppearance(revApi.getFirstDateOfAppearance(id).getTime()); + info.setLastAppearance(revApi.getLastDateOfAppearance(id).getTime()); + info.addToIndexes(); + } + catch (WikiApiException e) { + //could e.g. happen if no revision is available for this page + getLogger().warn("Unable to fetch next article", e); + } + } + + + @Override + public Progress[] getProgress() + { + return new Progress[] { + new ProgressImpl( + Long.valueOf(currentArticleIndex).intValue(), + Long.valueOf(nrOfArticles).intValue(), + Progress.ENTITIES + ) + }; + } + + private void addDocumentMetaData(JCas jcas, int id) throws WikiApiException { + DocumentMetaData metaData = DocumentMetaData.create(jcas); + metaData.setDocumentTitle(wiki.getTitle(id).toString()); + metaData.setCollectionId(Integer.valueOf(id).toString()); + metaData.setLanguage(dbconfig.getLanguage().toString()); + + } +} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaArticleReader.java similarity index 87% rename from dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleReader.java rename to dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaArticleReader.java index aa5d320b65..97df6d5182 100644 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleReader.java +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaArticleReader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; +package org.dkpro.core.io.jwpl; import de.tudarmstadt.ukp.wikipedia.api.Page; import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException; @@ -23,15 +23,14 @@ /** * Reads all article pages. * - * A parameter controls whether the full article or only the first paragraph is set as the document text. + * A parameter controls whether the full article or only the first paragraph is set as the document + * text. * * No Redirects, disambiguation pages, or discussion pages are regarded, however. - * - * */ -public class WikipediaArticleReader extends WikipediaPageReader +public class WikipediaArticleReader + extends WikipediaPageReader { - @Override protected boolean isValidPage(Page page) throws WikiTitleParsingException { diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaDiscussionReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaDiscussionReader.java similarity index 95% rename from dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaDiscussionReader.java rename to dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaDiscussionReader.java index 042e6226b7..8a5e067594 100644 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaDiscussionReader.java +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaDiscussionReader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; +package org.dkpro.core.io.jwpl; import org.apache.uima.fit.descriptor.TypeCapability; @@ -25,17 +25,12 @@ /** * Reads all discussion pages. - * - * */ - @TypeCapability( - outputs={ + outputs = { "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig"}) - public class WikipediaDiscussionReader extends WikipediaStandardReaderBase { - //TODO Use SWEBLE @Override protected String getPlainDocumentText(Page page) diff --git a/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaLinkReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaLinkReader.java new file mode 100644 index 0000000000..b6f2dd39e9 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaLinkReader.java @@ -0,0 +1,112 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink; +import de.tudarmstadt.ukp.wikipedia.api.Page; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException; +import de.tudarmstadt.ukp.wikipedia.parser.Link; +import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage; + +/** + * Read links from Wikipedia. + */ +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig", + "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink"}) +public class WikipediaLinkReader extends WikipediaStandardReaderBase { + + /** + * Which types of links are allowed? + */ + public static final String PARAM_ALLOWED_LINK_TYPES = "AllowedLinkTypes"; + @ConfigurationParameter(name = PARAM_ALLOWED_LINK_TYPES, mandatory = true) + private String[] allowedLinkTypes; + + @Override + protected boolean isValidPage(Page page) throws WikiTitleParsingException + { + return !page.isDisambiguation() && !page.isDiscussion() && !page.isRedirect(); + } + + @Override + protected String getPlainDocumentText(Page page) { + String text = ""; + ParsedPage pp = parser.parse(page.getText()); + if (pp != null ) { + text = pp.getText(); + } + return text; + } + + @Override + public void getNext(JCas jcas) + throws IOException, CollectionException { + super.getNext(jcas); + + ParsedPage pp = parser.parse(getPage().getText()); + + //Don't do anything if there is no document text + if (jcas.getDocumentText().length() == 0) { + return; + } + + //add link annotations + List allowedLinkTypeList = Arrays.asList(this.allowedLinkTypes); + WikipediaLink wikipediaLink; + int begin = 0; + int end = 0; + for (Link link : pp.getLinks()) { + if (allowedLinkTypeList.contains(link.getType().name())) { + // TODO: The begin and end of a link is defined with an absolute position in the + // raw text. But, Wikipedia guidelines claim that the first mention has to be + // marked + begin = 0; + end = 0; + begin = jcas.getDocumentText().indexOf(link.getText(), begin); + if (begin == -1) { + begin = jcas.getDocumentText().indexOf(link.getText()); + } + if (begin == -1) { + begin = 0; + } + end = begin + link.getText().length(); + if (end >= jcas.getDocumentText().length()) { + end = begin; + } + wikipediaLink = new WikipediaLink(jcas); + wikipediaLink.setBegin(0); + wikipediaLink.setEnd(1); + wikipediaLink.setLinkType(link.getType().name()); + wikipediaLink.setTarget(link.getTarget()); + wikipediaLink.setAnchor(link.getText()); + wikipediaLink.addToIndexes(); + } + } + } +} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaPageReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaPageReader.java similarity index 95% rename from dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaPageReader.java rename to dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaPageReader.java index 5e21323ab1..7729175411 100644 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaPageReader.java +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaPageReader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; +package org.dkpro.core.io.jwpl; import org.apache.uima.UimaContext; import org.apache.uima.fit.descriptor.ConfigurationParameter; @@ -29,20 +29,19 @@ /** * Reads all Wikipedia pages in the database (articles, discussions, etc). * - * A parameter controls whether the full article or only the first paragraph is set as the document text. + * A parameter controls whether the full article or only the first paragraph is set as the document + * text. * * No Redirects or disambiguation pages are regarded, however. */ @TypeCapability( - outputs={ + outputs = { "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig"}) - public class WikipediaPageReader extends WikipediaStandardReaderBase { - /** If set to true, only the first paragraph instead of the whole article is used. */ public static final String PARAM_ONLY_FIRST_PARAGRAPH = "OnlyFirstParagraph"; - @ConfigurationParameter(name = PARAM_ONLY_FIRST_PARAGRAPH, mandatory=true, defaultValue="false") + @ConfigurationParameter(name = PARAM_ONLY_FIRST_PARAGRAPH, mandatory = true, defaultValue = "false") private boolean onlyFirstParagraph; @Override diff --git a/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaQueryReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaQueryReader.java new file mode 100644 index 0000000000..8a433b6787 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaQueryReader.java @@ -0,0 +1,199 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import org.apache.uima.UimaContext; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Level; + +import de.tudarmstadt.ukp.wikipedia.api.PageQuery; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; + +/** + * Reads all article pages that match a query created by the numerous parameters of this class. + */ +public class WikipediaQueryReader + extends WikipediaArticleReader +{ + /** + * Maximum number of categories. Articles with a higher number of categories will not be + * returned by the query. + */ + public static final String PARAM_MAX_CATEGORIES = "MaxCategories"; + @ConfigurationParameter(name = PARAM_MAX_CATEGORIES, mandatory = false, defaultValue = "-1") + private int maxCategories; + + /** + * Minimum number of categories. Articles with a lower number of categories will not be returned + * by the query. + */ + public static final String PARAM_MIN_CATEGORIES = "MinCategories"; + @ConfigurationParameter(name = PARAM_MIN_CATEGORIES, mandatory = false, defaultValue = "-1") + private int minCategories; + + /** + * Maximum number of incoming links. Articles with a higher number of incoming links will not be + * returned by the query. + */ + public static final String PARAM_MAX_INLINKS = "MaxInlinks"; + @ConfigurationParameter(name = PARAM_MAX_INLINKS, mandatory = false, defaultValue = "-1") + private int maxInlinks; + + /** + * Minimum number of incoming links. Articles with a lower number of incoming links will not be + * returned by the query. + */ + public static final String PARAM_MIN_INLINKS = "MinInlinks"; + @ConfigurationParameter(name = PARAM_MIN_INLINKS, mandatory = false, defaultValue = "-1") + private int minInlinks; + + /** + * Maximum number of outgoing links. Articles with a higher number of outgoing links will not be + * returned by the query. + */ + public static final String PARAM_MAX_OUTLINKS = "MaxOutlinks"; + @ConfigurationParameter(name = PARAM_MAX_OUTLINKS, mandatory = false, defaultValue = "-1") + private int maxOutlinks; + + /** + * Minimum number of outgoing links. Articles with a lower number of outgoing links will not be + * returned by the query. + */ + public static final String PARAM_MIN_OUTLINKS = "MinOutlinks"; + @ConfigurationParameter(name = PARAM_MIN_OUTLINKS, mandatory = false, defaultValue = "-1") + private int minOutlinks; + + /** + * Maximum number of redirects. Articles with a higher number of redirects will not be returned + * by the query. + */ + public static final String PARAM_MAX_REDIRECTS = "MaxRedirects"; + @ConfigurationParameter(name = PARAM_MAX_REDIRECTS, mandatory = false, defaultValue = "-1") + private int maxRedirects; + + /** + * Minimum number of redirects. Articles with a lower number of redirects will not be returned + * by the query. + */ + public static final String PARAM_MIN_REDIRECTS = "MinRedirects"; + @ConfigurationParameter(name = PARAM_MIN_REDIRECTS, mandatory = false, defaultValue = "-1") + private int minRedirects; + + /** + * Maximum number of tokens. Articles with a higher number of tokens will not be returned by the + * query. + */ + public static final String PARAM_MAX_TOKENS = "MaxTokens"; + @ConfigurationParameter(name = PARAM_MAX_TOKENS, mandatory = false, defaultValue = "-1") + private int maxTokens; + + /** + * Minimum number of tokens. Articles with a lower number of tokens will not be returned by the + * query. + */ + public static final String PARAM_MIN_TOKENS = "MinTokens"; + @ConfigurationParameter(name = PARAM_MIN_TOKENS, mandatory = false, defaultValue = "-1") + private int minTokens; + + /** + * SQL-style title pattern. Only articles that match the pattern will be returned by the query. + */ + public static final String PARAM_TITLE_PATTERN = "TitlePattern"; + @ConfigurationParameter(name = PARAM_TITLE_PATTERN, mandatory = false, defaultValue = "") + private String titlePattern; + + protected boolean queryInitialized = false; // indicates whether a query parameter was used + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + PageQuery query = new PageQuery(); + + if (maxCategories != -1) { + query.setMaxCategories(maxCategories); + queryInitialized = true; + } + + if (minCategories != -1) { + query.setMinCategories(minCategories); + queryInitialized = true; + } + + if (maxInlinks != -1) { + query.setMaxIndegree(maxInlinks); + queryInitialized = true; + } + + if (minInlinks != -1) { + query.setMinIndegree(minInlinks); + queryInitialized = true; + } + + if (maxOutlinks != -1) { + query.setMaxOutdegree(maxOutlinks); + queryInitialized = true; + } + + if (minOutlinks != -1) { + query.setMinOutdegree(minOutlinks); + queryInitialized = true; + } + + if (maxRedirects != -1) { + query.setMaxRedirects(maxRedirects); + queryInitialized = true; + } + + if (minRedirects != -1) { + query.setMinRedirects(minRedirects); + queryInitialized = true; + } + + if (maxTokens != -1) { + query.setMaxTokens(maxTokens); + queryInitialized = true; + } + + if (minTokens != -1) { + query.setMinTokens(minTokens); + queryInitialized = true; + } + + if (!titlePattern.equals("")) { + query.setTitlePattern(titlePattern); + queryInitialized = true; + } + + this.getLogger().log(Level.INFO, query.getQueryInfo()); + + // if a query was initialized, overwrite the page iterator + if (queryInitialized) { + try { + pageIter = wiki.getPages(query).iterator(); + } + catch (WikiApiException e) { + throw new ResourceInitializationException(e); + } + + } + } +} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaReaderBase.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaReaderBase.java new file mode 100644 index 0000000000..47969d47a4 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaReaderBase.java @@ -0,0 +1,120 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import java.io.IOException; + +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; + +import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig; +import de.tudarmstadt.ukp.wikipedia.api.DatabaseConfiguration; +import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; +import de.tudarmstadt.ukp.wikipedia.api.Wikipedia; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiInitializationException; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Abstract base class for all Wikipedia readers. + */ +@Component(value = OperationType.READER) +public abstract class WikipediaReaderBase extends JCasCollectionReader_ImplBase +{ + /** The host server. */ + public static final String PARAM_HOST = "Host"; + @ConfigurationParameter(name = PARAM_HOST, mandatory = true) + private String host; + + /** The name of the database. */ + public static final String PARAM_DB = "Database"; + @ConfigurationParameter(name = PARAM_DB, mandatory = true) + private String db; + + /** The username of the database account. */ + public static final String PARAM_USER = "User"; + @ConfigurationParameter(name = PARAM_USER, mandatory = true) + private String user; + + /** The password of the database account. */ + public static final String PARAM_PASSWORD = "Password"; + @ConfigurationParameter(name = PARAM_PASSWORD, mandatory = true) + private String password; + + /** The language of the Wikipedia that should be connected to. */ + public static final String PARAM_LANGUAGE = "Language"; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) + private Language language; + + /** + * Sets whether the database configuration should be stored in the CAS, so that annotators down + * the pipeline can access additional data. + */ + public static final String PARAM_CREATE_DATABASE_CONFIG_ANNOTATION = "CreateDBAnno"; + @ConfigurationParameter(name = PARAM_CREATE_DATABASE_CONFIG_ANNOTATION, mandatory = true, defaultValue = "false") + private boolean createDbAnno; + + protected DatabaseConfiguration dbconfig; + + protected Wikipedia wiki; + + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + dbconfig = new DatabaseConfiguration( + host, + db, + user, + password, + language + ); + + try { + this.wiki = new Wikipedia(dbconfig); + } + catch (WikiInitializationException e) { + throw new ResourceInitializationException(e); + } + } + + @Override + public void getNext(JCas jcas) throws IOException, CollectionException + { + if (createDbAnno) { + DBConfig dbconfiganno = new DBConfig(jcas); + dbconfiganno.setHost(host); + dbconfiganno.setPassword(password); + dbconfiganno.setDB(db); + dbconfiganno.setUser(user); + dbconfiganno.setLanguage(language.toString()); + dbconfiganno.addToIndexes(); + } + } + + @Override + public abstract Progress[] getProgress(); +} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionPairReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionPairReader.java similarity index 98% rename from dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionPairReader.java rename to dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionPairReader.java index 0df1ef843f..c18f02dc45 100644 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionPairReader.java +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionPairReader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; +package org.dkpro.core.io.jwpl; import java.io.IOException; import java.sql.Timestamp; @@ -29,8 +29,8 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; +import org.dkpro.core.io.jwpl.util.WikiUtils; -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils; import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage; import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.Revision; diff --git a/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionReader.java new file mode 100644 index 0000000000..8feb18a432 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionReader.java @@ -0,0 +1,101 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import java.io.IOException; +import java.sql.SQLException; + +import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.jwpl.util.WikiUtils; + +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; +import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.Revision; + +/** + * Reads Wikipedia page revisions. + */ +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig", + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaRevision"}) + +public class WikipediaRevisionReader extends WikipediaRevisionReaderBase +{ + + @Override + public void getNext(JCas jcas) + throws IOException, CollectionException + { + super.getNext(jcas); + + try { + Revision revision = null; + if (!revisionIds.isEmpty()) { + // in case we iterate over a given list of revisions + String nextId = revIdIterator.next(); + try { + revision = this.revisionApi.getRevision(Integer.parseInt(nextId)); + } + catch (Exception e) { + // in case of lost connection + // TODO should be handled in RevisionAPI + revisionApi.reconnect(); + revision = this.revisionApi.getRevision(Integer.parseInt(nextId)); + } + } + else { + //in case we iterate over ALL revisions + try { + revision = this.revisionApi.getRevision(currentArticle.getPageId(), + timestampIter.next()); + } + catch (Exception e) { + //in case of lost connection + //TODO should be handled in RevisionAPI + revisionApi.reconnect(); + revision = this.revisionApi.getRevision(currentArticle.getPageId(), + timestampIter.next()); + } + } + + String text = ""; + if (outputPlainText) { + text = WikiUtils.cleanText( + StringEscapeUtils.unescapeHtml4(revision.getRevisionText()) + ); + } + else { + text = revision.getRevisionText(); + } + jcas.setDocumentText(text); + + addDocumentMetaData(jcas, revision.getArticleID(), revision.getRevisionID()); + addRevisionAnnotation(jcas, revision); + } + catch (WikiApiException e) { + throw new CollectionException(e); + } + catch (SQLException e) { + throw new CollectionException(e); + } + } +} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReaderBase.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionReaderBase.java similarity index 99% rename from dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReaderBase.java rename to dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionReaderBase.java index 299392773f..7c23c4a505 100644 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReaderBase.java +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaRevisionReaderBase.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; +package org.dkpro.core.io.jwpl; import java.io.BufferedReader; import java.io.DataInputStream; @@ -36,10 +36,10 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.io.jwpl.util.WikiUtils; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaRevision; -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils; import de.tudarmstadt.ukp.wikipedia.api.MetaData; import de.tudarmstadt.ukp.wikipedia.api.Page; import de.tudarmstadt.ukp.wikipedia.api.PageIterator; diff --git a/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaStandardReaderBase.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaStandardReaderBase.java new file mode 100644 index 0000000000..899ffc7f64 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaStandardReaderBase.java @@ -0,0 +1,284 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import java.io.BufferedReader; +import java.io.DataInputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.io.jwpl.util.WikiUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.wikipedia.api.MetaData; +import de.tudarmstadt.ukp.wikipedia.api.Page; +import de.tudarmstadt.ukp.wikipedia.api.PageIterator; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException; +import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates; +import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; +import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory; + +/** + * Abstract base class for standard Wikipedia readers reading single articles + * instead of revision pairs. + * + * + */ +public abstract class WikipediaStandardReaderBase + extends WikipediaReaderBase +{ + + /** Whether the reader outputs plain text or wiki markup. */ + public static final String PARAM_OUTPUT_PLAIN_TEXT = "OutputPlainText"; + @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true") + protected boolean outputPlainText; + + /** The page buffer size (#pages) of the page iterator. */ + public static final String PARAM_PAGE_BUFFER = "PageBuffer"; + @ConfigurationParameter(name = PARAM_PAGE_BUFFER, mandatory = true, defaultValue = "1000") + protected int pageBuffer; + + /** + * Defines the path to a file containing a line-separated list of + * page ids of the pages that should be retrieved. (Optional) + */ + public static final String PARAM_PATH_TO_PAGE_ID_LIST = "PageIdsFromFile"; + @ConfigurationParameter(name = PARAM_PATH_TO_PAGE_ID_LIST, mandatory = false) + protected String pageIdFile; + + /** + * Defines the path to a file containing a line-separated list of + * page titles of the pages that should be retrieved. (Optional) + */ + public static final String PARAM_PATH_TO_PAGE_TITLE_LIST = "PageTitleFromFile"; + @ConfigurationParameter(name = PARAM_PATH_TO_PAGE_TITLE_LIST, mandatory = false) + protected String pageNameFile; + + /** + * Defines an array of + * page ids of the pages that should be retrieved. (Optional) + */ + public static final String PARAM_PAGE_ID_LIST = "PageIdFromArray"; + @ConfigurationParameter(name = PARAM_PAGE_ID_LIST, mandatory = false) + protected String[] pageIdParamArray; + + /** + * Defines an array of page titles of the pages that should be retrieved. + * (Optional) + */ + public static final String PARAM_PAGE_TITLE_LIST = "PageTitlesFromArray"; + @ConfigurationParameter(name = PARAM_PAGE_TITLE_LIST, mandatory = false) + protected String[] pageNameParamArray; + + private Set pageIds = null; + private Set pageTitles = null; + + protected long currentArticleIndex; + protected long nrOfArticles; + + protected Iterator pageIter; + + private Page page; + + protected MediaWikiParser parser; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + pageIds = new HashSet(); + pageTitles = new HashSet(); + + try { + if (pageIdFile != null) { + pageIds = loadFile(pageIdFile); + } + if (pageNameFile != null) { + pageTitles = loadFile(pageNameFile); + } + if (pageIdParamArray != null && pageIdParamArray.length > 0) { + for (String id : pageIdParamArray) { + pageIds.add(id); + } + } + if (pageNameParamArray != null && pageNameParamArray.length > 0) { + for (String id : pageNameParamArray) { + pageTitles.add(id); + } + } + } + catch (Exception e) { + throw new ResourceInitializationException(e); + } + + // Use one of the lists or iterate over all articles? + if (!pageIds.isEmpty() || !pageTitles.isEmpty()) { + this.nrOfArticles = pageIds.size() + pageTitles.size(); + pageIter = new PageIterator(wiki, pageIds, pageTitles, pageBuffer); + } + else //use iterator over all pages in the db + { + MetaData md = wiki.getMetaData(); + this.nrOfArticles = md.getNumberOfPages() + - md.getNumberOfDisambiguationPages() + - md.getNumberOfRedirectPages(); + + pageIter = new PageIterator(wiki, true, pageBuffer); + } + + currentArticleIndex = 0; + + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + pf.setTemplateParserClass(FlushTemplates.class); + + parser = pf.createParser(); + } + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + return pageIter.hasNext(); + } + + @Override + public void getNext(JCas jcas) + throws IOException, CollectionException + { + super.getNext(jcas); + + page = pageIter.next(); + currentArticleIndex++; + + try { + getLogger().debug("title: " + page.getTitle()); + + addDocumentMetaData(jcas, page); + + if (!isValidPage(page)) { + jcas.setDocumentText(""); + return; + } + + if (outputPlainText) { + jcas.setDocumentText(WikiUtils + .cleanText(getPlainDocumentText(page))); + } + else { + jcas.setDocumentText(getDocumentText(page)); + } + + } + catch (WikiTitleParsingException e1) { + jcas.setDocumentText(""); + return; + } + } + + protected abstract boolean isValidPage(Page page) + throws WikiTitleParsingException; + + @Override + public Progress[] getProgress() + { + return new Progress[] { new ProgressImpl( + Long.valueOf(currentArticleIndex).intValue(), + Long.valueOf(nrOfArticles).intValue(), Progress.ENTITIES) }; + } + + protected String getDocumentText(Page page) + { + return page.getText(); + } + + protected abstract String getPlainDocumentText(Page page); + + private void addDocumentMetaData(JCas jcas, Page page) + throws WikiTitleParsingException + { + String language = WikiUtils.jwplLanguage2dkproLanguage(dbconfig.getLanguage()); + DocumentMetaData metaData = DocumentMetaData.create(jcas); + metaData.setDocumentTitle(page.getTitle().getWikiStyleTitle()); + metaData.setCollectionId(Integer.valueOf(page.getPageId()).toString()); + metaData.setDocumentId(Integer.valueOf(page.getPageId()).toString()); + metaData.setDocumentBaseUri("http://" + language + ".wikipedia.org"); + metaData.setDocumentUri("http://" + language + ".wikipedia.org/w/index.php?title=" + page.getTitle().getWikiStyleTitle()); + metaData.setLanguage(language); + } + + /** + * Loads a text file line-by-line into a Set of Strings. + * + * @param fileName + * path to the file + * @return a Set containing the individual lines of the text file + * @throws IOException + * if any error occurs while reading the file + */ + private Set loadFile(String fileName) + throws IOException + { + Set container = new HashSet(); + + FileInputStream fstream = null; + DataInputStream in = null; + BufferedReader br = null; + try { + fstream = new FileInputStream(fileName); + in = new DataInputStream(fstream); + br = new BufferedReader(new InputStreamReader(in)); + + String strLine; + while ((strLine = br.readLine()) != null) { + container.add(strLine); + } + } + finally { + if (br != null) { + br.close(); + } + if (in != null) { + in.close(); + } + if (fstream != null) { + fstream.close(); + } + } + + return container; + } + + public Page getPage() { + return page; + } + + +} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaTemplateFilteredArticleReader.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaTemplateFilteredArticleReader.java new file mode 100644 index 0000000000..773dd24a23 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/WikipediaTemplateFilteredArticleReader.java @@ -0,0 +1,556 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.io.jwpl.util.WikiUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.wikipedia.api.Page; +import de.tudarmstadt.ukp.wikipedia.api.WikiConstants; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiPageNotFoundException; +import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException; +import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage; +import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates; +import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; +import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory; +import de.tudarmstadt.ukp.wikipedia.util.templates.WikipediaTemplateInfo; +import de.tudarmstadt.ukp.wikipedia.util.templates.WikipediaTemplateInfoGenerator; + +/** + * Reads all pages that contain or do not contain the templates specified in the template whitelist + * and template blacklist. + * + *

+ * It is possible to just define a whitelist OR a blacklist. If both whitelist and blacklist are + * provided, the articles are chosen that DO contain the templates from the whitelist and at the + * same time DO NOT contain the templates from the blacklist (= the intersection of the + * "whitelist page set" and the "blacklist page set") + *

+ * + *

+ * This reader only works if template tables have been generated for the JWPL database using the + * {@link WikipediaTemplateInfoGenerator}. + *

+ * + *

+ * NOTE: This reader directly extends the {@link WikipediaReaderBase} and not the + * {@link WikipediaStandardReaderBase} + *

+ * + */ +@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig", + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) +public class WikipediaTemplateFilteredArticleReader + extends WikipediaReaderBase +{ + /** If set to true, only the first paragraph instead of the whole article is used. */ + public static final String PARAM_ONLY_FIRST_PARAGRAPH = "OnlyFirstParagraph"; + @ConfigurationParameter(name = PARAM_ONLY_FIRST_PARAGRAPH, mandatory = true, defaultValue = "false") + private boolean onlyFirstParagraph; + + /** Whether the reader outputs plain text or wiki markup. */ + public static final String PARAM_OUTPUT_PLAIN_TEXT = "OutputPlainText"; + @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true") + private boolean outputPlainText; + + /** Whether the reader should read also include talk pages. */ + public static final String PARAM_INCLUDE_DISCUSSION_PAGES = "IncludeDiscussions"; + @ConfigurationParameter(name = PARAM_INCLUDE_DISCUSSION_PAGES, mandatory = true, defaultValue = "true") + private boolean inludeDiscussions; + + /** + * If this option is set, discussion pages are rejected that are associated with a blacklisted + * article. Analogously, articles are rejected that are associated with a blacklisted discussion + * page. + *

+ * This check is rather expensive and could take a long time. This is option is not active if + * only a whitelist is used. + *

+ *

+ * Default Value: false + *

+ */ + public static final String PARAM_DOUBLE_CHECK_ASSOCIATED_PAGES = "DoubleCheckAssociatedPages"; + @ConfigurationParameter(name = PARAM_DOUBLE_CHECK_ASSOCIATED_PAGES, mandatory = true, defaultValue = "false") + private boolean doubleCheckWhitelistedArticles; + + /** + * Optional parameter that allows to define the max number of articles that should be delivered + * by the reader. + *

+ * This avoids unnecessary filtering if only a small number of articles is needed. + *

+ */ + public static final String PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ = "LimitNUmberOfArticlesToRead"; + @ConfigurationParameter(name = PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ, mandatory = false) + private Integer articleLimit; + + /** + * Defines templates that the articles MUST contain. + *

+ * If you also define a blacklist, the intersection of both sets is used. (= pages that DO + * contain templates from the whitelist, but DO NOT contain templates from the blacklist) + *

+ */ + public static final String PARAM_TEMPLATE_WHITELIST = "TemplateWhitelist"; + @ConfigurationParameter(name = PARAM_TEMPLATE_WHITELIST, mandatory = false) + private String[] templateWhitelistArray; + + /** + * Defines templates that the articles MUST NOT contain. + *

+ * If you also define a whitelist, the intersection of both sets is used. (= pages that DO + * contain templates from the whitelist, but DO NOT contain templates from the blacklist) + *

+ */ + public static final String PARAM_TEMPLATE_BLACKLIST = "TemplateBlacklist"; + @ConfigurationParameter(name = PARAM_TEMPLATE_BLACKLIST, mandatory = false) + private String[] templateBlacklistArray; + + /** + * Defines whether to match the templates exactly or whether to match all + * templates that start with the String given in the respective parameter + * list. + *

Default Value: {@code true}

+ */ + public static final String PARAM_EXACT_TEMPLATE_MATCHING = "ExactTemplateMatching"; + @ConfigurationParameter(name = PARAM_EXACT_TEMPLATE_MATCHING, mandatory = true, defaultValue = "true") + private boolean exactTemplateMatching; + + /** The page buffer size (#pages) of the page iterator. */ + public static final String PARAM_PAGE_BUFFER = "PageBuffer"; + @ConfigurationParameter(name = PARAM_PAGE_BUFFER, mandatory = true, defaultValue = "1000") + private int pageBuffer; + + private List bufferedPages; + private List pageIds; + + List templateBlacklist; + List templateWhitelist; + + private long currentArticleIndex; + private long nrOfArticles; + + private MediaWikiParser parser; + private WikipediaTemplateInfo tplInfo; + + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + if (articleLimit != null) { + getLogger().info("Article limit is set to " + articleLimit + " The reader won't " + + "deliver all pages that meet the requirements. Remove " + + "PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ if that is not what you want."); + } + + if (templateBlacklistArray == null && templateWhitelistArray == null) { + throw new ResourceInitializationException(); + } + + try { + bufferedPages = new LinkedList(); + pageIds = new LinkedList(); + tplInfo = new WikipediaTemplateInfo(wiki); + + Iterable filteredIds = null; + + // WHITELIST FILTER + Set wlSet = null; + if (templateWhitelistArray != null && templateWhitelistArray.length > 0) { + + //convert array to list + templateWhitelist = Arrays.asList(templateWhitelistArray); + wlSet = new HashSet(); + + if (exactTemplateMatching) { + filteredIds = tplInfo.getPageIdsContainingTemplateNames( + templateWhitelist); + } + else { + filteredIds = tplInfo.getPageIdsContainingTemplateFragments( + templateWhitelist); + } + + for (Integer id : filteredIds) { + wlSet.add(id); + } + getLogger() + .info("The whitelist contains " + templateWhitelist.size() + " templates"); + getLogger().info(wlSet.size() + " articles are whitelisted"); + } + else { + getLogger().info("No whitelist active"); + } + + // BLACKLIST FILTER + Set blSet = null; + if (templateBlacklistArray != null && templateBlacklistArray.length > 0) { + + //convert array to list + templateBlacklist = Arrays.asList(templateBlacklistArray); + blSet = new HashSet(); + + if (wlSet != null) { + //if the whitelist is active, we can just treat the blacklist + //as another whitelist and remove all items from the whitelist + //that are also in the blacklist. + //This way, we don't have to perform the expensive + //getPageIdsNotContainingTemplateNames operation here + if (exactTemplateMatching) { + filteredIds = tplInfo.getPageIdsContainingTemplateNames( + templateBlacklist); + } + else { + filteredIds = tplInfo.getPageIdsContainingTemplateFragments( + templateBlacklist); + } + for (Integer id : filteredIds) { + blSet.add(id); + } + getLogger().info( + "The blacklist contains " + templateBlacklist.size() + " templates"); + getLogger().info(blSet.size() + " articles are blacklisted"); + } + else { + //if the whitelist is not active, we have to treat the + //the blacklist like a real blacklist and call the + //rather expensive getPageIdsNotContainingTemplateNames() + if (exactTemplateMatching) { + filteredIds = tplInfo.getPageIdsNotContainingTemplateNames( + templateBlacklist); + } + else { + filteredIds = tplInfo.getPageIdsNotContainingTemplateFragments( + templateBlacklist); + } + for (Integer id : filteredIds) { + blSet.add(id); + } + getLogger().info( + "The blacklist contains " + templateBlacklist.size() + " templates"); + getLogger().info(blSet.size() + " articles are NOT blacklisted"); + } + } + else { + getLogger().info("No blacklist active"); + } + + // GET FINAL ID LIST + if (blSet != null && wlSet != null) { + //here, blSet contains pages CONTAINING the blacklisted tpls + + //so, first remove blacklisted pages from the whitelist + wlSet.removeAll(blSet); + + if (articleLimit != null) { + // limit number of articles, if necessary + Set tempWlSet = new HashSet(); + tempWlSet.addAll(wlSet); + wlSet.clear(); + Iterator ids = tempWlSet.iterator(); + for (int i = 0; i < articleLimit; i++) { + if (ids.hasNext()) { + wlSet.add(ids.next()); + } + } + } + + // now double filter, if necessary + if (doubleCheckWhitelistedArticles) { + getLogger().info("Double checking " + wlSet.size() + " articles"); + + // if doublecheck-param is set, double check whitelisted + // articles against the blacklist before adding them + pageIds.addAll(doubleCheckAssociatedArticles(wlSet, blSet)); + } + else { + pageIds.addAll(wlSet); + } + } + else if (blSet == null && wlSet != null) { + if (articleLimit != null) { + // limit number of articles, if necessary + Set tempWlSet = new HashSet(); + tempWlSet.addAll(wlSet); + wlSet.clear(); + Iterator ids = tempWlSet.iterator(); + for (int i = 0; i < articleLimit; i++) { + if (ids.hasNext()) { + wlSet.add(ids.next()); + } + } + } + pageIds.addAll(wlSet); + } + else if (blSet != null && wlSet == null) { + if (articleLimit != null) { + // limit number of articles, if necessary + Set tempBlSet = new HashSet(); + tempBlSet.addAll(blSet); + blSet.clear(); + Iterator ids = tempBlSet.iterator(); + for (int i = 0; i < articleLimit; i++) { + if (ids.hasNext()) { + blSet.add(ids.next()); + } + } + } + // here, blSet contains pages NOT containing the blacklisted tpls + // now add remaining pages to the pageId list + if (doubleCheckWhitelistedArticles) { + getLogger().info("Double checking " + blSet.size() + " articles"); + + // if doublecheck-param is set, double check the articles + // that are not blacklisted against the blacklist + Set blacklistedArticles = new HashSet(); + if (exactTemplateMatching) { + blacklistedArticles.addAll( + tplInfo.getPageIdsNotContainingTemplateNames(templateBlacklist)); + } + else { + blacklistedArticles.addAll(tplInfo + .getPageIdsNotContainingTemplateFragments(templateBlacklist)); + } + pageIds.addAll(doubleCheckAssociatedArticles(blSet, blacklistedArticles)); + } + else { + pageIds.addAll(blSet); + } + + } + + this.nrOfArticles = pageIds.size(); + + getLogger().info("Reading " + nrOfArticles + " pages"); + } + catch (Exception e) { + throw new ResourceInitializationException(e); + } + + currentArticleIndex = 0; + + //TODO Use SWEBLE + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + pf.setTemplateParserClass(FlushTemplates.class); + + parser = pf.createParser(); + } + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + return !pageIds.isEmpty() || !bufferedPages.isEmpty(); + } + + @Override + public void getNext(JCas jcas) + throws IOException, CollectionException + { + super.getNext(jcas); + + Page page = null; + try { + // fill buffer if empty + if (bufferedPages.isEmpty()) { + getLogger().trace("Filling buffer"); + for (int i = 0; i < (pageIds.size() < pageBuffer ? pageIds.size() + : pageBuffer); i++) { + bufferedPages.add(wiki.getPage(pageIds.remove(0))); + } + } + //get next page from buffer + page = bufferedPages.remove(0); + + getLogger().trace("Processing article: " + page.getTitle()); + + addDocumentMetaData(jcas, page); + + if (!isValidPage(page)) { + jcas.setDocumentText(""); + return; + } + + if (outputPlainText) { + jcas.setDocumentText(WikiUtils + .cleanText(getPlainDocumentText(page))); + } + else { + jcas.setDocumentText(getDocumentText(page)); + } + + } + catch (WikiApiException e) { + throw new CollectionException(e); + } + + currentArticleIndex++; + } + + /** + * Only accept article pages and (if includeDiscussions=true) talk pages + * + * @param page + * the page that should be checked for validity + * @return true, if page is valid. false, else + * @throws WikiTitleParsingException + * if the page title cannot be parsed. + */ + private boolean isValidPage(Page page) + throws WikiTitleParsingException + { + return !page.isDisambiguation() && !page.isRedirect() + && (inludeDiscussions || (!inludeDiscussions && !page.isDiscussion())); + } + + @Override + public Progress[] getProgress() + { + return new Progress[] { new ProgressImpl( + Long.valueOf(currentArticleIndex).intValue(), + Long.valueOf(nrOfArticles).intValue(), Progress.ENTITIES) }; + } + + private String getDocumentText(Page page) + { + return page.getText(); + } + + private String getPlainDocumentText(Page page) + { + String text = ""; + ParsedPage pp = parser.parse(page.getText()); + + if (onlyFirstParagraph) { + if (pp != null && pp.getParagraph(0) != null) { + text = pp.getParagraph(0).getText(); + } + } + else { + if (pp != null ) { + text = pp.getText(); + } + } + return text; + } + + /** + * Double checks a list of page ids and checks for each id that belongs to a discussion page the + * corresponding article if it is blacklisted
+ *
+ * This is an rather expensive operation! + * + * @param idsToDoubleCheck + * the set of ids that should be double checked + * @param blIds + * a set with ids of blacklisted articles + * @return a the list of articles after double checking + * @throws WikiApiException + * if the wiki data cannot be accessed. + */ + private Set doubleCheckAssociatedArticles(Set idsToDoubleCheck, + Set blIds) + throws WikiApiException + { + if (idsToDoubleCheck.size() > 20000) { + getLogger().info("You want to double check " + idsToDoubleCheck.size() + + " articles in the whitelist. This can take a very long time." + + System.getProperty("line.separator") + + "If you do not need ALL pages that meet the specified requirements, " + + "you might speed things up by setting PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ."); + } + + Set doubleFilteredArticles = new HashSet(); + + // do the additional filtering + for (Integer id : idsToDoubleCheck) { + try { + String curPageTitle = wiki.getTitle(id).getWikiStyleTitle(); + + // check associated discussion or article + if (curPageTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)) { + curPageTitle = curPageTitle.replaceAll(WikiConstants.DISCUSSION_PREFIX, ""); + + if (curPageTitle.contains("/")) { + // If we have a discussion archive + String[] parts = curPageTitle.split("/"); + if (parts != null && parts.length > 0 && parts[0].length() > 0) { + curPageTitle = parts[0]; + } + + } + + List curArticleIds = wiki.getPageIds(curPageTitle); + for (int curArtId : curArticleIds) { + if (blIds.contains(curArtId)) { + // select id of current page for removal + doubleFilteredArticles.add(id); + } + } + } + else { + List curDiscussionIds = wiki + .getPageIds(WikiConstants.DISCUSSION_PREFIX + curPageTitle); + for (int curDiscId : curDiscussionIds) { + if (blIds.contains(curDiscId)) { + // select id of current page for removal + doubleFilteredArticles.add(id); + } + } + } + } + catch (WikiPageNotFoundException e) { + // just go on with the next id + } + } + + idsToDoubleCheck.removeAll(doubleFilteredArticles); + return idsToDoubleCheck; + } + + private void addDocumentMetaData(JCas jcas, Page page) + throws WikiTitleParsingException + { + DocumentMetaData metaData = DocumentMetaData.create(jcas); + metaData.setDocumentTitle(page.getTitle().getWikiStyleTitle()); + metaData.setCollectionId(Integer.valueOf(page.getPageId()).toString()); + metaData.setDocumentId(Integer.valueOf(page.getPageId()).toString()); + metaData.setLanguage(dbconfig.getLanguage().toString()); + } +} diff --git a/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/package-info.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/package-info.java new file mode 100644 index 0000000000..871a642448 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for the Wikipedia files using + * JWPL (read-only). + * + * @since 1.1.0 + */ +package org.dkpro.core.io.jwpl; diff --git a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/util/WikiUtils.java b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/util/WikiUtils.java similarity index 85% rename from dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/util/WikiUtils.java rename to dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/util/WikiUtils.java index f2087f9ab6..102d51587b 100644 --- a/dkpro-core-io-jwpl-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/util/WikiUtils.java +++ b/dkpro-core-io-jwpl-asl/src/main/java/org/dkpro/core/io/jwpl/util/WikiUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl.util; +package org.dkpro.core.io.jwpl.util; import java.util.HashMap; import java.util.Map; @@ -99,40 +99,41 @@ public static String cleanText(String text) return plainText; } - /** - * Creates a Wikipedia object from a DBConfig annotation without the need to - * manually create the intermediary DatabaseConfiguration. - * - * @param confAnnotation - * annotation containing the db credentials - * @return a Wikipedia object - * @throws WikiApiException - * if the Wikipedia object could not be created - */ - public static Wikipedia getWikipedia(DBConfig confAnnotation) - throws WikiApiException - { - DatabaseConfiguration config = new DatabaseConfiguration(); - config.setHost(confAnnotation.getHost()); - config.setDatabase(confAnnotation.getDB()); - config.setUser(confAnnotation.getUser()); - config.setPassword(confAnnotation.getPassword()); - config.setLanguage(Language.valueOf(confAnnotation.getLanguage())); - return new Wikipedia(config); + /** + * Creates a Wikipedia object from a DBConfig annotation without the need to + * manually create the intermediary DatabaseConfiguration. + * + * @param confAnnotation + * annotation containing the db credentials + * @return a Wikipedia object + * @throws WikiApiException + * if the Wikipedia object could not be created + */ + public static Wikipedia getWikipedia(DBConfig confAnnotation) + throws WikiApiException + { + DatabaseConfiguration config = new DatabaseConfiguration(); + config.setHost(confAnnotation.getHost()); + config.setDatabase(confAnnotation.getDB()); + config.setUser(confAnnotation.getUser()); + config.setPassword(confAnnotation.getPassword()); + config.setLanguage(Language.valueOf(confAnnotation.getLanguage())); + return new Wikipedia(config); } - - public static String jwplLanguage2dkproLanguage(Language jwplLanguage) { - if (jwpl2dkproLanguageMap.containsKey(jwplLanguage.name())) { - return jwpl2dkproLanguageMap.get(jwplLanguage.name()); + + public static String jwplLanguage2dkproLanguage(Language jwplLanguage) { + if (jwpl2dkproLanguageMap.containsKey(jwplLanguage.name())) { + return jwpl2dkproLanguageMap.get(jwplLanguage.name()); - } - else { - System.err.println("Do not know DKPro language for JWPL language: " + jwplLanguage.name()); - return "x-unknown"; - } - } - - @SuppressWarnings("serial") + } + else { + System.err.println( + "Do not know DKPro language for JWPL language: " + jwplLanguage.name()); + return "x-unknown"; + } + } + + @SuppressWarnings("serial") private static Map jwpl2dkproLanguageMap = new HashMap() {{ // abkhazian, // afar, @@ -194,7 +195,7 @@ public static String jwplLanguage2dkproLanguage(Language jwplLanguage) { // dutch_low_saxon, // dzongkha, // emilian_romagnol, - put("english", "en"); + put("english", "en"); // esperanto, // estonian, // ewe, @@ -207,10 +208,10 @@ public static String jwplLanguage2dkproLanguage(Language jwplLanguage) { // fula, // galician, // georgian, - put("german", "de"); + put("german", "de"); // gilaki, // gothic, - put("greek", "el"); + put("greek", "el"); // greenlandic, // guarani, // gujarati, @@ -390,6 +391,7 @@ public static String jwplLanguage2dkproLanguage(Language jwplLanguage) { // zealandic, // zhuang, // zulu, - put("_test", "en"); - }}; + put("_test", "en"); + } + }; } diff --git a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleReaderTest.java deleted file mode 100644 index b049ffb3cd..0000000000 --- a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaArticleReaderTest.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; - -@Ignore("Relies on non-public server") -public class WikipediaArticleReaderTest -{ - @Test - public void wikipediaReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaArticleReader.class, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - } - - assertEquals(28, i); - } - - @Test - public void wikipediaArticleIdReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaArticleReader.class, - WikipediaArticleReader.PARAM_PAGE_ID_LIST, new String[]{"1041","103","107"}, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - } - - assertEquals(3, i); - } - - @Test - public void wikipediaArticleTitleReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaArticleReader.class, - WikipediaArticleReader.PARAM_PAGE_TITLE_LIST, new String[]{"TK1","TK3"}, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - } - - assertEquals(2, i); - } - - @Test - public void wikipediaArticleIdFileReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaArticleReader.class, - WikipediaArticleReader.PARAM_PATH_TO_PAGE_ID_LIST, "src/test/resources/idList", - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - } - - assertEquals(3, i); - } - - @Test - public void wikipediaArticleTitleFileReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaArticleReader.class, - WikipediaArticleReader.PARAM_PATH_TO_PAGE_TITLE_LIST, "src/test/resources/titleList", - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - } - - assertEquals(2, i); - } -} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaLinkReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaLinkReaderTest.java deleted file mode 100644 index 782cfa62c6..0000000000 --- a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaLinkReaderTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink; -import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; -import de.tudarmstadt.ukp.wikipedia.parser.Link; - -@Ignore("Relies on non-public server") -public class WikipediaLinkReaderTest -{ - @Test - public void wikipediaReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaLinkReader.class, - WikipediaLinkReader.PARAM_ALLOWED_LINK_TYPES, new String[]{Link.type.INTERNAL.name()}, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - } - - assertEquals(28, i); - } - - @Test - public void wikipediaLinkReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaLinkReader.class, - WikipediaLinkReader.PARAM_ALLOWED_LINK_TYPES, new String[]{Link.type.INTERNAL.name()}, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test); - - int linkCounter = 0; - for (JCas jcas : new JCasIterable(reader)) { - for(WikipediaLink link : JCasUtil.select(jcas, WikipediaLink.class)){ - System.out.println(link.getCoveredText()); - linkCounter++; - } - assertNotNull(jcas); - } - - assertEquals(0, linkCounter); - } -} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaQueryReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaQueryReaderTest.java deleted file mode 100644 index 8c34c33d66..0000000000 --- a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaQueryReaderTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; - -@Ignore("Relies on non-public server") -public class WikipediaQueryReaderTest -{ - @Test - public void wikipediaReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaQueryReader.class, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test, - WikipediaQueryReader.PARAM_TITLE_PATTERN, "UK%"); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - } - - assertEquals(1, i); - } - - @Test - public void wikipediaReaderTest2() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaQueryReader.class, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_test", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language._test, - WikipediaQueryReader.PARAM_MIN_TOKENS, 1, - WikipediaQueryReader.PARAM_MAX_TOKENS, 200, - WikipediaQueryReader.PARAM_TITLE_PATTERN, "UK%"); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - } - - assertEquals(1, i); - } -} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionPairReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionPairReaderTest.java deleted file mode 100644 index 82c80cf16a..0000000000 --- a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionPairReaderTest.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertNotNull; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; - -@Ignore("Relies on non-public server") -public class WikipediaRevisionPairReaderTest -{ - // FIXME currently there is no test database to test revisions - @Test - public void wikipediaRevisionReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaRevisionPairReader.class, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_simple_20090119", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language.simple_english); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - - JCas view1 = jcas.getView(WikipediaRevisionPairReader.REVISION_1); - JCas view2 = jcas.getView(WikipediaRevisionPairReader.REVISION_2); - - DocumentMetaData md1 = DocumentMetaData.get(view1); - DocumentMetaData md2 = DocumentMetaData.get(view2); - -// System.out.println(md1); -// System.out.println(md2); - - i++; - if (i > 10) { - break; - } - } - } -} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReaderTest.java deleted file mode 100644 index 2e97648c9f..0000000000 --- a/dkpro-core-io-jwpl-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/jwpl/WikipediaRevisionReaderTest.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.jwpl; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertNotNull; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; - -@Ignore("Relies on non-public server") -public class WikipediaRevisionReaderTest -{ - // FIXME currently there is no test database to test revisions - @Test - public void wikipediaRevisionReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - WikipediaRevisionReader.class, - WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", - WikipediaReaderBase.PARAM_DB, "wikiapi_simple_20090119", - WikipediaReaderBase.PARAM_USER, "student", - WikipediaReaderBase.PARAM_PASSWORD, "student", - WikipediaReaderBase.PARAM_LANGUAGE, Language.simple_english); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - assertNotNull(jcas); - i++; - if (i > 1000) { - break; - } - } - } -} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaArticleReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaArticleReaderTest.java new file mode 100644 index 0000000000..a75689f223 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaArticleReaderTest.java @@ -0,0 +1,145 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.jwpl.WikipediaArticleReader; +import org.dkpro.core.io.jwpl.WikipediaReaderBase; +import org.junit.Ignore; +import org.junit.Test; + +import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; + +@Ignore("Relies on non-public server") +public class WikipediaArticleReaderTest +{ + @Test + public void wikipediaReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaArticleReader.class, + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + } + + assertEquals(28, i); + } + + @Test + public void wikipediaArticleIdReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaArticleReader.class, + WikipediaArticleReader.PARAM_PAGE_ID_LIST, new String[]{"1041","103","107"}, + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + } + + assertEquals(3, i); + } + + @Test + public void wikipediaArticleTitleReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaArticleReader.class, + WikipediaArticleReader.PARAM_PAGE_TITLE_LIST, new String[]{"TK1","TK3"}, + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + } + + assertEquals(2, i); + } + + @Test + public void wikipediaArticleIdFileReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaArticleReader.class, + WikipediaArticleReader.PARAM_PATH_TO_PAGE_ID_LIST, "src/test/resources/idList", + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + } + + assertEquals(3, i); + } + + @Test + public void wikipediaArticleTitleFileReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaArticleReader.class, + WikipediaArticleReader.PARAM_PATH_TO_PAGE_TITLE_LIST, "src/test/resources/titleList", + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + } + + assertEquals(2, i); + } +} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaLinkReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaLinkReaderTest.java new file mode 100644 index 0000000000..b3221c3337 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaLinkReaderTest.java @@ -0,0 +1,86 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.jwpl.WikipediaLinkReader; +import org.dkpro.core.io.jwpl.WikipediaReaderBase; +import org.junit.Ignore; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink; +import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; +import de.tudarmstadt.ukp.wikipedia.parser.Link; + +@Ignore("Relies on non-public server") +public class WikipediaLinkReaderTest +{ + @Test + public void wikipediaReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaLinkReader.class, + WikipediaLinkReader.PARAM_ALLOWED_LINK_TYPES, Link.type.INTERNAL.name(), + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + } + + assertEquals(28, i); + } + + @Test + public void wikipediaLinkReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaLinkReader.class, + WikipediaLinkReader.PARAM_ALLOWED_LINK_TYPES, Link.type.INTERNAL.name(), + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test); + + int linkCounter = 0; + for (JCas jcas : new JCasIterable(reader)) { + for (WikipediaLink link : JCasUtil.select(jcas, WikipediaLink.class)) { + System.out.println(link.getCoveredText()); + linkCounter++; + } + assertNotNull(jcas); + } + + assertEquals(0, linkCounter); + } +} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaQueryReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaQueryReaderTest.java new file mode 100644 index 0000000000..d78bbd4da0 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaQueryReaderTest.java @@ -0,0 +1,82 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.jwpl.WikipediaQueryReader; +import org.dkpro.core.io.jwpl.WikipediaReaderBase; +import org.junit.Ignore; +import org.junit.Test; + +import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; + +@Ignore("Relies on non-public server") +public class WikipediaQueryReaderTest +{ + @Test + public void wikipediaReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaQueryReader.class, + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test, + WikipediaQueryReader.PARAM_TITLE_PATTERN, "UK%"); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + } + + assertEquals(1, i); + } + + @Test + public void wikipediaReaderTest2() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaQueryReader.class, + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_test", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language._test, + WikipediaQueryReader.PARAM_MIN_TOKENS, 1, + WikipediaQueryReader.PARAM_MAX_TOKENS, 200, + WikipediaQueryReader.PARAM_TITLE_PATTERN, "UK%"); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + } + + assertEquals(1, i); + } +} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaRevisionPairReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaRevisionPairReaderTest.java new file mode 100644 index 0000000000..9f11199e40 --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaRevisionPairReaderTest.java @@ -0,0 +1,69 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertNotNull; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.jwpl.WikipediaReaderBase; +import org.dkpro.core.io.jwpl.WikipediaRevisionPairReader; +import org.junit.Ignore; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; + +@Ignore("Relies on non-public server") +public class WikipediaRevisionPairReaderTest +{ + // FIXME currently there is no test database to test revisions + @Test + public void wikipediaRevisionReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaRevisionPairReader.class, + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_simple_20090119", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language.simple_english); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + + JCas view1 = jcas.getView(WikipediaRevisionPairReader.REVISION_1); + JCas view2 = jcas.getView(WikipediaRevisionPairReader.REVISION_2); + + DocumentMetaData md1 = DocumentMetaData.get(view1); + DocumentMetaData md2 = DocumentMetaData.get(view2); + +// System.out.println(md1); +// System.out.println(md2); + + i++; + if (i > 10) { + break; + } + } + } +} diff --git a/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaRevisionReaderTest.java b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaRevisionReaderTest.java new file mode 100644 index 0000000000..ba40f7a22b --- /dev/null +++ b/dkpro-core-io-jwpl-asl/src/test/java/org/dkpro/core/io/jwpl/WikipediaRevisionReaderTest.java @@ -0,0 +1,58 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.jwpl; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertNotNull; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.jwpl.WikipediaReaderBase; +import org.dkpro.core.io.jwpl.WikipediaRevisionReader; +import org.junit.Ignore; +import org.junit.Test; + +import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language; + +@Ignore("Relies on non-public server") +public class WikipediaRevisionReaderTest +{ + // FIXME currently there is no test database to test revisions + @Test + public void wikipediaRevisionReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + WikipediaRevisionReader.class, + WikipediaReaderBase.PARAM_HOST, "bender.ukp.informatik.tu-darmstadt.de", + WikipediaReaderBase.PARAM_DB, "wikiapi_simple_20090119", + WikipediaReaderBase.PARAM_USER, "student", + WikipediaReaderBase.PARAM_PASSWORD, "student", + WikipediaReaderBase.PARAM_LANGUAGE, Language.simple_english); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + assertNotNull(jcas); + i++; + if (i > 1000) { + break; + } + } + } +} diff --git a/dkpro-core-io-jwpl-asl/suppressions.xml b/dkpro-core-io-jwpl-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-io-jwpl-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-io-lcc-asl/pom.xml b/dkpro-core-io-lcc-asl/pom.xml index 0304d8045b..3680837950 100644 --- a/dkpro-core-io-lcc-asl/pom.xml +++ b/dkpro-core-io-lcc-asl/pom.xml @@ -19,15 +19,15 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - org.dkpro.core dkpro-core-io-lcc-asl jar DKPro Core ASL - IO - Leipzig Corpora Collection (LCC) + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -42,20 +42,20 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl junit diff --git a/dkpro-core-io-lcc-asl/src/main/java/org/dkpro/core/io/lcc/LccReader.java b/dkpro-core-io-lcc-asl/src/main/java/org/dkpro/core/io/lcc/LccReader.java index 4c2c7783c9..01d8568685 100644 --- a/dkpro-core-io-lcc-asl/src/main/java/org/dkpro/core/io/lcc/LccReader.java +++ b/dkpro-core-io-lcc-asl/src/main/java/org/dkpro/core/io/lcc/LccReader.java @@ -35,18 +35,18 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; /** * Reader for sentence-based Leipzig Corpora Collection files. */ -@ResourceMetaData(name="Leipzig Corpora Collection Reader") +@ResourceMetaData(name = "Leipzig Corpora Collection Reader") @MimeTypeCapability({MimeTypes.TEXT_X_LCC}) @TypeCapability( outputs = { @@ -58,7 +58,8 @@ public class LccReader * Name of configuration parameter that contains the character encoding used by the input files. */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; /** @@ -68,17 +69,17 @@ public class LccReader @ConfigurationParameter(name = PARAM_WRITE_SENTENCE, mandatory = true, defaultValue = "false") private boolean writeSentence; - /** - * How many input sentences should be merged into one CAS. - */ - public static final String PARAM_SENTENCES_PER_CAS = "sentencesPerCAS"; - @ConfigurationParameter(name = PARAM_SENTENCES_PER_CAS, mandatory = true, defaultValue = "100") - private int sentencesPerCAS; - + /** + * How many input sentences should be merged into one CAS. + */ + public static final String PARAM_SENTENCES_PER_CAS = "sentencesPerCAS"; + @ConfigurationParameter(name = PARAM_SENTENCES_PER_CAS, mandatory = true, defaultValue = "100") + private int sentencesPerCAS; + private Resource res; - private int casOffset; - private BufferedReader br; - private List sentenceBuffer; + private int casOffset; + private BufferedReader br; + private List sentenceBuffer; @Override public void initialize(UimaContext context) @@ -107,49 +108,49 @@ public boolean hasNext() return !sentenceBuffer.isEmpty(); } - @Override - public void getNext(JCas aJCas) throws IOException, CollectionException { - initCas(aJCas, res, String.valueOf(casOffset)); - - StringBuilder sb = new StringBuilder(); - int offset = 0; - for (String sentence : sentenceBuffer) { - if (writeSentence) { - Sentence sAnno = new Sentence(aJCas, offset, offset + sentence.length()); - sAnno.addToIndexes(); - } - sb.append(sentence); - offset += sentence.length(); - sb.append("\n"); - offset++; - } - aJCas.setDocumentText(sb.toString()); - - sentenceBuffer.clear(); - casOffset++; + @Override + public void getNext(JCas aJCas) throws IOException, CollectionException { + initCas(aJCas, res, String.valueOf(casOffset)); + + StringBuilder sb = new StringBuilder(); + int offset = 0; + for (String sentence : sentenceBuffer) { + if (writeSentence) { + Sentence sAnno = new Sentence(aJCas, offset, offset + sentence.length()); + sAnno.addToIndexes(); + } + sb.append(sentence); + offset += sentence.length(); + sb.append("\n"); + offset++; + } + aJCas.setDocumentText(sb.toString()); + + sentenceBuffer.clear(); + casOffset++; step(); - } - - // TODO find some way to properly estimate progress - @Override - public Progress[] getProgress() { + } + + // TODO find some way to properly estimate progress + @Override + public Progress[] getProgress() { return new Progress[] { new ProgressImpl(casOffset, casOffset, "document") }; - } - + } + @Override public void destroy() { closeAll(); super.destroy(); } - + private void closeAll() { res = null; closeQuietly(br); br = null; } - + /** * Seek article in file. Stop once article element has been found without reading it. */ diff --git a/dkpro-core-io-lcc-asl/src/test/java/org/dkpro/core/io/lcc/LccReaderTest.java b/dkpro-core-io-lcc-asl/src/test/java/org/dkpro/core/io/lcc/LccReaderTest.java index fd96e90fa8..cf7d470eb1 100644 --- a/dkpro-core-io-lcc-asl/src/test/java/org/dkpro/core/io/lcc/LccReaderTest.java +++ b/dkpro-core-io-lcc-asl/src/test/java/org/dkpro/core/io/lcc/LccReaderTest.java @@ -38,13 +38,13 @@ public void testDefault() LccReader.class, LccReader.PARAM_SOURCE_LOCATION, "src/test/resources/text/sample.txt"); - int i=0; + int i = 0; for (JCas jcas : new JCasIterable(reader)) { - if (i==0) { - assertEquals(3904, jcas.getDocumentText().length()); - } - i++; - }; + if (i == 0) { + assertEquals(3904, jcas.getDocumentText().length()); + } + i++; + } assertEquals(3, i); } @@ -58,13 +58,13 @@ public void testSmallBuffer() LccReader.PARAM_SOURCE_LOCATION, "src/test/resources/text/sample.txt", LccReader.PARAM_SENTENCES_PER_CAS, 2); - int i=0; + int i = 0; for (JCas jcas : new JCasIterable(reader)) { - if (i==0) { - assertEquals(91, jcas.getDocumentText().length()); - } - i++; - }; + if (i == 0) { + assertEquals(91, jcas.getDocumentText().length()); + } + i++; + } assertEquals(120, i); } @@ -78,13 +78,13 @@ public void testBigBuffer() LccReader.PARAM_SOURCE_LOCATION, "src/test/resources/text/sample.txt", LccReader.PARAM_SENTENCES_PER_CAS, 300); - int i=0; + int i = 0; for (JCas jcas : new JCasIterable(reader)) { - if (i==0) { - assertEquals(10579, jcas.getDocumentText().length()); - } - i++; - }; + if (i == 0) { + assertEquals(10579, jcas.getDocumentText().length()); + } + i++; + } assertEquals(1, i); } @@ -99,16 +99,16 @@ public void testSentenceWriting() LccReader.PARAM_SENTENCES_PER_CAS, 100, LccReader.PARAM_WRITE_SENTENCE, true); - int i=0; + int i = 0; for (JCas jcas : new JCasIterable(reader)) { - if (i==2) { - assertEquals(39, JCasUtil.select(jcas, Sentence.class).size()); - } - else { - assertEquals(100, JCasUtil.select(jcas, Sentence.class).size()); - } - i++; - }; + if (i == 2) { + assertEquals(39, JCasUtil.select(jcas, Sentence.class).size()); + } + else { + assertEquals(100, JCasUtil.select(jcas, Sentence.class).size()); + } + i++; + } assertEquals(3, i); } diff --git a/dkpro-core-io-lif-asl/pom.xml b/dkpro-core-io-lif-asl/pom.xml index 6da4f4f761..99a88c7c1b 100644 --- a/dkpro-core-io-lif-asl/pom.xml +++ b/dkpro-core-io-lif-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.lif-asl + dkpro-core-io-lif-asl jar DKPro Core ASL - IO - LIF + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,86 +45,66 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-ner-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl org.lappsgrid serialization - 2.3.0 + 2.7.0 org.lappsgrid vocabulary - 2.3.0 + 2.5.0 org.lappsgrid discriminator - 2.2.1 - - - org.codehaus.groovy - groovy-all - 2.4.7 + 2.4.0 it.unimi.dsi fastutil + + eu.openminted.share.annotations + omtd-share-annotations-api + junit junit test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.conll-asl + org.dkpro.core + dkpro-core-io-conll-asl test - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - org.codehaus.groovy:groovy-all - - - - - - diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReader.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReader.java deleted file mode 100644 index d006fc8e14..0000000000 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReader.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.lif; - -import java.io.IOException; -import java.io.InputStream; -import org.apache.commons.io.IOUtils; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.lappsgrid.serialization.Serializer; -import org.lappsgrid.serialization.lif.Container; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.io.lif.internal.Lif2DKPro; - -/** - * Reader for the LIF format. - */ -@ResourceMetaData(name="LAPPS Grid LIF Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_X_LIF_JSON}) -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) -public class LifReader - extends JCasResourceCollectionReader_ImplBase -{ - /** - * Name of configuration parameter that contains the character encoding used by the input files. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String sourceEncoding; - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - Resource res = nextFile(); - initCas(aJCas, res); - - Container container; - try (InputStream is = res.getInputStream()) { - String json = IOUtils.toString(res.getInputStream(), sourceEncoding); - container = Serializer.parse(json, Container.class); - } - - new Lif2DKPro().convert(container, aJCas); - } -} diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriter.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriter.java deleted file mode 100644 index 3f1dbc53b2..0000000000 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriter.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.lif; - -import java.io.OutputStream; -import org.apache.commons.io.IOUtils; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.lappsgrid.serialization.Serializer; -import org.lappsgrid.serialization.lif.Container; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.io.lif.internal.DKPro2Lif; - -/** - * Writer for the LIF format. - */ -@ResourceMetaData(name="LAPPS Grid LIF Writer") -@MimeTypeCapability({MimeTypes.APPLICATION_X_LIF_JSON}) -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) -public class LifWriter - extends JCasFileWriter_ImplBase -{ - /** - * Character encoding of the output data. - */ - public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String targetEncoding; - - /** - * Specify the suffix of output files. Default value .json. If the suffix is not - * needed, provide an empty string as value. - */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".json") - private String filenameSuffix; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - // Convert UIMA to LIF Container - Container container = new Container(); - - new DKPro2Lif().convert(aJCas, container); - - try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { - String json = Serializer.toPrettyJson(container); - IOUtils.write(json, docOS, targetEncoding); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } -} diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java deleted file mode 100644 index 2bd8d2271c..0000000000 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/DKPro2Lif.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.lif.internal; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import java.util.LinkedHashSet; -import java.util.Set; -import java.util.TreeSet; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.TOP; -import org.lappsgrid.discriminator.Discriminators; -import org.lappsgrid.serialization.lif.Annotation; -import org.lappsgrid.serialization.lif.Container; -import org.lappsgrid.serialization.lif.View; -import org.lappsgrid.vocabulary.Features; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; -import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; - -public class DKPro2Lif -{ - private static final String PHRASE_STRUCTURE = "phrasestruct"; - private static final String CONSTITUENT = "const"; - private static final String DEPENDENCY_STRUCTURE = "depstruct"; - private static final String DEPENDENCY = "dep"; - private static final String PARAGRAPH = "para"; - private static final String SENTENCE = "sent"; - private static final String TOKEN = "tok"; - private static final String NAMED_ENTITY = "ne"; - - private Object2IntOpenHashMap counters = new Object2IntOpenHashMap<>(); - private Int2IntOpenHashMap ids = new Int2IntOpenHashMap(); - - public void convert(JCas aJCas, Container container) - { - container.setLanguage(aJCas.getDocumentLanguage()); - container.setText(aJCas.getDocumentText()); - - View view = container.newView(); - - // Paragraph - for (Paragraph p : select(aJCas, Paragraph.class)) { - view.newAnnotation(id(PARAGRAPH, p), Discriminators.Uri.PARAGRAPH, p.getBegin(), - p.getEnd()); - } - - // Sentence - for (Sentence s : select(aJCas, Sentence.class)) { - view.newAnnotation(id(SENTENCE, s), Discriminators.Uri.SENTENCE, s.getBegin(), - s.getEnd()); - } - - // Token, POS, Lemma - for (Token t : select(aJCas, Token.class)) { - Annotation a = view.newAnnotation(id(TOKEN, t), Discriminators.Uri.TOKEN, t.getBegin(), - t.getEnd()); - if (t.getPos() != null) { - a.addFeature(Features.Token.POS, t.getPos().getPosValue()); - } - - if (t.getLemma() != null) { - a.addFeature(Features.Token.LEMMA, t.getLemma().getValue()); - } - } - - // NamedEntity - for (NamedEntity neAnno : select(aJCas, NamedEntity.class)) { - Annotation ne = view.newAnnotation(id(NAMED_ENTITY, neAnno), Discriminators.Uri.NE, - neAnno.getBegin(), neAnno.getEnd()); - ne.setLabel(neAnno.getValue()); - } - - // Dependency - for (Sentence s : select(aJCas, Sentence.class)) { - Set depRelIds = new TreeSet<>(); - - for (Dependency dep : selectCovered(Dependency.class, s)) { - String depRelId = id(DEPENDENCY, dep); - // LAPPS dependencies inherit from Relation which has no offsets - Annotation depRel = view.newAnnotation(depRelId, Discriminators.Uri.DEPENDENCY); - depRel.setLabel(dep.getDependencyType()); - depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor())); - depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent())); - depRelIds.add(depRelId); - } - - if (!depRelIds.isEmpty()) { - Annotation depStruct = view.newAnnotation(id(DEPENDENCY_STRUCTURE, s), - Discriminators.Uri.DEPENDENCY_STRUCTURE, s.getBegin(), s.getEnd()); - depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, depRelIds); - } - } - - // Constituents - for (ROOT r : select(aJCas, ROOT.class)) { - Set constituents = new LinkedHashSet<>(); - convertConstituent(view, r, constituents); - - Annotation phraseStruct = view.newAnnotation(id(PHRASE_STRUCTURE, r), - Discriminators.Uri.PHRASE_STRUCTURE, r.getBegin(), r.getEnd()); - phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents); - } - - } - - private void convertConstituent(View aView, org.apache.uima.jcas.tcas.Annotation aNode, - Set aConstituents) - { - if (aNode instanceof Constituent) { - // LAPPS constituents inherit from Relation which has no offsets - Annotation constituent = aView.newAnnotation(id(CONSTITUENT, aNode), - Discriminators.Uri.CONSTITUENT); - aConstituents.add(constituent.getId()); - - for (org.apache.uima.jcas.tcas.Annotation child : select( - ((Constituent) aNode).getChildren(), org.apache.uima.jcas.tcas.Annotation.class)) { - convertConstituent(aView, child, aConstituents); - } - } - else if (aNode instanceof Token) { - aConstituents.add(id(TOKEN, aNode)); - } - else { - throw new IllegalStateException("Unexpected node type: " + aNode); - } - } - - private String id(String aPrefix, TOP aFS) - { - int id; - // if we already have an ID for the given FS return it - if (ids.containsKey(aFS.getAddress())) { - id = ids.get(aFS.getAddress()); - } - // otherwise generate a new ID - else { - id = counters.getInt(aPrefix); - ids.put(aFS.getAddress(), id); - counters.put(aPrefix, id + 1); - } - - return aPrefix + '-' + id; - } -} diff --git a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java b/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java deleted file mode 100644 index 0e638202a1..0000000000 --- a/dkpro-core-io-lif-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/lif/internal/Lif2DKPro.java +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.lif.internal; - -import static org.apache.commons.lang3.StringUtils.isEmpty; -import static org.apache.commons.lang3.StringUtils.isNotEmpty; -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.uima.fit.util.FSCollectionFactory; -import org.apache.uima.jcas.JCas; -import org.lappsgrid.discriminator.Discriminators; -import org.lappsgrid.serialization.lif.Annotation; -import org.lappsgrid.serialization.lif.Container; -import org.lappsgrid.serialization.lif.View; -import org.lappsgrid.vocabulary.Features; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; - -public class Lif2DKPro -{ - public void convert(Container aContainer, JCas aJCas) - { - aJCas.setDocumentLanguage(aContainer.getLanguage()); - aJCas.setDocumentText(aContainer.getText()); - - View view = aContainer.getView(0); - - // Paragraph - view.getAnnotations().stream() - .filter(a -> Discriminators.Uri.PARAGRAPH.equals(a.getAtType())) - .forEach( - para -> { - Paragraph paraAnno = new Paragraph(aJCas, para.getStart().intValue(), - para.getEnd().intValue()); - paraAnno.addToIndexes(); - }); - - // Sentence - view.getAnnotations().stream() - .filter(a -> Discriminators.Uri.SENTENCE.equals(a.getAtType())) - .forEach( - sent -> { - Sentence sentAnno = new Sentence(aJCas, sent.getStart().intValue(), - sent.getEnd().intValue()); - sentAnno.addToIndexes(); - }); - - Map tokenIdx = new HashMap<>(); - - // Token, POS, Lemma - view.getAnnotations().stream() - .filter(a -> Discriminators.Uri.TOKEN.equals(a.getAtType())) - .forEach( - token -> { - Token tokenAnno = new Token(aJCas, token.getStart().intValue(), token - .getEnd().intValue()); - String pos = token.getFeature(Features.Token.POS); - String lemma = token.getFeature(Features.Token.LEMMA); - - if (isNotEmpty(pos)) { - POS posAnno = new POS(aJCas, tokenAnno.getBegin(), tokenAnno - .getEnd()); - posAnno.setPosValue(pos.intern()); - POSUtils.assignCoarseValue(posAnno); - posAnno.addToIndexes(); - tokenAnno.setPos(posAnno); - } - - if (isNotEmpty(lemma)) { - Lemma lemmaAnno = new Lemma(aJCas, tokenAnno.getBegin(), tokenAnno - .getEnd()); - lemmaAnno.setValue(lemma); - lemmaAnno.addToIndexes(); - tokenAnno.setLemma(lemmaAnno); - } - - tokenAnno.addToIndexes(); - tokenIdx.put(token.getId(), tokenAnno); - }); - - // NamedEntity - view.getAnnotations().stream() - .filter(a -> Discriminators.Uri.NE.equals(a.getAtType())) - .forEach( - ne -> { - NamedEntity neAnno = new NamedEntity(aJCas, ne.getStart().intValue(), - ne.getEnd().intValue()); - neAnno.setValue(ne.getLabel()); - neAnno.addToIndexes(); - }); - - // Dependencies - view.getAnnotations().stream() - .filter(a -> Discriminators.Uri.DEPENDENCY.equals(a.getAtType())) - .forEach( - dep -> { - String dependent = dep.getFeature(Features.Dependency.DEPENDENT); - String governor = dep.getFeature(Features.Dependency.GOVERNOR); - - if (isEmpty(governor) || governor.equals(dependent)) { - ROOT depAnno = new ROOT(aJCas); - depAnno.setDependencyType(dep.getLabel()); - depAnno.setDependent(tokenIdx.get(dependent)); - depAnno.setGovernor(tokenIdx.get(dependent)); - depAnno.setBegin(depAnno.getDependent().getBegin()); - depAnno.setEnd(depAnno.getDependent().getEnd()); - depAnno.addToIndexes(); - } - else { - Dependency depAnno = new Dependency(aJCas); - depAnno.setDependencyType(dep.getLabel()); - depAnno.setDependent(tokenIdx.get(dependent)); - depAnno.setGovernor(tokenIdx.get(governor)); - depAnno.setBegin(depAnno.getDependent().getBegin()); - depAnno.setEnd(depAnno.getDependent().getEnd()); - depAnno.addToIndexes(); - } - }); - - // Constituents - view.getAnnotations().stream() - .filter(a -> Discriminators.Uri.PHRASE_STRUCTURE.equals(a.getAtType())) - .forEach( - ps -> { - String rootId = findRoot(view, ps); - // Get the constituent IDs - Set constituentIDs; - constituentIDs = new HashSet<>( - getSetFeature(ps,Features.PhraseStructure.CONSTITUENTS)); - - List constituents = new ArrayList<>(); - Map constituentIdx = new HashMap<>(); - - // Instantiate all the constituents - view.getAnnotations().stream() - .filter(a -> constituentIDs.contains(a.getId())) - .forEach(con -> { - if (Discriminators.Uri.CONSTITUENT.equals(con.getAtType())) { - Constituent conAnno; - if (rootId.equals(con.getId())) { - conAnno = new de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT(aJCas); - } - else { - conAnno = new Constituent(aJCas); - } - if (con.getStart() != null) { - conAnno.setBegin(con.getStart().intValue()); - } - if (con.getEnd() != null) { - conAnno.setEnd(con.getEnd().intValue()); - } - conAnno.setConstituentType(con.getLabel()); - constituentIdx.put(con.getId(), conAnno); - constituents.add(con); - } - // If it is not a constituent, it must be a token ID - we already - // have created the tokens and recorded them in the tokenIdx - }); - - // Set parent and children features - constituents.forEach(con -> { - // Check if it is a constituent or token - Constituent conAnno = constituentIdx.get(con.getId()); - Set childIDs = getSetFeature(con, - Features.Constituent.CHILDREN); - - List children = new ArrayList<>(); - childIDs.forEach(childID -> { - Constituent conChild = constituentIdx.get(childID); - Token tokenChild = tokenIdx.get(childID); - if (conChild != null && tokenChild == null) { - conChild.setParent(conAnno); - children.add(conChild); - } - else if (conChild == null && tokenChild != null) { - tokenChild.setParent(conAnno); - children.add(tokenChild); - } - else if (conChild == null && tokenChild == null) { - throw new IllegalStateException("ID [" + con.getId() - + "] not found"); - } - else { - throw new IllegalStateException("ID [" + con.getId() - + "] is constituent AND token? Impossible!"); - } - }); - - conAnno.setChildren(FSCollectionFactory.createFSArray(aJCas, children)); - }); - - // Percolate offsets - they might not have been set on the constituents! - Constituent root = constituentIdx.get(rootId); - percolateOffsets(root); - - // Add to indexes - constituentIdx.values().forEach(conAnno -> { - conAnno.addToIndexes(); - }); - }); - } - - @SuppressWarnings("unchecked") - private Set getSetFeature(Annotation aAnnotation, String aName) - { - return aAnnotation.getFeatureSet(aName); - } - - private void percolateOffsets(org.apache.uima.jcas.tcas.Annotation aNode) - { - if (aNode instanceof Constituent) { - Constituent conAnno = (Constituent) aNode; - int begin = Integer.MAX_VALUE; - int end = 0; - for (org.apache.uima.jcas.tcas.Annotation a : select(conAnno.getChildren(), - org.apache.uima.jcas.tcas.Annotation.class)) { - percolateOffsets(a); - - begin = Math.min(a.getBegin(), begin); - end = Math.max(a.getEnd(), end); - } - - if (aNode.getBegin() != 0) { - assert begin == aNode.getBegin(); - } - else { - aNode.setBegin(begin); - } - - if (aNode.getEnd() != 0) { - assert end == aNode.getEnd(); - } - else { - aNode.setEnd(end); - } - } - } - - private String findRoot(View aView, Annotation aPS) - { - // Get all the constituents int he phrase structure - Set constituents = new HashSet<>( - getSetFeature(aPS, Features.PhraseStructure.CONSTITUENTS)); - - List psConstituents = aView.getAnnotations().stream() - .filter(a -> Discriminators.Uri.CONSTITUENT.equals(a.getAtType())) - .filter(con -> constituents.contains(con.getId())) - .collect(Collectors.toList()); - - // Remove all constituents that are children of other constituents within the PS - psConstituents.forEach(con -> { - Set children = getSetFeature(con, Features.Constituent.CHILDREN); - children.forEach(child -> constituents.remove(child)); - }); - - // If all went well, only one constituent should be left and that is the root constituent - assert 1 == constituents.size(); - - // Return the ID of the root constituent - return constituents.iterator().next(); - } -} diff --git a/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/LifReader.java b/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/LifReader.java new file mode 100644 index 0000000000..98f1253062 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/LifReader.java @@ -0,0 +1,89 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.lif; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.lif.internal.Lif2DKPro; +import org.lappsgrid.serialization.DataContainer; +import org.lappsgrid.serialization.Serializer; +import org.lappsgrid.serialization.lif.Container; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reader for the LIF format. + */ +@ResourceMetaData(name = "LAPPS Grid LIF Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_X_LIF_JSON}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) +public class LifReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * Name of configuration parameter that contains the character encoding used by the input files. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + Container container; + try (InputStream is = res.getInputStream()) { + String json = IOUtils.toString(res.getInputStream(), sourceEncoding); + try { + // First try parsing without the wire wrapper. + container = Serializer.parse(json, Container.class); + } + catch (Exception e) { + // If that fails, it might be because there is a wire wrapper around the actual + // data, so let's try that. + container = (Container) Serializer.parse(json, DataContainer.class).getPayload(); + } + } + + new Lif2DKPro().convert(container, aJCas); + } +} diff --git a/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/LifWriter.java b/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/LifWriter.java new file mode 100644 index 0000000000..bf92f0d9b1 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/LifWriter.java @@ -0,0 +1,119 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.lif; + +import java.io.OutputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.lif.internal.DKPro2Lif; +import org.lappsgrid.serialization.DataContainer; +import org.lappsgrid.serialization.Serializer; +import org.lappsgrid.serialization.lif.Container; +import org.lappsgrid.serialization.lif.View; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Writer for the LIF format. + */ +@ResourceMetaData(name = "LAPPS Grid LIF Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_X_LIF_JSON}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) +public class LifWriter + extends JCasFileWriter_ImplBase +{ + /** + * Character encoding of the output data. + */ + public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String targetEncoding; + + /** + * Specify the suffix of output files. Default value .lif. If the suffix is not + * needed, provide an empty string as value. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".lif") + private String filenameSuffix; + + /** + * Write timestamp to view. + */ + public static final String PARAM_WRITE_TIMESTAMP = "writeTimestamp"; + @ConfigurationParameter(name = PARAM_WRITE_TIMESTAMP, mandatory = true, defaultValue = "true") + private boolean writeTimestamp; + + + /** + * Wrap as data object. + */ + public static final String PARAM_ADD_ENVELOPE = "wrapAsDataObject"; + @ConfigurationParameter(name = PARAM_ADD_ENVELOPE, mandatory = true, defaultValue = "false") + private boolean wrapAsDataObject; + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + // Convert UIMA to LIF Container + Container container = new Container(); + + new DKPro2Lif().convert(aJCas, container); + + // Clear timestamp if requested. + if (!writeTimestamp) { + for (View view : container.getViews()) { + view.setTimestamp(null); + } + } + + Object finalOutputObject = container; + if (wrapAsDataObject) { + finalOutputObject = new DataContainer(container); + } + + try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { + String json = Serializer.toPrettyJson(finalOutputObject); + IOUtils.write(json, docOS, targetEncoding); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/internal/DKPro2Lif.java b/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/internal/DKPro2Lif.java new file mode 100644 index 0000000000..e6a4a59c41 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/internal/DKPro2Lif.java @@ -0,0 +1,212 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.lif.internal; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.util.LinkedHashSet; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.TOP; +import org.lappsgrid.discriminator.Discriminators; +import org.lappsgrid.serialization.lif.Annotation; +import org.lappsgrid.serialization.lif.Container; +import org.lappsgrid.serialization.lif.View; +import org.lappsgrid.vocabulary.Features; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; + +public class DKPro2Lif +{ + private static final String DKPRO_CORE_LIF_CONVERTER = "DKPro Core LIF Converter"; + + private static final String PHRASE_STRUCTURE = "phrasestruct"; + private static final String CONSTITUENT = "const"; + private static final String DEPENDENCY_STRUCTURE = "depstruct"; + private static final String DEPENDENCY = "dep"; + private static final String PARAGRAPH = "para"; + private static final String SENTENCE = "sent"; + private static final String TOKEN = "tok"; + private static final String NAMED_ENTITY = "ne"; + + private Object2IntOpenHashMap counters = new Object2IntOpenHashMap<>(); + private Int2IntOpenHashMap ids = new Int2IntOpenHashMap(); + + public void convert(JCas aJCas, Container container) + { + container.setLanguage(aJCas.getDocumentLanguage()); + container.setText(aJCas.getDocumentText()); + + View view = container.newView(); + + // Paragraph + for (Paragraph p : select(aJCas, Paragraph.class)) { + convertParagraph(view, p); + } + view.addContains(Discriminators.Uri.PARAGRAPH, DKPRO_CORE_LIF_CONVERTER, "Paragraph"); + + // Sentence + for (Sentence s : select(aJCas, Sentence.class)) { + convertSentence(view, s); + } + view.addContains(Discriminators.Uri.SENTENCE, DKPRO_CORE_LIF_CONVERTER, "Sentence"); + + // Token, POS, Lemma + for (Token t : select(aJCas, Token.class)) { + convertToken(view, t); + } + view.addContains(Discriminators.Uri.TOKEN, DKPRO_CORE_LIF_CONVERTER, "Token"); + view.addContains(Discriminators.Uri.LEMMA, DKPRO_CORE_LIF_CONVERTER, "Lemma"); + view.addContains(Discriminators.Uri.POS, DKPRO_CORE_LIF_CONVERTER, "POS"); + + // NamedEntity + for (NamedEntity ne : select(aJCas, NamedEntity.class)) { + convertNamedEntity(view, ne); + } + view.addContains(Discriminators.Uri.NE, DKPRO_CORE_LIF_CONVERTER, "Named entity"); + + // Dependencies + for (Sentence s : select(aJCas, Sentence.class)) { + convertDependencies(view, s); + } + view.addContains(Discriminators.Uri.DEPENDENCY, DKPRO_CORE_LIF_CONVERTER, "Dependencies"); + + // Constituents + for (ROOT r : select(aJCas, ROOT.class)) { + convertConstituents(view, r); + } + view.addContains(Discriminators.Uri.PHRASE_STRUCTURE, DKPRO_CORE_LIF_CONVERTER, + "Constituents"); + } + + private void convertParagraph(View aTarget, Paragraph aParagraph) + { + aTarget.newAnnotation(id(PARAGRAPH, aParagraph), Discriminators.Uri.PARAGRAPH, + aParagraph.getBegin(), aParagraph.getEnd()); + } + + private void convertSentence(View aTarget, Sentence aSentence) + { + aTarget.newAnnotation(id(SENTENCE, aSentence), Discriminators.Uri.SENTENCE, + aSentence.getBegin(), aSentence.getEnd()); + } + + private void convertToken(View aTarget, Token aToken) + { + Annotation a = aTarget.newAnnotation(id(TOKEN, aToken), Discriminators.Uri.TOKEN, + aToken.getBegin(), aToken.getEnd()); + if (aToken.getPos() != null) { + a.addFeature(Features.Token.POS, aToken.getPos().getPosValue()); + } + + if (aToken.getLemma() != null) { + a.addFeature(Features.Token.LEMMA, aToken.getLemma().getValue()); + } + } + + private void convertNamedEntity(View aTarget, NamedEntity aNamedEntity) + { + Annotation ne = aTarget.newAnnotation(id(NAMED_ENTITY, aNamedEntity), Discriminators.Uri.NE, + aNamedEntity.getBegin(), aNamedEntity.getEnd()); + ne.addFeature("category", aNamedEntity.getValue()); + } + + private void convertDependencies(View aView, Sentence aSentence) + { + Set depRelIds = new TreeSet<>(); + + for (Dependency dep : selectCovered(Dependency.class, aSentence)) { + String depRelId = id(DEPENDENCY, dep); + // LAPPS dependencies inherit from Relation which has no offsets + Annotation depRel = aView.newAnnotation(depRelId, Discriminators.Uri.DEPENDENCY); + depRel.setLabel(dep.getDependencyType()); + depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor())); + depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent())); + depRelIds.add(depRelId); + } + + if (!depRelIds.isEmpty()) { + Annotation depStruct = aView.newAnnotation(id(DEPENDENCY_STRUCTURE, aSentence), + Discriminators.Uri.DEPENDENCY_STRUCTURE, aSentence.getBegin(), + aSentence.getEnd()); + depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, depRelIds); + } + } + + private void convertConstituents(View aTarget, ROOT aRootConstituent) + { + Set constituents = new LinkedHashSet<>(); + convertConstituent(aTarget, aRootConstituent, constituents); + + Annotation phraseStruct = aTarget.newAnnotation(id(PHRASE_STRUCTURE, aRootConstituent), + Discriminators.Uri.PHRASE_STRUCTURE, aRootConstituent.getBegin(), + aRootConstituent.getEnd()); + phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents); + } + + private void convertConstituent(View aView, org.apache.uima.jcas.tcas.Annotation aNode, + Set aConstituents) + { + if (aNode instanceof Constituent) { + // LAPPS constituents inherit from Relation which has no offsets + Annotation constituent = aView.newAnnotation(id(CONSTITUENT, aNode), + Discriminators.Uri.CONSTITUENT); + aConstituents.add(constituent.getId()); + + for (org.apache.uima.jcas.tcas.Annotation child : select( + ((Constituent) aNode).getChildren(), + org.apache.uima.jcas.tcas.Annotation.class)) { + convertConstituent(aView, child, aConstituents); + } + } + else if (aNode instanceof Token) { + aConstituents.add(id(TOKEN, aNode)); + } + else { + throw new IllegalStateException("Unexpected node type: " + aNode); + } + } + + private String id(String aPrefix, TOP aFS) + { + int id; + // if we already have an ID for the given FS return it + if (ids.containsKey(aFS.getAddress())) { + id = ids.get(aFS.getAddress()); + } + // otherwise generate a new ID + else { + id = counters.getInt(aPrefix); + ids.put(aFS.getAddress(), id); + counters.put(aPrefix, id + 1); + } + + return aPrefix + '-' + id; + } +} diff --git a/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/internal/Lif2DKPro.java b/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/internal/Lif2DKPro.java new file mode 100644 index 0000000000..ede2f7a1fc --- /dev/null +++ b/dkpro-core-io-lif-asl/src/main/java/org/dkpro/core/io/lif/internal/Lif2DKPro.java @@ -0,0 +1,334 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.lif.internal; + +import static org.apache.commons.lang3.StringUtils.isEmpty; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.uima.fit.util.FSCollectionFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.lappsgrid.discriminator.Discriminators; +import org.lappsgrid.serialization.lif.Annotation; +import org.lappsgrid.serialization.lif.Container; +import org.lappsgrid.serialization.lif.View; +import org.lappsgrid.vocabulary.Features; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; + +public class Lif2DKPro +{ + private Map tokenIdx; + private Container container; + + public void convert(Container aContainer, JCas aJCas) + { + tokenIdx = new HashMap<>(); + container = aContainer; + + aJCas.setDocumentLanguage(aContainer.getLanguage()); + aJCas.setDocumentText(aContainer.getText()); + + // Paragraph + getView(Discriminators.Uri.PARAGRAPH).getAnnotations().stream() + .filter(a -> Discriminators.Uri.PARAGRAPH.equals(a.getAtType())) + .forEach(para -> convertParagraph(aJCas, para)); + + // Sentence + getView(Discriminators.Uri.SENTENCE).getAnnotations().stream() + .filter(a -> Discriminators.Uri.SENTENCE.equals(a.getAtType())) + .forEach(sent -> convertSentence(aJCas, sent)); + + // Token, POS, Lemma (builds token index) + getView(Discriminators.Uri.TOKEN).getAnnotations().stream() + .filter(a -> Discriminators.Uri.TOKEN.equals(a.getAtType())) + .forEach(token -> convertToken(aJCas, token)); + + // NamedEntity + getView(Discriminators.Uri.NE).getAnnotations().stream() + .filter(a -> isNamedEntity(a.getAtType())) + .forEach(ne -> convertNamedEntity(aJCas, ne)); + + // Dependencies (requires token index) + getView(Discriminators.Uri.DEPENDENCY).getAnnotations().stream() + .filter(a -> Discriminators.Uri.DEPENDENCY.equals(a.getAtType())) + .forEach(dep -> convertDependency(aJCas, dep)); + + // Constituents (requires token index) + getView(Discriminators.Uri.PHRASE_STRUCTURE).getAnnotations().stream() + .filter(a -> Discriminators.Uri.PHRASE_STRUCTURE.equals(a.getAtType())) + .forEach(ps -> convertConstituents(aJCas, getView(Discriminators.Uri.CONSTITUENT), ps)); + } + + private View getView(String aType) + { + // Returns the last view which contains aType. If no view is found, + // then just return the first view. + List views = container.findViewsThatContain(aType); + if (!views.isEmpty()) { + return views.get(views.size() - 1); + } else { + return container.getView(0); + } + } + + private void convertConstituents(JCas aJCas, View view, Annotation ps) + { + String rootId = findRoot(view, ps); + // Get the constituent IDs + Set constituentIDs = new HashSet<>( + getSetFeature(ps, Features.PhraseStructure.CONSTITUENTS)); + + List constituents = new ArrayList<>(); + Map constituentIdx = new HashMap<>(); + + // Instantiate all the constituents + view.getAnnotations().stream().filter(a -> constituentIDs.contains(a.getId())) + .forEach(con -> { + if (Discriminators.Uri.CONSTITUENT.equals(con.getAtType())) { + Constituent conAnno; + if (rootId.equals(con.getId())) { + conAnno = new + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT( + aJCas); + } + else { + conAnno = new Constituent(aJCas); + } + if (con.getStart() != null) { + conAnno.setBegin(con.getStart().intValue()); + } + if (con.getEnd() != null) { + conAnno.setEnd(con.getEnd().intValue()); + } + conAnno.setConstituentType(con.getLabel()); + constituentIdx.put(con.getId(), conAnno); + constituents.add(con); + } + // If it is not a constituent, it must be a token ID - we already + // have created the tokens and recorded them in the tokenIdx + }); + + // Set parent and children features + constituents.forEach(con -> { + // Check if it is a constituent or token + Constituent conAnno = constituentIdx.get(con.getId()); + Set childIDs = getSetFeature(con, Features.Constituent.CHILDREN); + + List children = new ArrayList<>(); + childIDs.forEach(childID -> { + Constituent conChild = constituentIdx.get(childID); + Token tokenChild = tokenIdx.get(childID); + if (conChild != null && tokenChild == null) { + conChild.setParent(conAnno); + children.add(conChild); + } + else if (conChild == null && tokenChild != null) { + tokenChild.setParent(conAnno); + children.add(tokenChild); + } + else if (conChild == null && tokenChild == null) { + throw new IllegalStateException("ID [" + con.getId() + "] not found"); + } + else { + throw new IllegalStateException( + "ID [" + con.getId() + "] is constituent AND token? Impossible!"); + } + }); + + conAnno.setChildren(FSCollectionFactory.createFSArray(aJCas, children)); + }); + + // Percolate offsets - they might not have been set on the constituents! + Constituent root = constituentIdx.get(rootId); + percolateOffsets(root); + + // Add to indexes + constituentIdx.values().forEach(conAnno -> { + conAnno.addToIndexes(); + }); + } + + private Paragraph convertParagraph(JCas aTarget, Annotation aParagraph) + { + Paragraph paragraph = new Paragraph(aTarget, aParagraph.getStart().intValue(), + aParagraph.getEnd().intValue()); + paragraph.addToIndexes(); + return paragraph; + } + + private Sentence convertSentence(JCas aTarget, Annotation aSentence) + { + Sentence sentence = new Sentence(aTarget, aSentence.getStart().intValue(), + aSentence.getEnd().intValue()); + sentence.addToIndexes(); + return sentence; + } + + private Token convertToken(JCas aTarget, Annotation aToken) + { + Token token = new Token(aTarget, aToken.getStart().intValue(), aToken + .getEnd().intValue()); + String pos = aToken.getFeature(Features.Token.POS); + String lemma = aToken.getFeature(Features.Token.LEMMA); + + if (isNotEmpty(pos)) { + POS posAnno = new POS(aTarget, token.getBegin(), token.getEnd()); + posAnno.setPosValue(pos != null ? pos.intern() : null); + POSUtils.assignCoarseValue(posAnno); + posAnno.addToIndexes(); + token.setPos(posAnno); + } + + if (isNotEmpty(lemma)) { + Lemma lemmaAnno = new Lemma(aTarget, token.getBegin(), token.getEnd()); + lemmaAnno.setValue(lemma); + lemmaAnno.addToIndexes(); + token.setLemma(lemmaAnno); + } + + token.addToIndexes(); + + tokenIdx.put(aToken.getId(), token); + + return token; + } + + private NamedEntity convertNamedEntity(JCas aTarget, Annotation aNamedEntity) + { + NamedEntity neAnno = new NamedEntity(aTarget, aNamedEntity.getStart().intValue(), + aNamedEntity.getEnd().intValue()); + neAnno.setValue(aNamedEntity.getFeature("category")); + neAnno.addToIndexes(); + return neAnno; + } + + private Dependency convertDependency(JCas aTarget, Annotation aDependency) + { + String dependent = aDependency.getFeature(Features.Dependency.DEPENDENT); + String governor = aDependency.getFeature(Features.Dependency.GOVERNOR); + + Dependency depAnno; + if (isEmpty(governor) || governor.equals(dependent)) { + depAnno = new ROOT(aTarget); + depAnno.setDependencyType(aDependency.getLabel()); + depAnno.setDependent(tokenIdx.get(dependent)); + depAnno.setGovernor(tokenIdx.get(dependent)); + depAnno.setBegin(depAnno.getDependent().getBegin()); + depAnno.setEnd(depAnno.getDependent().getEnd()); + depAnno.addToIndexes(); + } + else { + depAnno = new Dependency(aTarget); + depAnno.setDependencyType(aDependency.getLabel()); + depAnno.setDependent(tokenIdx.get(dependent)); + depAnno.setGovernor(tokenIdx.get(governor)); + depAnno.setBegin(depAnno.getDependent().getBegin()); + depAnno.setEnd(depAnno.getDependent().getEnd()); + depAnno.addToIndexes(); + } + + return depAnno; + } + + @SuppressWarnings("unchecked") + private Set getSetFeature(Annotation aAnnotation, String aName) + { + return aAnnotation.getFeatureSet(aName); + } + + private void percolateOffsets(org.apache.uima.jcas.tcas.Annotation aNode) + { + if (aNode instanceof Constituent) { + Constituent conAnno = (Constituent) aNode; + int begin = Integer.MAX_VALUE; + int end = 0; + for (org.apache.uima.jcas.tcas.Annotation a : select(conAnno.getChildren(), + org.apache.uima.jcas.tcas.Annotation.class)) { + percolateOffsets(a); + + begin = Math.min(a.getBegin(), begin); + end = Math.max(a.getEnd(), end); + } + + if (aNode.getBegin() != 0) { + assert begin == aNode.getBegin(); + } + else { + aNode.setBegin(begin); + } + + if (aNode.getEnd() != 0) { + assert end == aNode.getEnd(); + } + else { + aNode.setEnd(end); + } + } + } + + private String findRoot(View aView, Annotation aPS) + { + // Get all the constituents int he phrase structure + Set constituents = new HashSet<>( + getSetFeature(aPS, Features.PhraseStructure.CONSTITUENTS)); + + List psConstituents = aView.getAnnotations().stream() + .filter(a -> Discriminators.Uri.CONSTITUENT.equals(a.getAtType())) + .filter(con -> constituents.contains(con.getId())) + .collect(Collectors.toList()); + + // Remove all constituents that are children of other constituents within the PS + psConstituents.forEach(con -> { + Set children = getSetFeature(con, Features.Constituent.CHILDREN); + children.forEach(child -> constituents.remove(child)); + }); + + // If all went well, only one constituent should be left and that is the root constituent + assert 1 == constituents.size(); + + // Return the ID of the root constituent + return constituents.iterator().next(); + } + + private boolean isNamedEntity(String aTypeName) + { + return Discriminators.Uri.NE.equals(aTypeName) + || Discriminators.Uri.DATE.equals(aTypeName) + || Discriminators.Uri.LOCATION.equals(aTypeName) + || Discriminators.Uri.ORGANIZATION.equals(aTypeName) + || Discriminators.Uri.PERSON.equals(aTypeName); + } +} diff --git a/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReaderWriterTest.java b/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReaderWriterTest.java deleted file mode 100644 index 7cb7e4372a..0000000000 --- a/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifReaderWriterTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.lif; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; - -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class LifReaderWriterTest -{ - @Test - public void roundTrip() - throws Exception - { - testRoundTrip( - LifReader.class, // the reader - LifWriter.class, // the writer - "conll/2006/fi-ref.json"); // the input also used as output reference - } - - @Test - public void oneDependencyStructure() - throws Exception - { - testOneWay( - LifReader.class, // the reader - LifWriter.class, // the writer - "lif/dependencystructure-ref.json", // the reference file for the output - "lif/dependencystructure.json"); // the input file for the test - } - - @Test - public void onePhraseStructure() - throws Exception - { - testOneWay( - LifReader.class, // the reader - LifWriter.class, // the writer - "lif/phrasestructure-ref.json", // the reference file for the output - "lif/phrasestructure.json"); // the input file for the test - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriterTest.java b/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriterTest.java deleted file mode 100644 index 186ab21a04..0000000000 --- a/dkpro-core-io-lif-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/lif/LifWriterTest.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.lif; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; - -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class LifWriterTest -{ - @Test - public void oneWay() - throws Exception - { - testOneWay( - Conll2006Reader.class, // the reader - LifWriter.class, // the writer - "conll/2006/fi-ref.json", // the reference file for the output - "conll/2006/fi-orig.conll"); // the input file for the test - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-lif-asl/src/test/java/org/dkpro/core/io/lif/LifReaderWriterTest.java b/dkpro-core-io-lif-asl/src/test/java/org/dkpro/core/io/lif/LifReaderWriterTest.java new file mode 100644 index 0000000000..af2d7393b0 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/java/org/dkpro/core/io/lif/LifReaderWriterTest.java @@ -0,0 +1,94 @@ +/* + * Copyright 2016 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.lif; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; + +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +public class LifReaderWriterTest +{ + @Test + public void roundTrip() + throws Exception + { + testRoundTrip( + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "conll/2006/fi-ref.lif"); // the input also used as output reference + } + + @Test + public void authenticPosLifFileWithWrapper() + throws Exception + { + testOneWay( + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false, + LifWriter.PARAM_ADD_ENVELOPE, true), + "lif/stanford-pos-ref.lif", // the reference file for the output + "lif/stanford-pos.lif"); // the input file for the test + } + + @Test + public void oneDependencyStructure() + throws Exception + { + testOneWay( + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "lif/dependencystructure-ref.lif", // the reference file for the output + "lif/dependencystructure.lif"); // the input file for the test + } + + @Test + public void onePhraseStructure() + throws Exception + { + testOneWay( + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "lif/phrasestructure-ref.lif", // the reference file for the output + "lif/phrasestructure.lif"); // the input file for the test + } + + + @Test + public void oneNamedEntity() + throws Exception + { + testOneWay( + createReaderDescription(LifReader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "lif/specification-ner-ref.lif", // the reference file for the output + "lif/specification-ner.lif"); // the input file for the test + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-lif-asl/src/test/java/org/dkpro/core/io/lif/LifWriterTest.java b/dkpro-core-io-lif-asl/src/test/java/org/dkpro/core/io/lif/LifWriterTest.java new file mode 100644 index 0000000000..4550afc368 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/java/org/dkpro/core/io/lif/LifWriterTest.java @@ -0,0 +1,46 @@ +/* + * Copyright 2016 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.lif; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; + +import org.dkpro.core.io.conll.Conll2006Reader; +import org.dkpro.core.io.lif.LifWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +public class LifWriterTest +{ + @Test + public void oneWay() + throws Exception + { + testOneWay( + createReaderDescription(Conll2006Reader.class), // the reader + createEngineDescription(LifWriter.class, // the writer + LifWriter.PARAM_WRITE_TIMESTAMP, false), + "conll/2006/fi-ref.lif", // the reference file for the output + "conll/2006/fi-orig.conll"); // the input file for the test + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-lif-asl/src/test/resources/README.txt b/dkpro-core-io-lif-asl/src/test/resources/README.txt index 0044e01920..44144dd22c 100644 --- a/dkpro-core-io-lif-asl/src/test/resources/README.txt +++ b/dkpro-core-io-lif-asl/src/test/resources/README.txt @@ -9,4 +9,9 @@ src/test/resources/conll/2006/fi-orig.conll http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/sources/ftb3.1.conllx.gz Creative Commons Attribution 3.0 License + +src/test/resources/lif/stanford-pos.lif + + Obtained from LAPPSGrid using Stanford Parser component + Text is title and abstract from: https://www.ncbi.nlm.nih.gov/pubmed/10025748 \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.json b/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.json deleted file mode 100644 index c71fc08d60..0000000000 --- a/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.json +++ /dev/null @@ -1,600 +0,0 @@ -{ - "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", - "metadata" : { }, - "text" : { - "@value" : "NEUVOSTO EURATOMIN HANKINTAKESKUKSEN PERUSSÄÄNTÖ EUROOPAN ATOMIENERGIAYHTEISÖN NEUVOSTO , joka ottaa huomioon perustamissopimuksen 54 artiklan , ja ottaa huomioon komission ehdotuksen , ON PÄÄTTÄNYT antaa Euratomin hankintakeskuksen perussäännön seuraavasti :\n1 artikla Nimi ja tarkoitus\n", - "@language" : "x-unspecified" - }, - "views" : [ { - "metadata" : { }, - "annotations" : [ { - "id" : "sent-0", - "start" : 0, - "end" : 259, - "@type" : "http://vocab.lappsgrid.org/Sentence" - }, { - "id" : "sent-1", - "start" : 260, - "end" : 287, - "@type" : "http://vocab.lappsgrid.org/Sentence" - }, { - "id" : "tok-0", - "start" : 0, - "end" : 8, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "neuvosto" - } - }, { - "id" : "tok-1", - "start" : 9, - "end" : 18, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "Euratom" - } - }, { - "id" : "tok-2", - "start" : 19, - "end" : 36, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "hankinta#keskus" - } - }, { - "id" : "tok-3", - "start" : 37, - "end" : 48, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "perus#sääntö" - } - }, { - "id" : "tok-4", - "start" : 49, - "end" : 57, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "Eurooppa" - } - }, { - "id" : "tok-5", - "start" : 58, - "end" : 78, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "atomi#energia#yhteisö" - } - }, { - "id" : "tok-6", - "start" : 79, - "end" : 87, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "neuvosto" - } - }, { - "id" : "tok-7", - "start" : 88, - "end" : 89, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "Punct", - "lemma" : "," - } - }, { - "id" : "tok-8", - "start" : 90, - "end" : 94, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "Pron", - "lemma" : "joka" - } - }, { - "id" : "tok-9", - "start" : 95, - "end" : 100, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "V", - "lemma" : "ottaa" - } - }, { - "id" : "tok-10", - "start" : 101, - "end" : 109, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "huomio" - } - }, { - "id" : "tok-11", - "start" : 110, - "end" : 130, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "perustamis#sopimus" - } - }, { - "id" : "tok-12", - "start" : 131, - "end" : 133, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "Num", - "lemma" : "54" - } - }, { - "id" : "tok-13", - "start" : 134, - "end" : 142, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "artikla" - } - }, { - "id" : "tok-14", - "start" : 143, - "end" : 144, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "Punct", - "lemma" : "," - } - }, { - "id" : "tok-15", - "start" : 145, - "end" : 147, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "CC", - "lemma" : "ja" - } - }, { - "id" : "tok-16", - "start" : 148, - "end" : 153, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "V", - "lemma" : "ottaa" - } - }, { - "id" : "tok-17", - "start" : 154, - "end" : 162, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "huomio" - } - }, { - "id" : "tok-18", - "start" : 163, - "end" : 172, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "komissio" - } - }, { - "id" : "tok-19", - "start" : 173, - "end" : 183, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "ehdotus" - } - }, { - "id" : "tok-20", - "start" : 184, - "end" : 185, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "Punct", - "lemma" : "," - } - }, { - "id" : "tok-21", - "start" : 186, - "end" : 188, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "V", - "lemma" : "olla" - } - }, { - "id" : "tok-22", - "start" : 189, - "end" : 198, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "PrfPrc", - "lemma" : "päättää" - } - }, { - "id" : "tok-23", - "start" : 199, - "end" : 204, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "V", - "lemma" : "antaa" - } - }, { - "id" : "tok-24", - "start" : 205, - "end" : 214, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "Euratom" - } - }, { - "id" : "tok-25", - "start" : 215, - "end" : 232, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "hankinta#keskus" - } - }, { - "id" : "tok-26", - "start" : 233, - "end" : 245, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "perus#sääntö" - } - }, { - "id" : "tok-27", - "start" : 246, - "end" : 257, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "Adv", - "lemma" : "seuraava" - } - }, { - "id" : "tok-28", - "start" : 258, - "end" : 259, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "Punct", - "lemma" : ":" - } - }, { - "id" : "tok-29", - "start" : 260, - "end" : 261, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "Num", - "lemma" : "1" - } - }, { - "id" : "tok-30", - "start" : 262, - "end" : 269, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "artikla" - } - }, { - "id" : "tok-31", - "start" : 270, - "end" : 274, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "nimi" - } - }, { - "id" : "tok-32", - "start" : 275, - "end" : 277, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "CC", - "lemma" : "ja" - } - }, { - "id" : "tok-33", - "start" : 278, - "end" : 287, - "@type" : "http://vocab.lappsgrid.org/Token", - "features" : { - "pos" : "N", - "lemma" : "tarkoitus" - } - }, { - "id" : "dep-0", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-1", - "dependent" : "tok-0" - } - }, { - "id" : "dep-1", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-2", - "dependent" : "tok-1" - } - }, { - "id" : "dep-2", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-3", - "dependent" : "tok-2" - } - }, { - "id" : "dep-3", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-6", - "dependent" : "tok-3" - } - }, { - "id" : "dep-4", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-5", - "dependent" : "tok-4" - } - }, { - "id" : "dep-5", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-6", - "dependent" : "tok-5" - } - }, { - "id" : "dep-6", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "main", - "features" : { - "governor" : "tok-6", - "dependent" : "tok-6" - } - }, { - "id" : "dep-7", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "subj", - "features" : { - "governor" : "tok-9", - "dependent" : "tok-8" - } - }, { - "id" : "dep-8", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "mod", - "features" : { - "governor" : "tok-6", - "dependent" : "tok-9" - } - }, { - "id" : "dep-9", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "phrv", - "features" : { - "governor" : "tok-9", - "dependent" : "tok-10" - } - }, { - "id" : "dep-10", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-13", - "dependent" : "tok-11" - } - }, { - "id" : "dep-11", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-13", - "dependent" : "tok-12" - } - }, { - "id" : "dep-12", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "obj", - "features" : { - "governor" : "tok-9", - "dependent" : "tok-13" - } - }, { - "id" : "dep-13", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "phrm", - "features" : { - "governor" : "tok-16", - "dependent" : "tok-14" - } - }, { - "id" : "dep-14", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "phrm", - "features" : { - "governor" : "tok-16", - "dependent" : "tok-15" - } - }, { - "id" : "dep-15", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "conjunct", - "features" : { - "governor" : "tok-9", - "dependent" : "tok-16" - } - }, { - "id" : "dep-16", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "phrv", - "features" : { - "governor" : "tok-16", - "dependent" : "tok-17" - } - }, { - "id" : "dep-17", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "subj", - "features" : { - "governor" : "tok-19", - "dependent" : "tok-18" - } - }, { - "id" : "dep-18", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "obj", - "features" : { - "governor" : "tok-16", - "dependent" : "tok-19" - } - }, { - "id" : "dep-19", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "phrm", - "features" : { - "governor" : "tok-22", - "dependent" : "tok-20" - } - }, { - "id" : "dep-20", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "aux", - "features" : { - "governor" : "tok-22", - "dependent" : "tok-21" - } - }, { - "id" : "dep-21", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "conjunct", - "features" : { - "governor" : "tok-16", - "dependent" : "tok-22" - } - }, { - "id" : "dep-22", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "obj", - "features" : { - "governor" : "tok-22", - "dependent" : "tok-23" - } - }, { - "id" : "dep-23", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-25", - "dependent" : "tok-24" - } - }, { - "id" : "dep-24", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-26", - "dependent" : "tok-25" - } - }, { - "id" : "dep-25", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "obj", - "features" : { - "governor" : "tok-23", - "dependent" : "tok-26" - } - }, { - "id" : "dep-26", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "advl", - "features" : { - "governor" : "tok-23", - "dependent" : "tok-27" - } - }, { - "id" : "depstruct-0", - "start" : 0, - "end" : 259, - "@type" : "http://vocab.lappsgrid.org/DependencyStructure", - "features" : { - "dependencies" : [ "dep-0", "dep-1", "dep-10", "dep-11", "dep-12", "dep-13", "dep-14", "dep-15", "dep-16", "dep-17", "dep-18", "dep-19", "dep-2", "dep-20", "dep-21", "dep-22", "dep-23", "dep-24", "dep-25", "dep-26", "dep-3", "dep-4", "dep-5", "dep-6", "dep-7", "dep-8", "dep-9" ] - } - }, { - "id" : "dep-27", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-30", - "dependent" : "tok-29" - } - }, { - "id" : "dep-28", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "attr", - "features" : { - "governor" : "tok-31", - "dependent" : "tok-30" - } - }, { - "id" : "dep-29", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "main", - "features" : { - "governor" : "tok-31", - "dependent" : "tok-31" - } - }, { - "id" : "dep-30", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "phrm", - "features" : { - "governor" : "tok-33", - "dependent" : "tok-32" - } - }, { - "id" : "dep-31", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "conjunct", - "features" : { - "governor" : "tok-31", - "dependent" : "tok-33" - } - }, { - "id" : "depstruct-1", - "start" : 260, - "end" : 287, - "@type" : "http://vocab.lappsgrid.org/DependencyStructure", - "features" : { - "dependencies" : [ "dep-27", "dep-28", "dep-29", "dep-30", "dep-31" ] - } - } ] - } ] -} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif new file mode 100644 index 0000000000..422704e70c --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/conll/2006/fi-ref.lif @@ -0,0 +1,636 @@ +{ + "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "metadata" : { }, + "text" : { + "@value" : "NEUVOSTO EURATOMIN HANKINTAKESKUKSEN PERUSSÄÄNTÖ EUROOPAN ATOMIENERGIAYHTEISÖN NEUVOSTO , joka ottaa huomioon perustamissopimuksen 54 artiklan , ja ottaa huomioon komission ehdotuksen , ON PÄÄTTÄNYT antaa Euratomin hankintakeskuksen perussäännön seuraavasti :\n1 artikla Nimi ja tarkoitus\n", + "@language" : "x-unspecified" + }, + "views" : [ { + "id" : "v1", + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, + "annotations" : [ { + "id" : "sent-0", + "start" : 0, + "end" : 259, + "@type" : "http://vocab.lappsgrid.org/Sentence" + }, { + "id" : "sent-1", + "start" : 260, + "end" : 287, + "@type" : "http://vocab.lappsgrid.org/Sentence" + }, { + "id" : "tok-0", + "start" : 0, + "end" : 8, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "neuvosto" + } + }, { + "id" : "tok-1", + "start" : 9, + "end" : 18, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "Euratom" + } + }, { + "id" : "tok-2", + "start" : 19, + "end" : 36, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "hankinta#keskus" + } + }, { + "id" : "tok-3", + "start" : 37, + "end" : 48, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "perus#sääntö" + } + }, { + "id" : "tok-4", + "start" : 49, + "end" : 57, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "Eurooppa" + } + }, { + "id" : "tok-5", + "start" : 58, + "end" : 78, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "atomi#energia#yhteisö" + } + }, { + "id" : "tok-6", + "start" : 79, + "end" : 87, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "neuvosto" + } + }, { + "id" : "tok-7", + "start" : 88, + "end" : 89, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "Punct", + "lemma" : "," + } + }, { + "id" : "tok-8", + "start" : 90, + "end" : 94, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "Pron", + "lemma" : "joka" + } + }, { + "id" : "tok-9", + "start" : 95, + "end" : 100, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "V", + "lemma" : "ottaa" + } + }, { + "id" : "tok-10", + "start" : 101, + "end" : 109, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "huomio" + } + }, { + "id" : "tok-11", + "start" : 110, + "end" : 130, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "perustamis#sopimus" + } + }, { + "id" : "tok-12", + "start" : 131, + "end" : 133, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "Num", + "lemma" : "54" + } + }, { + "id" : "tok-13", + "start" : 134, + "end" : 142, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "artikla" + } + }, { + "id" : "tok-14", + "start" : 143, + "end" : 144, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "Punct", + "lemma" : "," + } + }, { + "id" : "tok-15", + "start" : 145, + "end" : 147, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC", + "lemma" : "ja" + } + }, { + "id" : "tok-16", + "start" : 148, + "end" : 153, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "V", + "lemma" : "ottaa" + } + }, { + "id" : "tok-17", + "start" : 154, + "end" : 162, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "huomio" + } + }, { + "id" : "tok-18", + "start" : 163, + "end" : 172, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "komissio" + } + }, { + "id" : "tok-19", + "start" : 173, + "end" : 183, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "ehdotus" + } + }, { + "id" : "tok-20", + "start" : 184, + "end" : 185, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "Punct", + "lemma" : "," + } + }, { + "id" : "tok-21", + "start" : 186, + "end" : 188, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "V", + "lemma" : "olla" + } + }, { + "id" : "tok-22", + "start" : 189, + "end" : 198, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "PrfPrc", + "lemma" : "päättää" + } + }, { + "id" : "tok-23", + "start" : 199, + "end" : 204, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "V", + "lemma" : "antaa" + } + }, { + "id" : "tok-24", + "start" : 205, + "end" : 214, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "Euratom" + } + }, { + "id" : "tok-25", + "start" : 215, + "end" : 232, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "hankinta#keskus" + } + }, { + "id" : "tok-26", + "start" : 233, + "end" : 245, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "perus#sääntö" + } + }, { + "id" : "tok-27", + "start" : 246, + "end" : 257, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "Adv", + "lemma" : "seuraava" + } + }, { + "id" : "tok-28", + "start" : 258, + "end" : 259, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "Punct", + "lemma" : ":" + } + }, { + "id" : "tok-29", + "start" : 260, + "end" : 261, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "Num", + "lemma" : "1" + } + }, { + "id" : "tok-30", + "start" : 262, + "end" : 269, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "artikla" + } + }, { + "id" : "tok-31", + "start" : 270, + "end" : 274, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "nimi" + } + }, { + "id" : "tok-32", + "start" : 275, + "end" : 277, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC", + "lemma" : "ja" + } + }, { + "id" : "tok-33", + "start" : 278, + "end" : 287, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "N", + "lemma" : "tarkoitus" + } + }, { + "id" : "dep-0", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-1", + "dependent" : "tok-0" + } + }, { + "id" : "dep-1", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-2", + "dependent" : "tok-1" + } + }, { + "id" : "dep-2", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-3", + "dependent" : "tok-2" + } + }, { + "id" : "dep-3", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-6", + "dependent" : "tok-3" + } + }, { + "id" : "dep-4", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-5", + "dependent" : "tok-4" + } + }, { + "id" : "dep-5", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-6", + "dependent" : "tok-5" + } + }, { + "id" : "dep-6", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "main", + "features" : { + "governor" : "tok-6", + "dependent" : "tok-6" + } + }, { + "id" : "dep-7", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "subj", + "features" : { + "governor" : "tok-9", + "dependent" : "tok-8" + } + }, { + "id" : "dep-8", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "mod", + "features" : { + "governor" : "tok-6", + "dependent" : "tok-9" + } + }, { + "id" : "dep-9", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "phrv", + "features" : { + "governor" : "tok-9", + "dependent" : "tok-10" + } + }, { + "id" : "dep-10", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-13", + "dependent" : "tok-11" + } + }, { + "id" : "dep-11", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-13", + "dependent" : "tok-12" + } + }, { + "id" : "dep-12", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "obj", + "features" : { + "governor" : "tok-9", + "dependent" : "tok-13" + } + }, { + "id" : "dep-13", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "phrm", + "features" : { + "governor" : "tok-16", + "dependent" : "tok-14" + } + }, { + "id" : "dep-14", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "phrm", + "features" : { + "governor" : "tok-16", + "dependent" : "tok-15" + } + }, { + "id" : "dep-15", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "conjunct", + "features" : { + "governor" : "tok-9", + "dependent" : "tok-16" + } + }, { + "id" : "dep-16", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "phrv", + "features" : { + "governor" : "tok-16", + "dependent" : "tok-17" + } + }, { + "id" : "dep-17", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "subj", + "features" : { + "governor" : "tok-19", + "dependent" : "tok-18" + } + }, { + "id" : "dep-18", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "obj", + "features" : { + "governor" : "tok-16", + "dependent" : "tok-19" + } + }, { + "id" : "dep-19", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "phrm", + "features" : { + "governor" : "tok-22", + "dependent" : "tok-20" + } + }, { + "id" : "dep-20", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "aux", + "features" : { + "governor" : "tok-22", + "dependent" : "tok-21" + } + }, { + "id" : "dep-21", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "conjunct", + "features" : { + "governor" : "tok-16", + "dependent" : "tok-22" + } + }, { + "id" : "dep-22", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "obj", + "features" : { + "governor" : "tok-22", + "dependent" : "tok-23" + } + }, { + "id" : "dep-23", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-25", + "dependent" : "tok-24" + } + }, { + "id" : "dep-24", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-26", + "dependent" : "tok-25" + } + }, { + "id" : "dep-25", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "obj", + "features" : { + "governor" : "tok-23", + "dependent" : "tok-26" + } + }, { + "id" : "dep-26", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "advl", + "features" : { + "governor" : "tok-23", + "dependent" : "tok-27" + } + }, { + "id" : "depstruct-0", + "start" : 0, + "end" : 259, + "@type" : "http://vocab.lappsgrid.org/DependencyStructure", + "features" : { + "dependencies" : [ "dep-0", "dep-1", "dep-10", "dep-11", "dep-12", "dep-13", "dep-14", "dep-15", "dep-16", "dep-17", "dep-18", "dep-19", "dep-2", "dep-20", "dep-21", "dep-22", "dep-23", "dep-24", "dep-25", "dep-26", "dep-3", "dep-4", "dep-5", "dep-6", "dep-7", "dep-8", "dep-9" ] + } + }, { + "id" : "dep-27", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-30", + "dependent" : "tok-29" + } + }, { + "id" : "dep-28", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "attr", + "features" : { + "governor" : "tok-31", + "dependent" : "tok-30" + } + }, { + "id" : "dep-29", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "main", + "features" : { + "governor" : "tok-31", + "dependent" : "tok-31" + } + }, { + "id" : "dep-30", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "phrm", + "features" : { + "governor" : "tok-33", + "dependent" : "tok-32" + } + }, { + "id" : "dep-31", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "conjunct", + "features" : { + "governor" : "tok-31", + "dependent" : "tok-33" + } + }, { + "id" : "depstruct-1", + "start" : 260, + "end" : 287, + "@type" : "http://vocab.lappsgrid.org/DependencyStructure", + "features" : { + "dependencies" : [ "dep-27", "dep-28", "dep-29", "dep-30", "dep-31" ] + } + } ] + } ] +} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.json b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.json deleted file mode 100644 index bfaa9d31f8..0000000000 --- a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", - "metadata" : { }, - "text" : { - "@value" : "Sue sees herself", - "@language" : "en" - }, - "views" : [ { - "metadata" : { }, - "annotations" : [ { - "id" : "sent-0", - "start" : 0, - "end" : 16, - "@type" : "http://vocab.lappsgrid.org/Sentence" - }, { - "id" : "tok-0", - "start" : 0, - "end" : 3, - "@type" : "http://vocab.lappsgrid.org/Token" - }, { - "id" : "tok-1", - "start" : 4, - "end" : 8, - "@type" : "http://vocab.lappsgrid.org/Token" - }, { - "id" : "tok-2", - "start" : 9, - "end" : 16, - "@type" : "http://vocab.lappsgrid.org/Token" - }, { - "id" : "dep-0", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "nsubj", - "features" : { - "governor" : "tok-1", - "dependent" : "tok-0" - } - }, { - "id" : "dep-1", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "ROOT", - "features" : { - "governor" : "tok-1", - "dependent" : "tok-1" - } - }, { - "id" : "dep-2", - "@type" : "http://vocab.lappsgrid.org/Dependency", - "label" : "nobj", - "features" : { - "governor" : "tok-1", - "dependent" : "tok-2" - } - }, { - "id" : "depstruct-0", - "start" : 0, - "end" : 16, - "@type" : "http://vocab.lappsgrid.org/DependencyStructure", - "features" : { - "dependencies" : [ "dep-0", "dep-1", "dep-2" ] - } - } ] - } ] -} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif new file mode 100644 index 0000000000..9afd0d2302 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure-ref.lif @@ -0,0 +1,100 @@ +{ + "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "metadata" : { }, + "text" : { + "@value" : "Sue sees herself", + "@language" : "en" + }, + "views" : [ { + "id" : "v1", + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, + "annotations" : [ { + "id" : "sent-0", + "start" : 0, + "end" : 16, + "@type" : "http://vocab.lappsgrid.org/Sentence" + }, { + "id" : "tok-0", + "start" : 0, + "end" : 3, + "@type" : "http://vocab.lappsgrid.org/Token" + }, { + "id" : "tok-1", + "start" : 4, + "end" : 8, + "@type" : "http://vocab.lappsgrid.org/Token" + }, { + "id" : "tok-2", + "start" : 9, + "end" : 16, + "@type" : "http://vocab.lappsgrid.org/Token" + }, { + "id" : "dep-0", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "nsubj", + "features" : { + "governor" : "tok-1", + "dependent" : "tok-0" + } + }, { + "id" : "dep-1", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "ROOT", + "features" : { + "governor" : "tok-1", + "dependent" : "tok-1" + } + }, { + "id" : "dep-2", + "@type" : "http://vocab.lappsgrid.org/Dependency", + "label" : "nobj", + "features" : { + "governor" : "tok-1", + "dependent" : "tok-2" + } + }, { + "id" : "depstruct-0", + "start" : 0, + "end" : 16, + "@type" : "http://vocab.lappsgrid.org/DependencyStructure", + "features" : { + "dependencies" : [ "dep-0", "dep-1", "dep-2" ] + } + } ] + } ] +} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure.json b/dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure.lif similarity index 100% rename from dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure.json rename to dkpro-core-io-lif-asl/src/test/resources/lif/dependencystructure.lif diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.json b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.json deleted file mode 100644 index 72f83c2067..0000000000 --- a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", - "metadata" : { }, - "text" : { - "@value" : "Sue sees herself", - "@language" : "en" - }, - "views" : [ { - "metadata" : { }, - "annotations" : [ { - "id" : "sent-0", - "start" : 0, - "end" : 16, - "@type" : "http://vocab.lappsgrid.org/Sentence" - }, { - "id" : "tok-0", - "start" : 0, - "end" : 3, - "@type" : "http://vocab.lappsgrid.org/Token" - }, { - "id" : "tok-1", - "start" : 4, - "end" : 8, - "@type" : "http://vocab.lappsgrid.org/Token" - }, { - "id" : "tok-2", - "start" : 9, - "end" : 16, - "@type" : "http://vocab.lappsgrid.org/Token" - }, { - "id" : "const-0", - "@type" : "http://vocab.lappsgrid.org/Constituent" - }, { - "id" : "const-1", - "@type" : "http://vocab.lappsgrid.org/Constituent" - }, { - "id" : "const-2", - "@type" : "http://vocab.lappsgrid.org/Constituent" - }, { - "id" : "phrasestruct-0", - "start" : 0, - "end" : 16, - "@type" : "http://vocab.lappsgrid.org/PhraseStructure", - "features" : { - "constituents" : [ "const-0", "const-1", "tok-0", "const-2", "tok-2", "tok-1" ] - } - } ] - } ] -} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif new file mode 100644 index 0000000000..fcf6b83f43 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure-ref.lif @@ -0,0 +1,85 @@ +{ + "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "metadata" : { }, + "text" : { + "@value" : "Sue sees herself", + "@language" : "en" + }, + "views" : [ { + "id" : "v1", + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, + "annotations" : [ { + "id" : "sent-0", + "start" : 0, + "end" : 16, + "@type" : "http://vocab.lappsgrid.org/Sentence" + }, { + "id" : "tok-0", + "start" : 0, + "end" : 3, + "@type" : "http://vocab.lappsgrid.org/Token" + }, { + "id" : "tok-1", + "start" : 4, + "end" : 8, + "@type" : "http://vocab.lappsgrid.org/Token" + }, { + "id" : "tok-2", + "start" : 9, + "end" : 16, + "@type" : "http://vocab.lappsgrid.org/Token" + }, { + "id" : "const-0", + "@type" : "http://vocab.lappsgrid.org/Constituent" + }, { + "id" : "const-1", + "@type" : "http://vocab.lappsgrid.org/Constituent" + }, { + "id" : "const-2", + "@type" : "http://vocab.lappsgrid.org/Constituent" + }, { + "id" : "phrasestruct-0", + "start" : 0, + "end" : 16, + "@type" : "http://vocab.lappsgrid.org/PhraseStructure", + "features" : { + "constituents" : [ "const-0", "const-1", "tok-0", "const-2", "tok-2", "tok-1" ] + } + } ] + } ] +} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure.json b/dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure.lif similarity index 100% rename from dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure.json rename to dkpro-core-io-lif-asl/src/test/resources/lif/phrasestructure.lif diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/specification-ner-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/specification-ner-ref.lif new file mode 100644 index 0000000000..57bb358c74 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/specification-ner-ref.lif @@ -0,0 +1,56 @@ +{ + "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "metadata" : { }, + "text" : { + "@value" : "Jill sleeps.", + "@language" : "x-unspecified" + }, + "views" : [ { + "id" : "v1", + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, + "annotations" : [ { + "id" : "ne-0", + "start" : 0, + "end" : 4, + "@type" : "http://vocab.lappsgrid.org/NamedEntity", + "features" : { + "category" : "person" + } + } ] + } ] +} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/specification-ner.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/specification-ner.lif new file mode 100644 index 0000000000..d61b9cf330 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/specification-ner.lif @@ -0,0 +1,30 @@ +{ + "@context": "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "text": { + "@value": "Jill sleeps." + }, + "views": [ + { + "id": "v1", + "metadata": { + "contains": { + "http://vocab.lappsgrid.org/NamedEntity": { + "producer": "edu.brandeis.cs.lappsgrid.stanford.corenlp.NamedEntityRecognizer:2.0.3", + "namedEntityCategorySet": "ner:stanford" + } + } + }, + "annotations": [ + { + "@type": "http://vocab.lappsgrid.org/NamedEntity", + "id": "c0", + "start": 0, + "end": 4, + "features": { + "category": "person" + } + } + ] + } + ] +} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-massaged.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-massaged.lif new file mode 100644 index 0000000000..68a37fb082 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-massaged.lif @@ -0,0 +1 @@ +{"@context":"http://vocab.lappsgrid.org/context-1.0.0.jsonld","metadata":{"sourceid":"10025748","sourcedb":"PubMed"},"text":{"@value":"Actinically degenerate elastic tissue is the likely antigenic basis of actinic granuloma of the skin and of temporal arteritis.\nStaining technique is paramount for detecting and assessing the severe degeneration that occurs in the elastic tissues of the skin and its arteries in response to prolonged exposure to actinic radiation. With a selective \"controlled\" hematoxylin-and-eosin stain, actinically damaged (\"elastotic\") elastic tissue stains blue, as Unna described, and contrasts with normal and simply hyperplastic elastic tissue, which stains red. \"Special\" elastic stains such as Orcein and Verhoeff do not demonstrate this difference. When resorptive (elastolytic) giant cell reactions develop in relation to actinically degenerate elastic tissue of the skin, the papules that arise tend to form expanding, annular rings. A previously used and appropriate name for these autoimmune lesions in the skin is actinic granuloma because this name highlights the likely actinic origin and pathogenesis of many such lesions. Granulomatous inflammation in connection with actinically degenerate internal elastic lamina appears to be the basis of temporal arteritis. Actinic granulomas may occur in the skin concurrently with temporal arteritis. A recent study of temporal arteritis strongly relates its elastic tissue changes to those of \"accelerated\" atherosclerosis."},"views":[{"metadata":{"contains":{"http://vocab.lappsgrid.org/Token#pos":{"producer":"edu.brandeis.cs.lappsgrid.stanford.corenlp.POSTagger:2.0.4","type":"tagger:stanford"}}},"annotations":[{"id":"tk_0_0","start":0,"end":11,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNP","word":"Actinically"}},{"id":"tk_1_1","start":12,"end":22,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"degenerate"}},{"id":"tk_2_2","start":23,"end":30,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_3_3","start":31,"end":37,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_4_4","start":38,"end":40,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"is"}},{"id":"tk_5_5","start":41,"end":44,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_6_6","start":45,"end":51,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"likely"}},{"id":"tk_7_7","start":52,"end":61,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"antigenic"}},{"id":"tk_8_8","start":62,"end":67,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"basis"}},{"id":"tk_9_9","start":68,"end":70,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_10_10","start":71,"end":78,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"actinic"}},{"id":"tk_11_11","start":79,"end":88,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"granuloma"}},{"id":"tk_12_12","start":89,"end":91,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_13_13","start":92,"end":95,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_14_14","start":96,"end":100,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_15_15","start":101,"end":104,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_16_16","start":105,"end":107,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_17_17","start":108,"end":116,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"temporal"}},{"id":"tk_18_18","start":117,"end":126,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"arteritis"}},{"id":"tk_19_19","start":126,"end":127,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_20_0","start":128,"end":136,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBG","word":"Staining"}},{"id":"tk_21_1","start":137,"end":146,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"technique"}},{"id":"tk_22_2","start":147,"end":149,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"is"}},{"id":"tk_23_3","start":150,"end":159,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"paramount"}},{"id":"tk_24_4","start":160,"end":163,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"for"}},{"id":"tk_25_5","start":164,"end":173,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBG","word":"detecting"}},{"id":"tk_26_6","start":174,"end":177,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_27_7","start":178,"end":187,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBG","word":"assessing"}},{"id":"tk_28_8","start":188,"end":191,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_29_9","start":192,"end":198,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"severe"}},{"id":"tk_30_10","start":199,"end":211,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"degeneration"}},{"id":"tk_31_11","start":212,"end":216,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"WDT","word":"that"}},{"id":"tk_32_12","start":217,"end":223,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"occurs"}},{"id":"tk_33_13","start":224,"end":226,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_34_14","start":227,"end":230,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_35_15","start":231,"end":238,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_36_16","start":239,"end":246,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"tissues"}},{"id":"tk_37_17","start":247,"end":249,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_38_18","start":250,"end":253,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_39_19","start":254,"end":258,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_40_20","start":259,"end":262,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_41_21","start":263,"end":266,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"PRP$","word":"its"}},{"id":"tk_42_22","start":267,"end":275,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"arteries"}},{"id":"tk_43_23","start":276,"end":278,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_44_24","start":279,"end":287,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"response"}},{"id":"tk_45_25","start":288,"end":290,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_46_26","start":291,"end":300,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"prolonged"}},{"id":"tk_47_27","start":301,"end":309,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"exposure"}},{"id":"tk_48_28","start":310,"end":312,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_49_29","start":313,"end":320,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"actinic"}},{"id":"tk_50_30","start":321,"end":330,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"radiation"}},{"id":"tk_51_31","start":330,"end":331,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_52_0","start":332,"end":336,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"With"}},{"id":"tk_53_1","start":337,"end":338,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"a"}},{"id":"tk_54_2","start":339,"end":348,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"selective"}},{"id":"tk_55_3","start":349,"end":350,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"``","word":"``"}},{"id":"tk_56_4","start":350,"end":360,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBN","word":"controlled"}},{"id":"tk_57_5","start":360,"end":361,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"''","word":"''"}},{"id":"tk_58_6","start":362,"end":383,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"hematoxylin-and-eosin"}},{"id":"tk_59_7","start":384,"end":389,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBP","word":"stain"}},{"id":"tk_60_8","start":389,"end":390,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_61_9","start":391,"end":402,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"actinically"}},{"id":"tk_62_10","start":403,"end":410,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBN","word":"damaged"}},{"id":"tk_63_11","start":411,"end":412,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"-LRB-","word":"-LRB-"}},{"id":"tk_64_12","start":412,"end":413,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"``","word":"``"}},{"id":"tk_65_13","start":413,"end":422,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastotic"}},{"id":"tk_66_14","start":422,"end":423,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"''","word":"''"}},{"id":"tk_67_15","start":423,"end":424,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"-RRB-","word":"-RRB-"}},{"id":"tk_68_16","start":425,"end":432,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_69_17","start":433,"end":439,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_70_18","start":440,"end":446,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"stains"}},{"id":"tk_71_19","start":447,"end":451,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"blue"}},{"id":"tk_72_20","start":451,"end":452,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_73_21","start":453,"end":455,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"as"}},{"id":"tk_74_22","start":456,"end":460,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNP","word":"Unna"}},{"id":"tk_75_23","start":461,"end":470,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBD","word":"described"}},{"id":"tk_76_24","start":470,"end":471,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_77_25","start":472,"end":475,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_78_26","start":476,"end":485,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"contrasts"}},{"id":"tk_79_27","start":486,"end":490,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"with"}},{"id":"tk_80_28","start":491,"end":497,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"normal"}},{"id":"tk_81_29","start":498,"end":501,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_82_30","start":502,"end":508,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"simply"}},{"id":"tk_83_31","start":509,"end":521,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"hyperplastic"}},{"id":"tk_84_32","start":522,"end":529,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_85_33","start":530,"end":536,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_86_34","start":536,"end":537,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_87_35","start":538,"end":543,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"WDT","word":"which"}},{"id":"tk_88_36","start":544,"end":550,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"stains"}},{"id":"tk_89_37","start":551,"end":554,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"red"}},{"id":"tk_90_38","start":554,"end":555,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_91_0","start":556,"end":557,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"``","word":"``"}},{"id":"tk_92_1","start":557,"end":564,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"Special"}},{"id":"tk_93_2","start":564,"end":565,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"''","word":"''"}},{"id":"tk_94_3","start":566,"end":573,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_95_4","start":574,"end":580,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"stains"}},{"id":"tk_96_5","start":581,"end":585,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"such"}},{"id":"tk_97_6","start":586,"end":588,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"as"}},{"id":"tk_98_7","start":589,"end":595,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNP","word":"Orcein"}},{"id":"tk_99_8","start":596,"end":599,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_100_9","start":600,"end":608,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNP","word":"Verhoeff"}},{"id":"tk_101_10","start":609,"end":611,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBP","word":"do"}},{"id":"tk_102_11","start":612,"end":615,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"not"}},{"id":"tk_103_12","start":616,"end":627,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"demonstrate"}},{"id":"tk_104_13","start":628,"end":632,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"this"}},{"id":"tk_105_14","start":633,"end":643,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"difference"}},{"id":"tk_106_15","start":643,"end":644,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_107_0","start":645,"end":649,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"WRB","word":"When"}},{"id":"tk_108_1","start":650,"end":660,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"resorptive"}},{"id":"tk_109_2","start":661,"end":662,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"-LRB-","word":"-LRB-"}},{"id":"tk_110_3","start":662,"end":673,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastolytic"}},{"id":"tk_111_4","start":673,"end":674,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"-RRB-","word":"-RRB-"}},{"id":"tk_112_5","start":675,"end":680,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"giant"}},{"id":"tk_113_6","start":681,"end":685,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"cell"}},{"id":"tk_114_7","start":686,"end":695,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"reactions"}},{"id":"tk_115_8","start":696,"end":703,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBP","word":"develop"}},{"id":"tk_116_9","start":704,"end":706,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_117_10","start":707,"end":715,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"relation"}},{"id":"tk_118_11","start":716,"end":718,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_119_12","start":719,"end":730,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"actinically"}},{"id":"tk_120_13","start":731,"end":741,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"degenerate"}},{"id":"tk_121_14","start":742,"end":749,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_122_15","start":750,"end":756,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_123_16","start":757,"end":759,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_124_17","start":760,"end":763,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_125_18","start":764,"end":768,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_126_19","start":768,"end":769,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_127_20","start":770,"end":773,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_128_21","start":774,"end":781,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"papules"}},{"id":"tk_129_22","start":782,"end":786,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"WDT","word":"that"}},{"id":"tk_130_23","start":787,"end":792,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBP","word":"arise"}},{"id":"tk_131_24","start":793,"end":797,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"tend"}},{"id":"tk_132_25","start":798,"end":800,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_133_26","start":801,"end":805,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"form"}},{"id":"tk_134_27","start":806,"end":815,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBG","word":"expanding"}},{"id":"tk_135_28","start":815,"end":816,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":",","word":","}},{"id":"tk_136_29","start":817,"end":824,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"annular"}},{"id":"tk_137_30","start":825,"end":830,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"rings"}},{"id":"tk_138_31","start":830,"end":831,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_139_0","start":832,"end":833,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"A"}},{"id":"tk_140_1","start":834,"end":844,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"previously"}},{"id":"tk_141_2","start":845,"end":849,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBN","word":"used"}},{"id":"tk_142_3","start":850,"end":853,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_143_4","start":854,"end":865,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"appropriate"}},{"id":"tk_144_5","start":866,"end":870,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"name"}},{"id":"tk_145_6","start":871,"end":874,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"for"}},{"id":"tk_146_7","start":875,"end":880,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"these"}},{"id":"tk_147_8","start":881,"end":891,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"autoimmune"}},{"id":"tk_148_9","start":892,"end":899,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"lesions"}},{"id":"tk_149_10","start":900,"end":902,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_150_11","start":903,"end":906,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_151_12","start":907,"end":911,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_152_13","start":912,"end":914,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"is"}},{"id":"tk_153_14","start":915,"end":922,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"actinic"}},{"id":"tk_154_15","start":923,"end":932,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"granuloma"}},{"id":"tk_155_16","start":933,"end":940,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"because"}},{"id":"tk_156_17","start":941,"end":945,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"this"}},{"id":"tk_157_18","start":946,"end":950,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"name"}},{"id":"tk_158_19","start":951,"end":961,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"highlights"}},{"id":"tk_159_20","start":962,"end":965,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_160_21","start":966,"end":972,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"likely"}},{"id":"tk_161_22","start":973,"end":980,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"actinic"}},{"id":"tk_162_23","start":981,"end":987,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"origin"}},{"id":"tk_163_24","start":988,"end":991,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"CC","word":"and"}},{"id":"tk_164_25","start":992,"end":1004,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"pathogenesis"}},{"id":"tk_165_26","start":1005,"end":1007,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_166_27","start":1008,"end":1012,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"many"}},{"id":"tk_167_28","start":1013,"end":1017,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"such"}},{"id":"tk_168_29","start":1018,"end":1025,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"lesions"}},{"id":"tk_169_30","start":1025,"end":1026,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_170_0","start":1027,"end":1040,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"Granulomatous"}},{"id":"tk_171_1","start":1041,"end":1053,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"inflammation"}},{"id":"tk_172_2","start":1054,"end":1056,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_173_3","start":1057,"end":1067,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"connection"}},{"id":"tk_174_4","start":1068,"end":1072,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"with"}},{"id":"tk_175_5","start":1073,"end":1084,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"actinically"}},{"id":"tk_176_6","start":1085,"end":1095,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"degenerate"}},{"id":"tk_177_7","start":1096,"end":1104,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"internal"}},{"id":"tk_178_8","start":1105,"end":1112,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_179_9","start":1113,"end":1119,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"lamina"}},{"id":"tk_180_10","start":1120,"end":1127,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"appears"}},{"id":"tk_181_11","start":1128,"end":1130,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_182_12","start":1131,"end":1133,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"be"}},{"id":"tk_183_13","start":1134,"end":1137,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_184_14","start":1138,"end":1143,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"basis"}},{"id":"tk_185_15","start":1144,"end":1146,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_186_16","start":1147,"end":1155,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"temporal"}},{"id":"tk_187_17","start":1156,"end":1165,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"arteritis"}},{"id":"tk_188_18","start":1165,"end":1166,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_189_0","start":1167,"end":1174,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"Actinic"}},{"id":"tk_190_1","start":1175,"end":1185,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"granulomas"}},{"id":"tk_191_2","start":1186,"end":1189,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"MD","word":"may"}},{"id":"tk_192_3","start":1190,"end":1195,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VB","word":"occur"}},{"id":"tk_193_4","start":1196,"end":1198,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"in"}},{"id":"tk_194_5","start":1199,"end":1202,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"the"}},{"id":"tk_195_6","start":1203,"end":1207,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"skin"}},{"id":"tk_196_7","start":1208,"end":1220,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"concurrently"}},{"id":"tk_197_8","start":1221,"end":1225,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"with"}},{"id":"tk_198_9","start":1226,"end":1234,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"temporal"}},{"id":"tk_199_10","start":1235,"end":1244,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"arteritis"}},{"id":"tk_200_11","start":1244,"end":1245,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}},{"id":"tk_201_0","start":1246,"end":1247,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"A"}},{"id":"tk_202_1","start":1248,"end":1254,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"recent"}},{"id":"tk_203_2","start":1255,"end":1260,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"study"}},{"id":"tk_204_3","start":1261,"end":1263,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_205_4","start":1264,"end":1272,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"temporal"}},{"id":"tk_206_5","start":1273,"end":1282,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"arteritis"}},{"id":"tk_207_6","start":1283,"end":1291,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"RB","word":"strongly"}},{"id":"tk_208_7","start":1292,"end":1299,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBZ","word":"relates"}},{"id":"tk_209_8","start":1300,"end":1303,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"PRP$","word":"its"}},{"id":"tk_210_9","start":1304,"end":1311,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"JJ","word":"elastic"}},{"id":"tk_211_10","start":1312,"end":1318,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"tissue"}},{"id":"tk_212_11","start":1319,"end":1326,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NNS","word":"changes"}},{"id":"tk_213_12","start":1327,"end":1329,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"TO","word":"to"}},{"id":"tk_214_13","start":1330,"end":1335,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"DT","word":"those"}},{"id":"tk_215_14","start":1336,"end":1338,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"IN","word":"of"}},{"id":"tk_216_15","start":1339,"end":1340,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"``","word":"``"}},{"id":"tk_217_16","start":1340,"end":1351,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"VBN","word":"accelerated"}},{"id":"tk_218_17","start":1351,"end":1352,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"''","word":"''"}},{"id":"tk_219_18","start":1353,"end":1368,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":"NN","word":"atherosclerosis"}},{"id":"tk_220_19","start":1368,"end":1369,"@type":"http://vocab.lappsgrid.org/Token","features":{"pos":".","word":"."}}]}]} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif new file mode 100644 index 0000000000..11f9796609 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos-ref.lif @@ -0,0 +1,1820 @@ +{ + "discriminator" : "http://vocab.lappsgrid.org/ns/media/jsonld#lif", + "payload" : { + "@context" : "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "metadata" : { }, + "text" : { + "@value" : "Actinically degenerate elastic tissue is the likely antigenic basis of actinic granuloma of the skin and of temporal arteritis.\nStaining technique is paramount for detecting and assessing the severe degeneration that occurs in the elastic tissues of the skin and its arteries in response to prolonged exposure to actinic radiation. With a selective \"controlled\" hematoxylin-and-eosin stain, actinically damaged (\"elastotic\") elastic tissue stains blue, as Unna described, and contrasts with normal and simply hyperplastic elastic tissue, which stains red. \"Special\" elastic stains such as Orcein and Verhoeff do not demonstrate this difference. When resorptive (elastolytic) giant cell reactions develop in relation to actinically degenerate elastic tissue of the skin, the papules that arise tend to form expanding, annular rings. A previously used and appropriate name for these autoimmune lesions in the skin is actinic granuloma because this name highlights the likely actinic origin and pathogenesis of many such lesions. Granulomatous inflammation in connection with actinically degenerate internal elastic lamina appears to be the basis of temporal arteritis. Actinic granulomas may occur in the skin concurrently with temporal arteritis. A recent study of temporal arteritis strongly relates its elastic tissue changes to those of \"accelerated\" atherosclerosis.", + "@language" : "x-unspecified" + }, + "views" : [ { + "id" : "v1", + "metadata" : { + "contains" : { + "http://vocab.lappsgrid.org/Paragraph" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Paragraph" + }, + "http://vocab.lappsgrid.org/Sentence" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Sentence" + }, + "http://vocab.lappsgrid.org/Token" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Token" + }, + "http://vocab.lappsgrid.org/Token#lemma" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Lemma" + }, + "http://vocab.lappsgrid.org/Token#pos" : { + "producer" : "DKPro Core LIF Converter", + "type" : "POS" + }, + "http://vocab.lappsgrid.org/NamedEntity" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Named entity" + }, + "http://vocab.lappsgrid.org/Dependency" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Dependencies" + }, + "http://vocab.lappsgrid.org/PhraseStructure" : { + "producer" : "DKPro Core LIF Converter", + "type" : "Constituents" + } + } + }, + "annotations" : [ { + "id" : "tok-0", + "start" : 0, + "end" : 11, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNP" + } + }, { + "id" : "tok-1", + "start" : 12, + "end" : 22, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-2", + "start" : 23, + "end" : 30, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-3", + "start" : 31, + "end" : 37, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-4", + "start" : 38, + "end" : 40, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-5", + "start" : 41, + "end" : 44, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-6", + "start" : 45, + "end" : 51, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-7", + "start" : 52, + "end" : 61, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-8", + "start" : 62, + "end" : 67, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-9", + "start" : 68, + "end" : 70, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-10", + "start" : 71, + "end" : 78, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-11", + "start" : 79, + "end" : 88, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-12", + "start" : 89, + "end" : 91, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-13", + "start" : 92, + "end" : 95, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-14", + "start" : 96, + "end" : 100, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-15", + "start" : 101, + "end" : 104, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-16", + "start" : 105, + "end" : 107, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-17", + "start" : 108, + "end" : 116, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-18", + "start" : 117, + "end" : 126, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-19", + "start" : 126, + "end" : 127, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-20", + "start" : 128, + "end" : 136, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBG" + } + }, { + "id" : "tok-21", + "start" : 137, + "end" : 146, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-22", + "start" : 147, + "end" : 149, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-23", + "start" : 150, + "end" : 159, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-24", + "start" : 160, + "end" : 163, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-25", + "start" : 164, + "end" : 173, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBG" + } + }, { + "id" : "tok-26", + "start" : 174, + "end" : 177, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-27", + "start" : 178, + "end" : 187, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBG" + } + }, { + "id" : "tok-28", + "start" : 188, + "end" : 191, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-29", + "start" : 192, + "end" : 198, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-30", + "start" : 199, + "end" : 211, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-31", + "start" : 212, + "end" : 216, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "WDT" + } + }, { + "id" : "tok-32", + "start" : 217, + "end" : 223, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-33", + "start" : 224, + "end" : 226, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-34", + "start" : 227, + "end" : 230, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-35", + "start" : 231, + "end" : 238, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-36", + "start" : 239, + "end" : 246, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-37", + "start" : 247, + "end" : 249, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-38", + "start" : 250, + "end" : 253, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-39", + "start" : 254, + "end" : 258, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-40", + "start" : 259, + "end" : 262, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-41", + "start" : 263, + "end" : 266, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "PRP$" + } + }, { + "id" : "tok-42", + "start" : 267, + "end" : 275, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-43", + "start" : 276, + "end" : 278, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-44", + "start" : 279, + "end" : 287, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-45", + "start" : 288, + "end" : 290, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-46", + "start" : 291, + "end" : 300, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-47", + "start" : 301, + "end" : 309, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-48", + "start" : 310, + "end" : 312, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-49", + "start" : 313, + "end" : 320, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-50", + "start" : 321, + "end" : 330, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-51", + "start" : 330, + "end" : 331, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-52", + "start" : 332, + "end" : 336, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-53", + "start" : 337, + "end" : 338, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-54", + "start" : 339, + "end" : 348, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-55", + "start" : 349, + "end" : 350, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "``" + } + }, { + "id" : "tok-56", + "start" : 350, + "end" : 360, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBN" + } + }, { + "id" : "tok-57", + "start" : 360, + "end" : 361, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "''" + } + }, { + "id" : "tok-58", + "start" : 362, + "end" : 383, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-59", + "start" : 384, + "end" : 389, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBP" + } + }, { + "id" : "tok-60", + "start" : 389, + "end" : 390, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-61", + "start" : 391, + "end" : 402, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-62", + "start" : 403, + "end" : 410, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBN" + } + }, { + "id" : "tok-63", + "start" : 411, + "end" : 412, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "-LRB-" + } + }, { + "id" : "tok-64", + "start" : 412, + "end" : 413, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "``" + } + }, { + "id" : "tok-65", + "start" : 413, + "end" : 422, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-66", + "start" : 422, + "end" : 423, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "''" + } + }, { + "id" : "tok-67", + "start" : 423, + "end" : 424, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "-RRB-" + } + }, { + "id" : "tok-68", + "start" : 425, + "end" : 432, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-69", + "start" : 433, + "end" : 439, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-70", + "start" : 440, + "end" : 446, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-71", + "start" : 447, + "end" : 451, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-72", + "start" : 451, + "end" : 452, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-73", + "start" : 453, + "end" : 455, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-74", + "start" : 456, + "end" : 460, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNP" + } + }, { + "id" : "tok-75", + "start" : 461, + "end" : 470, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBD" + } + }, { + "id" : "tok-76", + "start" : 470, + "end" : 471, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-77", + "start" : 472, + "end" : 475, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-78", + "start" : 476, + "end" : 485, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-79", + "start" : 486, + "end" : 490, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-80", + "start" : 491, + "end" : 497, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-81", + "start" : 498, + "end" : 501, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-82", + "start" : 502, + "end" : 508, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-83", + "start" : 509, + "end" : 521, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-84", + "start" : 522, + "end" : 529, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-85", + "start" : 530, + "end" : 536, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-86", + "start" : 536, + "end" : 537, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-87", + "start" : 538, + "end" : 543, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "WDT" + } + }, { + "id" : "tok-88", + "start" : 544, + "end" : 550, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-89", + "start" : 551, + "end" : 554, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-90", + "start" : 554, + "end" : 555, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-91", + "start" : 556, + "end" : 557, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "``" + } + }, { + "id" : "tok-92", + "start" : 557, + "end" : 564, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-93", + "start" : 564, + "end" : 565, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "''" + } + }, { + "id" : "tok-94", + "start" : 566, + "end" : 573, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-95", + "start" : 574, + "end" : 580, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-96", + "start" : 581, + "end" : 585, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-97", + "start" : 586, + "end" : 588, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-98", + "start" : 589, + "end" : 595, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNP" + } + }, { + "id" : "tok-99", + "start" : 596, + "end" : 599, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-100", + "start" : 600, + "end" : 608, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNP" + } + }, { + "id" : "tok-101", + "start" : 609, + "end" : 611, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBP" + } + }, { + "id" : "tok-102", + "start" : 612, + "end" : 615, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-103", + "start" : 616, + "end" : 627, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-104", + "start" : 628, + "end" : 632, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-105", + "start" : 633, + "end" : 643, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-106", + "start" : 643, + "end" : 644, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-107", + "start" : 645, + "end" : 649, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "WRB" + } + }, { + "id" : "tok-108", + "start" : 650, + "end" : 660, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-109", + "start" : 661, + "end" : 662, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "-LRB-" + } + }, { + "id" : "tok-110", + "start" : 662, + "end" : 673, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-111", + "start" : 673, + "end" : 674, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "-RRB-" + } + }, { + "id" : "tok-112", + "start" : 675, + "end" : 680, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-113", + "start" : 681, + "end" : 685, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-114", + "start" : 686, + "end" : 695, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-115", + "start" : 696, + "end" : 703, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBP" + } + }, { + "id" : "tok-116", + "start" : 704, + "end" : 706, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-117", + "start" : 707, + "end" : 715, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-118", + "start" : 716, + "end" : 718, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-119", + "start" : 719, + "end" : 730, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-120", + "start" : 731, + "end" : 741, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-121", + "start" : 742, + "end" : 749, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-122", + "start" : 750, + "end" : 756, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-123", + "start" : 757, + "end" : 759, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-124", + "start" : 760, + "end" : 763, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-125", + "start" : 764, + "end" : 768, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-126", + "start" : 768, + "end" : 769, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-127", + "start" : 770, + "end" : 773, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-128", + "start" : 774, + "end" : 781, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-129", + "start" : 782, + "end" : 786, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "WDT" + } + }, { + "id" : "tok-130", + "start" : 787, + "end" : 792, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBP" + } + }, { + "id" : "tok-131", + "start" : 793, + "end" : 797, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-132", + "start" : 798, + "end" : 800, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-133", + "start" : 801, + "end" : 805, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-134", + "start" : 806, + "end" : 815, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBG" + } + }, { + "id" : "tok-135", + "start" : 815, + "end" : 816, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "," + } + }, { + "id" : "tok-136", + "start" : 817, + "end" : 824, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-137", + "start" : 825, + "end" : 830, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-138", + "start" : 830, + "end" : 831, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-139", + "start" : 832, + "end" : 833, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-140", + "start" : 834, + "end" : 844, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-141", + "start" : 845, + "end" : 849, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBN" + } + }, { + "id" : "tok-142", + "start" : 850, + "end" : 853, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-143", + "start" : 854, + "end" : 865, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-144", + "start" : 866, + "end" : 870, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-145", + "start" : 871, + "end" : 874, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-146", + "start" : 875, + "end" : 880, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-147", + "start" : 881, + "end" : 891, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-148", + "start" : 892, + "end" : 899, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-149", + "start" : 900, + "end" : 902, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-150", + "start" : 903, + "end" : 906, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-151", + "start" : 907, + "end" : 911, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-152", + "start" : 912, + "end" : 914, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-153", + "start" : 915, + "end" : 922, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-154", + "start" : 923, + "end" : 932, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-155", + "start" : 933, + "end" : 940, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-156", + "start" : 941, + "end" : 945, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-157", + "start" : 946, + "end" : 950, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-158", + "start" : 951, + "end" : 961, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-159", + "start" : 962, + "end" : 965, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-160", + "start" : 966, + "end" : 972, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-161", + "start" : 973, + "end" : 980, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-162", + "start" : 981, + "end" : 987, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-163", + "start" : 988, + "end" : 991, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "CC" + } + }, { + "id" : "tok-164", + "start" : 992, + "end" : 1004, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-165", + "start" : 1005, + "end" : 1007, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-166", + "start" : 1008, + "end" : 1012, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-167", + "start" : 1013, + "end" : 1017, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-168", + "start" : 1018, + "end" : 1025, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-169", + "start" : 1025, + "end" : 1026, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-170", + "start" : 1027, + "end" : 1040, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-171", + "start" : 1041, + "end" : 1053, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-172", + "start" : 1054, + "end" : 1056, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-173", + "start" : 1057, + "end" : 1067, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-174", + "start" : 1068, + "end" : 1072, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-175", + "start" : 1073, + "end" : 1084, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-176", + "start" : 1085, + "end" : 1095, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-177", + "start" : 1096, + "end" : 1104, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-178", + "start" : 1105, + "end" : 1112, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-179", + "start" : 1113, + "end" : 1119, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-180", + "start" : 1120, + "end" : 1127, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-181", + "start" : 1128, + "end" : 1130, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-182", + "start" : 1131, + "end" : 1133, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-183", + "start" : 1134, + "end" : 1137, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-184", + "start" : 1138, + "end" : 1143, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-185", + "start" : 1144, + "end" : 1146, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-186", + "start" : 1147, + "end" : 1155, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-187", + "start" : 1156, + "end" : 1165, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-188", + "start" : 1165, + "end" : 1166, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-189", + "start" : 1167, + "end" : 1174, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-190", + "start" : 1175, + "end" : 1185, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-191", + "start" : 1186, + "end" : 1189, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "MD" + } + }, { + "id" : "tok-192", + "start" : 1190, + "end" : 1195, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VB" + } + }, { + "id" : "tok-193", + "start" : 1196, + "end" : 1198, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-194", + "start" : 1199, + "end" : 1202, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-195", + "start" : 1203, + "end" : 1207, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-196", + "start" : 1208, + "end" : 1220, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-197", + "start" : 1221, + "end" : 1225, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-198", + "start" : 1226, + "end" : 1234, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-199", + "start" : 1235, + "end" : 1244, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-200", + "start" : 1244, + "end" : 1245, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + }, { + "id" : "tok-201", + "start" : 1246, + "end" : 1247, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-202", + "start" : 1248, + "end" : 1254, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-203", + "start" : 1255, + "end" : 1260, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-204", + "start" : 1261, + "end" : 1263, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-205", + "start" : 1264, + "end" : 1272, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-206", + "start" : 1273, + "end" : 1282, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-207", + "start" : 1283, + "end" : 1291, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "RB" + } + }, { + "id" : "tok-208", + "start" : 1292, + "end" : 1299, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBZ" + } + }, { + "id" : "tok-209", + "start" : 1300, + "end" : 1303, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "PRP$" + } + }, { + "id" : "tok-210", + "start" : 1304, + "end" : 1311, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "JJ" + } + }, { + "id" : "tok-211", + "start" : 1312, + "end" : 1318, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-212", + "start" : 1319, + "end" : 1326, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NNS" + } + }, { + "id" : "tok-213", + "start" : 1327, + "end" : 1329, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "TO" + } + }, { + "id" : "tok-214", + "start" : 1330, + "end" : 1335, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "DT" + } + }, { + "id" : "tok-215", + "start" : 1336, + "end" : 1338, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "IN" + } + }, { + "id" : "tok-216", + "start" : 1339, + "end" : 1340, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "``" + } + }, { + "id" : "tok-217", + "start" : 1340, + "end" : 1351, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "VBN" + } + }, { + "id" : "tok-218", + "start" : 1351, + "end" : 1352, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "''" + } + }, { + "id" : "tok-219", + "start" : 1353, + "end" : 1368, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "NN" + } + }, { + "id" : "tok-220", + "start" : 1368, + "end" : 1369, + "@type" : "http://vocab.lappsgrid.org/Token", + "features" : { + "pos" : "." + } + } ] + } ] + }, + "parameters" : { } +} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos.lif b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos.lif new file mode 100644 index 0000000000..438d91cd9a --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/lif/stanford-pos.lif @@ -0,0 +1,2237 @@ +{ + "discriminator": "http://vocab.lappsgrid.org/ns/media/jsonld#lif", + "payload": { + "@context": "http://vocab.lappsgrid.org/context-1.0.0.jsonld", + "metadata": { + "sourceid": "10025748", + "sourcedb": "PubMed" + }, + "text": { + "@value": "Actinically degenerate elastic tissue is the likely antigenic basis of actinic granuloma of the skin and of temporal arteritis.\nStaining technique is paramount for detecting and assessing the severe degeneration that occurs in the elastic tissues of the skin and its arteries in response to prolonged exposure to actinic radiation. With a selective \"controlled\" hematoxylin-and-eosin stain, actinically damaged (\"elastotic\") elastic tissue stains blue, as Unna described, and contrasts with normal and simply hyperplastic elastic tissue, which stains red. \"Special\" elastic stains such as Orcein and Verhoeff do not demonstrate this difference. When resorptive (elastolytic) giant cell reactions develop in relation to actinically degenerate elastic tissue of the skin, the papules that arise tend to form expanding, annular rings. A previously used and appropriate name for these autoimmune lesions in the skin is actinic granuloma because this name highlights the likely actinic origin and pathogenesis of many such lesions. Granulomatous inflammation in connection with actinically degenerate internal elastic lamina appears to be the basis of temporal arteritis. Actinic granulomas may occur in the skin concurrently with temporal arteritis. A recent study of temporal arteritis strongly relates its elastic tissue changes to those of \"accelerated\" atherosclerosis." + }, + "views": [ + { + "metadata": { + "contains": { + "http://vocab.lappsgrid.org/Token#pos": { + "producer": "edu.brandeis.cs.lappsgrid.stanford.corenlp.POSTagger:2.0.4", + "type": "tagger:stanford" + } + } + }, + "annotations": [ + { + "id": "tk_0_0", + "start": 0, + "end": 11, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNP", + "word": "Actinically" + } + }, + { + "id": "tk_1_1", + "start": 12, + "end": 22, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "degenerate" + } + }, + { + "id": "tk_2_2", + "start": 23, + "end": 30, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_3_3", + "start": 31, + "end": 37, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_4_4", + "start": 38, + "end": 40, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "is" + } + }, + { + "id": "tk_5_5", + "start": 41, + "end": 44, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_6_6", + "start": 45, + "end": 51, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "likely" + } + }, + { + "id": "tk_7_7", + "start": 52, + "end": 61, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "antigenic" + } + }, + { + "id": "tk_8_8", + "start": 62, + "end": 67, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "basis" + } + }, + { + "id": "tk_9_9", + "start": 68, + "end": 70, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_10_10", + "start": 71, + "end": 78, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "actinic" + } + }, + { + "id": "tk_11_11", + "start": 79, + "end": 88, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "granuloma" + } + }, + { + "id": "tk_12_12", + "start": 89, + "end": 91, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_13_13", + "start": 92, + "end": 95, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_14_14", + "start": 96, + "end": 100, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_15_15", + "start": 101, + "end": 104, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_16_16", + "start": 105, + "end": 107, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_17_17", + "start": 108, + "end": 116, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "temporal" + } + }, + { + "id": "tk_18_18", + "start": 117, + "end": 126, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "arteritis" + } + }, + { + "id": "tk_19_19", + "start": 126, + "end": 127, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_20_0", + "start": 128, + "end": 136, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBG", + "word": "Staining" + } + }, + { + "id": "tk_21_1", + "start": 137, + "end": 146, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "technique" + } + }, + { + "id": "tk_22_2", + "start": 147, + "end": 149, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "is" + } + }, + { + "id": "tk_23_3", + "start": 150, + "end": 159, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "paramount" + } + }, + { + "id": "tk_24_4", + "start": 160, + "end": 163, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "for" + } + }, + { + "id": "tk_25_5", + "start": 164, + "end": 173, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBG", + "word": "detecting" + } + }, + { + "id": "tk_26_6", + "start": 174, + "end": 177, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_27_7", + "start": 178, + "end": 187, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBG", + "word": "assessing" + } + }, + { + "id": "tk_28_8", + "start": 188, + "end": 191, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_29_9", + "start": 192, + "end": 198, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "severe" + } + }, + { + "id": "tk_30_10", + "start": 199, + "end": 211, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "degeneration" + } + }, + { + "id": "tk_31_11", + "start": 212, + "end": 216, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "WDT", + "word": "that" + } + }, + { + "id": "tk_32_12", + "start": 217, + "end": 223, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "occurs" + } + }, + { + "id": "tk_33_13", + "start": 224, + "end": 226, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_34_14", + "start": 227, + "end": 230, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_35_15", + "start": 231, + "end": 238, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_36_16", + "start": 239, + "end": 246, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "tissues" + } + }, + { + "id": "tk_37_17", + "start": 247, + "end": 249, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_38_18", + "start": 250, + "end": 253, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_39_19", + "start": 254, + "end": 258, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_40_20", + "start": 259, + "end": 262, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_41_21", + "start": 263, + "end": 266, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "PRP$", + "word": "its" + } + }, + { + "id": "tk_42_22", + "start": 267, + "end": 275, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "arteries" + } + }, + { + "id": "tk_43_23", + "start": 276, + "end": 278, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_44_24", + "start": 279, + "end": 287, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "response" + } + }, + { + "id": "tk_45_25", + "start": 288, + "end": 290, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_46_26", + "start": 291, + "end": 300, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "prolonged" + } + }, + { + "id": "tk_47_27", + "start": 301, + "end": 309, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "exposure" + } + }, + { + "id": "tk_48_28", + "start": 310, + "end": 312, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_49_29", + "start": 313, + "end": 320, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "actinic" + } + }, + { + "id": "tk_50_30", + "start": 321, + "end": 330, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "radiation" + } + }, + { + "id": "tk_51_31", + "start": 330, + "end": 331, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_52_0", + "start": 332, + "end": 336, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "With" + } + }, + { + "id": "tk_53_1", + "start": 337, + "end": 338, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "a" + } + }, + { + "id": "tk_54_2", + "start": 339, + "end": 348, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "selective" + } + }, + { + "id": "tk_55_3", + "start": 349, + "end": 350, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "``", + "word": "``" + } + }, + { + "id": "tk_56_4", + "start": 350, + "end": 360, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBN", + "word": "controlled" + } + }, + { + "id": "tk_57_5", + "start": 360, + "end": 361, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "''", + "word": "''" + } + }, + { + "id": "tk_58_6", + "start": 362, + "end": 383, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "hematoxylin-and-eosin" + } + }, + { + "id": "tk_59_7", + "start": 384, + "end": 389, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBP", + "word": "stain" + } + }, + { + "id": "tk_60_8", + "start": 389, + "end": 390, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_61_9", + "start": 391, + "end": 402, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "actinically" + } + }, + { + "id": "tk_62_10", + "start": 403, + "end": 410, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBN", + "word": "damaged" + } + }, + { + "id": "tk_63_11", + "start": 411, + "end": 412, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "-LRB-", + "word": "-LRB-" + } + }, + { + "id": "tk_64_12", + "start": 412, + "end": 413, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "``", + "word": "``" + } + }, + { + "id": "tk_65_13", + "start": 413, + "end": 422, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastotic" + } + }, + { + "id": "tk_66_14", + "start": 422, + "end": 423, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "''", + "word": "''" + } + }, + { + "id": "tk_67_15", + "start": 423, + "end": 424, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "-RRB-", + "word": "-RRB-" + } + }, + { + "id": "tk_68_16", + "start": 425, + "end": 432, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_69_17", + "start": 433, + "end": 439, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_70_18", + "start": 440, + "end": 446, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "stains" + } + }, + { + "id": "tk_71_19", + "start": 447, + "end": 451, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "blue" + } + }, + { + "id": "tk_72_20", + "start": 451, + "end": 452, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_73_21", + "start": 453, + "end": 455, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "as" + } + }, + { + "id": "tk_74_22", + "start": 456, + "end": 460, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNP", + "word": "Unna" + } + }, + { + "id": "tk_75_23", + "start": 461, + "end": 470, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBD", + "word": "described" + } + }, + { + "id": "tk_76_24", + "start": 470, + "end": 471, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_77_25", + "start": 472, + "end": 475, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_78_26", + "start": 476, + "end": 485, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "contrasts" + } + }, + { + "id": "tk_79_27", + "start": 486, + "end": 490, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "with" + } + }, + { + "id": "tk_80_28", + "start": 491, + "end": 497, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "normal" + } + }, + { + "id": "tk_81_29", + "start": 498, + "end": 501, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_82_30", + "start": 502, + "end": 508, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "simply" + } + }, + { + "id": "tk_83_31", + "start": 509, + "end": 521, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "hyperplastic" + } + }, + { + "id": "tk_84_32", + "start": 522, + "end": 529, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_85_33", + "start": 530, + "end": 536, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_86_34", + "start": 536, + "end": 537, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_87_35", + "start": 538, + "end": 543, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "WDT", + "word": "which" + } + }, + { + "id": "tk_88_36", + "start": 544, + "end": 550, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "stains" + } + }, + { + "id": "tk_89_37", + "start": 551, + "end": 554, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "red" + } + }, + { + "id": "tk_90_38", + "start": 554, + "end": 555, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_91_0", + "start": 556, + "end": 557, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "``", + "word": "``" + } + }, + { + "id": "tk_92_1", + "start": 557, + "end": 564, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "Special" + } + }, + { + "id": "tk_93_2", + "start": 564, + "end": 565, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "''", + "word": "''" + } + }, + { + "id": "tk_94_3", + "start": 566, + "end": 573, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_95_4", + "start": 574, + "end": 580, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "stains" + } + }, + { + "id": "tk_96_5", + "start": 581, + "end": 585, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "such" + } + }, + { + "id": "tk_97_6", + "start": 586, + "end": 588, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "as" + } + }, + { + "id": "tk_98_7", + "start": 589, + "end": 595, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNP", + "word": "Orcein" + } + }, + { + "id": "tk_99_8", + "start": 596, + "end": 599, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_100_9", + "start": 600, + "end": 608, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNP", + "word": "Verhoeff" + } + }, + { + "id": "tk_101_10", + "start": 609, + "end": 611, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBP", + "word": "do" + } + }, + { + "id": "tk_102_11", + "start": 612, + "end": 615, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "not" + } + }, + { + "id": "tk_103_12", + "start": 616, + "end": 627, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "demonstrate" + } + }, + { + "id": "tk_104_13", + "start": 628, + "end": 632, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "this" + } + }, + { + "id": "tk_105_14", + "start": 633, + "end": 643, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "difference" + } + }, + { + "id": "tk_106_15", + "start": 643, + "end": 644, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_107_0", + "start": 645, + "end": 649, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "WRB", + "word": "When" + } + }, + { + "id": "tk_108_1", + "start": 650, + "end": 660, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "resorptive" + } + }, + { + "id": "tk_109_2", + "start": 661, + "end": 662, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "-LRB-", + "word": "-LRB-" + } + }, + { + "id": "tk_110_3", + "start": 662, + "end": 673, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastolytic" + } + }, + { + "id": "tk_111_4", + "start": 673, + "end": 674, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "-RRB-", + "word": "-RRB-" + } + }, + { + "id": "tk_112_5", + "start": 675, + "end": 680, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "giant" + } + }, + { + "id": "tk_113_6", + "start": 681, + "end": 685, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "cell" + } + }, + { + "id": "tk_114_7", + "start": 686, + "end": 695, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "reactions" + } + }, + { + "id": "tk_115_8", + "start": 696, + "end": 703, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBP", + "word": "develop" + } + }, + { + "id": "tk_116_9", + "start": 704, + "end": 706, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_117_10", + "start": 707, + "end": 715, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "relation" + } + }, + { + "id": "tk_118_11", + "start": 716, + "end": 718, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_119_12", + "start": 719, + "end": 730, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "actinically" + } + }, + { + "id": "tk_120_13", + "start": 731, + "end": 741, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "degenerate" + } + }, + { + "id": "tk_121_14", + "start": 742, + "end": 749, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_122_15", + "start": 750, + "end": 756, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_123_16", + "start": 757, + "end": 759, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_124_17", + "start": 760, + "end": 763, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_125_18", + "start": 764, + "end": 768, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_126_19", + "start": 768, + "end": 769, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_127_20", + "start": 770, + "end": 773, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_128_21", + "start": 774, + "end": 781, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "papules" + } + }, + { + "id": "tk_129_22", + "start": 782, + "end": 786, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "WDT", + "word": "that" + } + }, + { + "id": "tk_130_23", + "start": 787, + "end": 792, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBP", + "word": "arise" + } + }, + { + "id": "tk_131_24", + "start": 793, + "end": 797, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "tend" + } + }, + { + "id": "tk_132_25", + "start": 798, + "end": 800, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_133_26", + "start": 801, + "end": 805, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "form" + } + }, + { + "id": "tk_134_27", + "start": 806, + "end": 815, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBG", + "word": "expanding" + } + }, + { + "id": "tk_135_28", + "start": 815, + "end": 816, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ",", + "word": "," + } + }, + { + "id": "tk_136_29", + "start": 817, + "end": 824, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "annular" + } + }, + { + "id": "tk_137_30", + "start": 825, + "end": 830, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "rings" + } + }, + { + "id": "tk_138_31", + "start": 830, + "end": 831, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_139_0", + "start": 832, + "end": 833, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "A" + } + }, + { + "id": "tk_140_1", + "start": 834, + "end": 844, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "previously" + } + }, + { + "id": "tk_141_2", + "start": 845, + "end": 849, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBN", + "word": "used" + } + }, + { + "id": "tk_142_3", + "start": 850, + "end": 853, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_143_4", + "start": 854, + "end": 865, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "appropriate" + } + }, + { + "id": "tk_144_5", + "start": 866, + "end": 870, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "name" + } + }, + { + "id": "tk_145_6", + "start": 871, + "end": 874, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "for" + } + }, + { + "id": "tk_146_7", + "start": 875, + "end": 880, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "these" + } + }, + { + "id": "tk_147_8", + "start": 881, + "end": 891, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "autoimmune" + } + }, + { + "id": "tk_148_9", + "start": 892, + "end": 899, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "lesions" + } + }, + { + "id": "tk_149_10", + "start": 900, + "end": 902, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_150_11", + "start": 903, + "end": 906, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_151_12", + "start": 907, + "end": 911, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_152_13", + "start": 912, + "end": 914, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "is" + } + }, + { + "id": "tk_153_14", + "start": 915, + "end": 922, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "actinic" + } + }, + { + "id": "tk_154_15", + "start": 923, + "end": 932, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "granuloma" + } + }, + { + "id": "tk_155_16", + "start": 933, + "end": 940, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "because" + } + }, + { + "id": "tk_156_17", + "start": 941, + "end": 945, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "this" + } + }, + { + "id": "tk_157_18", + "start": 946, + "end": 950, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "name" + } + }, + { + "id": "tk_158_19", + "start": 951, + "end": 961, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "highlights" + } + }, + { + "id": "tk_159_20", + "start": 962, + "end": 965, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_160_21", + "start": 966, + "end": 972, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "likely" + } + }, + { + "id": "tk_161_22", + "start": 973, + "end": 980, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "actinic" + } + }, + { + "id": "tk_162_23", + "start": 981, + "end": 987, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "origin" + } + }, + { + "id": "tk_163_24", + "start": 988, + "end": 991, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "CC", + "word": "and" + } + }, + { + "id": "tk_164_25", + "start": 992, + "end": 1004, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "pathogenesis" + } + }, + { + "id": "tk_165_26", + "start": 1005, + "end": 1007, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_166_27", + "start": 1008, + "end": 1012, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "many" + } + }, + { + "id": "tk_167_28", + "start": 1013, + "end": 1017, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "such" + } + }, + { + "id": "tk_168_29", + "start": 1018, + "end": 1025, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "lesions" + } + }, + { + "id": "tk_169_30", + "start": 1025, + "end": 1026, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_170_0", + "start": 1027, + "end": 1040, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "Granulomatous" + } + }, + { + "id": "tk_171_1", + "start": 1041, + "end": 1053, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "inflammation" + } + }, + { + "id": "tk_172_2", + "start": 1054, + "end": 1056, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_173_3", + "start": 1057, + "end": 1067, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "connection" + } + }, + { + "id": "tk_174_4", + "start": 1068, + "end": 1072, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "with" + } + }, + { + "id": "tk_175_5", + "start": 1073, + "end": 1084, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "actinically" + } + }, + { + "id": "tk_176_6", + "start": 1085, + "end": 1095, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "degenerate" + } + }, + { + "id": "tk_177_7", + "start": 1096, + "end": 1104, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "internal" + } + }, + { + "id": "tk_178_8", + "start": 1105, + "end": 1112, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_179_9", + "start": 1113, + "end": 1119, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "lamina" + } + }, + { + "id": "tk_180_10", + "start": 1120, + "end": 1127, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "appears" + } + }, + { + "id": "tk_181_11", + "start": 1128, + "end": 1130, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_182_12", + "start": 1131, + "end": 1133, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "be" + } + }, + { + "id": "tk_183_13", + "start": 1134, + "end": 1137, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_184_14", + "start": 1138, + "end": 1143, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "basis" + } + }, + { + "id": "tk_185_15", + "start": 1144, + "end": 1146, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_186_16", + "start": 1147, + "end": 1155, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "temporal" + } + }, + { + "id": "tk_187_17", + "start": 1156, + "end": 1165, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "arteritis" + } + }, + { + "id": "tk_188_18", + "start": 1165, + "end": 1166, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_189_0", + "start": 1167, + "end": 1174, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "Actinic" + } + }, + { + "id": "tk_190_1", + "start": 1175, + "end": 1185, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "granulomas" + } + }, + { + "id": "tk_191_2", + "start": 1186, + "end": 1189, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "MD", + "word": "may" + } + }, + { + "id": "tk_192_3", + "start": 1190, + "end": 1195, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VB", + "word": "occur" + } + }, + { + "id": "tk_193_4", + "start": 1196, + "end": 1198, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "in" + } + }, + { + "id": "tk_194_5", + "start": 1199, + "end": 1202, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "the" + } + }, + { + "id": "tk_195_6", + "start": 1203, + "end": 1207, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "skin" + } + }, + { + "id": "tk_196_7", + "start": 1208, + "end": 1220, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "concurrently" + } + }, + { + "id": "tk_197_8", + "start": 1221, + "end": 1225, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "with" + } + }, + { + "id": "tk_198_9", + "start": 1226, + "end": 1234, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "temporal" + } + }, + { + "id": "tk_199_10", + "start": 1235, + "end": 1244, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "arteritis" + } + }, + { + "id": "tk_200_11", + "start": 1244, + "end": 1245, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + }, + { + "id": "tk_201_0", + "start": 1246, + "end": 1247, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "A" + } + }, + { + "id": "tk_202_1", + "start": 1248, + "end": 1254, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "recent" + } + }, + { + "id": "tk_203_2", + "start": 1255, + "end": 1260, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "study" + } + }, + { + "id": "tk_204_3", + "start": 1261, + "end": 1263, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_205_4", + "start": 1264, + "end": 1272, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "temporal" + } + }, + { + "id": "tk_206_5", + "start": 1273, + "end": 1282, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "arteritis" + } + }, + { + "id": "tk_207_6", + "start": 1283, + "end": 1291, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "RB", + "word": "strongly" + } + }, + { + "id": "tk_208_7", + "start": 1292, + "end": 1299, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBZ", + "word": "relates" + } + }, + { + "id": "tk_209_8", + "start": 1300, + "end": 1303, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "PRP$", + "word": "its" + } + }, + { + "id": "tk_210_9", + "start": 1304, + "end": 1311, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "JJ", + "word": "elastic" + } + }, + { + "id": "tk_211_10", + "start": 1312, + "end": 1318, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "tissue" + } + }, + { + "id": "tk_212_11", + "start": 1319, + "end": 1326, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NNS", + "word": "changes" + } + }, + { + "id": "tk_213_12", + "start": 1327, + "end": 1329, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "TO", + "word": "to" + } + }, + { + "id": "tk_214_13", + "start": 1330, + "end": 1335, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "DT", + "word": "those" + } + }, + { + "id": "tk_215_14", + "start": 1336, + "end": 1338, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "IN", + "word": "of" + } + }, + { + "id": "tk_216_15", + "start": 1339, + "end": 1340, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "``", + "word": "``" + } + }, + { + "id": "tk_217_16", + "start": 1340, + "end": 1351, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "VBN", + "word": "accelerated" + } + }, + { + "id": "tk_218_17", + "start": 1351, + "end": 1352, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "''", + "word": "''" + } + }, + { + "id": "tk_219_18", + "start": 1353, + "end": 1368, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": "NN", + "word": "atherosclerosis" + } + }, + { + "id": "tk_220_19", + "start": 1368, + "end": 1369, + "@type": "http://vocab.lappsgrid.org/Token", + "features": { + "pos": ".", + "word": "." + } + } + ] + } + ] + } +} \ No newline at end of file diff --git a/dkpro-core-io-lif-asl/src/test/resources/log4j.properties b/dkpro-core-io-lif-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-lif-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-lif-asl/src/test/resources/log4j2.xml b/dkpro-core-io-lif-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-lif-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-lxf-asl/pom.xml b/dkpro-core-io-lxf-asl/pom.xml index 524966ad3a..5ebc73c2e2 100644 --- a/dkpro-core-io-lxf-asl/pom.xml +++ b/dkpro-core-io-lxf-asl/pom.xml @@ -18,29 +18,26 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - org.dkpro.core dkpro-core-io-lxf-asl DKPro Core ASL - IO - LXF + https://dkpro.github.io/dkpro-core/ com.fasterxml.jackson.core jackson-databind - 2.7.1 com.fasterxml.jackson.core jackson-core - 2.7.1 com.fasterxml.jackson.core jackson-annotations - 2.7.1 org.apache.uima @@ -51,28 +48,32 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -80,13 +81,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl + org.dkpro.core + dkpro-core-opennlp-asl test @@ -98,9 +99,9 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT pom import diff --git a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/LxfReader.java b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/LxfReader.java index 95d3a37f01..6656c34ab3 100644 --- a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/LxfReader.java +++ b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/LxfReader.java @@ -28,16 +28,21 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; import org.dkpro.core.io.lxf.internal.Lxf2DKPro; import org.dkpro.core.io.lxf.internal.model.LxfGraph; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.ObjectMapper; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; -@ResourceMetaData(name="CLARINO LAP LXF Reader") +/** + * Reader for the CLARINO LAP LXF format. + */ +@ResourceMetaData(name = "CLARINO LAP LXF Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_LXF_JSON}) @TypeCapability( outputs = { @@ -77,5 +82,5 @@ public void getNext(JCas aCAS) // Allow to get information about everything added beyond this point aCAS.getCasImpl().createMarker(); - } + } } diff --git a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/LxfWriter.java b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/LxfWriter.java index 806a189b5b..f772521a1d 100644 --- a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/LxfWriter.java +++ b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/LxfWriter.java @@ -30,18 +30,23 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; import org.dkpro.core.io.lxf.internal.DKPro2Lxf; import org.dkpro.core.io.lxf.internal.model.LxfGraph; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.ObjectMapper; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; -@ResourceMetaData(name="CLARINO LAP LXF Writer") +/** + * Writer for the CLARINO LAP LXF format. + */ +@ResourceMetaData(name = "CLARINO LAP LXF Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_LXF_JSON}) @TypeCapability( inputs = { @@ -55,13 +60,17 @@ public class LxfWriter extends JCasFileWriter_ImplBase { /** - * Specify the suffix of output files. Default value .lxf. If the suffix is not - * needed, provide an empty string as value. + * Use this filename extension. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".lxf") private String filenameSuffix; + /** + * Write only the changes to the annotations. This works only in conjunction with the + * {@link LxfReader}. + */ public static final String PARAM_DELTA = "delta"; @ConfigurationParameter(name = PARAM_DELTA, mandatory = true, defaultValue = "false") private boolean delta; @@ -87,7 +96,8 @@ public void process(JCas aJCas) if (delta) { DocumentMetaData dmd = DocumentMetaData.get(aJCas); - try (InputStream is = new BufferedInputStream(new URL(dmd.getDocumentUri()).openStream())) { + try (InputStream is = new BufferedInputStream( + new URL(dmd.getDocumentUri()).openStream())) { LxfGraph reference = mapper.readValue(is, LxfGraph.class); DKPro2Lxf.convert(aJCas, reference, lxf); } diff --git a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/DKPro2Lxf.java b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/DKPro2Lxf.java index 029cfe93aa..f727c5b71d 100644 --- a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/DKPro2Lxf.java +++ b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/DKPro2Lxf.java @@ -28,6 +28,7 @@ import java.util.Collection; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.uima.cas.FeatureStructure; @@ -63,13 +64,12 @@ public static void convert(JCas aJCas, LxfGraph aSource, LxfGraph aTarget) * the layer was present in the source than the tool from the source will be used for the layer. * Otherwise the toolName will be used. * - * @param toolName + * @param aToolName * - Tool name for new layers * @param aSource * - original lxf for DKPro - * @return */ - public static Map createIdMap(String toolName, LxfGraph aSource) + public static Map createIdMap(String aToolName, LxfGraph aSource) { Map ids = new HashMap<>(); if (aSource != null) { @@ -77,14 +77,18 @@ public static Map createIdMap(String toolName, LxfGraph aSource) ids.put(n.getType(), n.getOrigin()); } } - if (!ids.containsKey(LAYER_DEPENDENCY)) - ids.put(LAYER_DEPENDENCY, toolName); - if (!ids.containsKey(LAYER_MORPHOLOGY)) - ids.put(LAYER_MORPHOLOGY, toolName); - if (!ids.containsKey(LAYER_SENTENCE)) - ids.put(LAYER_SENTENCE, toolName); - if (!ids.containsKey(LAYER_TOKEN)) - ids.put(LAYER_TOKEN, toolName); + if (!ids.containsKey(LAYER_DEPENDENCY)) { + ids.put(LAYER_DEPENDENCY, aToolName); + } + if (!ids.containsKey(LAYER_MORPHOLOGY)) { + ids.put(LAYER_MORPHOLOGY, aToolName); + } + if (!ids.containsKey(LAYER_SENTENCE)) { + ids.put(LAYER_SENTENCE, aToolName); + } + if (!ids.containsKey(LAYER_TOKEN)) { + ids.put(LAYER_TOKEN, aToolName); + } return ids; } @@ -97,34 +101,33 @@ public static Map createIdMap(String toolName, LxfGraph aSource) * the original LXF. If this is non-null, then delta-mode is enabled. * @param aTarget * the target LXF. - * @param tooName + * @param aToolName * the name of the tool generating the new annotation - * @param ids + * @param aIds * The ids of the tool responsible for generation of the annotation Layer. The key is * the annotation layer. The value is the tool that generates the annotation. */ public static void convert(JCas aJCas, LxfGraph aSource, LxfGraph aTarget, - Map ids, String toolName) + Map aIds, String aToolName) { if (aSource == null) { aTarget.setMedia(new LxfText(aJCas.getDocumentText())); } - ToolGeneratorIndex toolEdgeIndex = new ToolGeneratorIndex(ids.values()); - ToolGeneratorIndex toolNodeIndex = new ToolGeneratorIndex(ids.values()); - ToolGeneratorIndex toolRegionIndex = new ToolGeneratorIndex(ids.values()); + ToolGeneratorIndex toolEdgeIndex = new ToolGeneratorIndex(aIds.values()); + ToolGeneratorIndex toolNodeIndex = new ToolGeneratorIndex(aIds.values()); + ToolGeneratorIndex toolRegionIndex = new ToolGeneratorIndex(aIds.values()); NodeIterator iter = new NodeIterator(aSource); - Map> idxSentTok = indexCovered(aJCas, Sentence.class, - Token.class); + Map> idxSentTok = indexCovered(aJCas, Sentence.class, Token.class); - Map> idxSentDep = indexCovered(aJCas, Sentence.class, + Map> idxSentDep = indexCovered(aJCas, Sentence.class, Dependency.class); for (Sentence sentence : select(aJCas, Sentence.class)) { LxfNode sentenceNode; - String toolid = ids.get(LAYER_SENTENCE); + String toolid = aIds.get(LAYER_SENTENCE); if (aSource == null || needsExport(aJCas, sentence)) { // Sentence region @@ -148,7 +151,7 @@ public static void convert(JCas aJCas, LxfGraph aSource, LxfGraph aTarget, for (Token token : tokens) { // Convert or obtain token node LxfNode tokenNode; - toolid = ids.get(LAYER_TOKEN); + toolid = aIds.get(LAYER_TOKEN); if (aSource == null || needsExport(aJCas, token)) { LxfRegion tokenRegion = new LxfRegion(toolid, toolRegionIndex.nextIndex(toolid), @@ -168,7 +171,7 @@ public static void convert(JCas aJCas, LxfGraph aSource, LxfGraph aTarget, tokenNode = iter.next(toolid, LAYER_TOKEN); } - toolid = ids.get(LAYER_MORPHOLOGY); + toolid = aIds.get(LAYER_MORPHOLOGY); // Convert POS if exists - if we create a node, pass it on to the lemma conversion // as well @@ -200,7 +203,7 @@ public static void convert(JCas aJCas, LxfGraph aSource, LxfGraph aTarget, if (lemma != null && (aSource == null || needsExport(aJCas, lemma))) { LxfNode lemmaNode = newMorphNode ? morphNode : null; if (lemmaNode == null) { - lemmaNode = new LxfNode(LAYER_MORPHOLOGY, toolName, + lemmaNode = new LxfNode(LAYER_MORPHOLOGY, aToolName, toolNodeIndex.nextIndex(toolid), 0); aTarget.addNode(lemmaNode); aTarget.addEdge(new LxfEdge(lemmaNode.getOrigin(), @@ -212,15 +215,16 @@ public static void convert(JCas aJCas, LxfGraph aSource, LxfGraph aTarget, } - toolid = ids.get(LAYER_DEPENDENCY); + toolid = aIds.get(LAYER_DEPENDENCY); // Dependencies Collection deps = idxSentDep.get(sentence); for (Dependency dep : deps) { - if (aSource != null && !needsExport(aJCas, dep)) + if (aSource != null && !needsExport(aJCas, dep)) { continue; + } LxfNode depNode = new LxfNode(LAYER_DEPENDENCY, toolid, toolNodeIndex.nextIndex(toolid), 0); diff --git a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfAnnotatedObject.java b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfAnnotatedObject.java index 1deb2ed984..01d7678889 100644 --- a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfAnnotatedObject.java +++ b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfAnnotatedObject.java @@ -21,7 +21,6 @@ import java.util.Map; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonProperty; public class LxfAnnotatedObject extends LxfObject diff --git a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfGraph.java b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfGraph.java index 8888e1eb8f..b674b9ffe0 100644 --- a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfGraph.java +++ b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfGraph.java @@ -18,6 +18,7 @@ package org.dkpro.core.io.lxf.internal.model; import static java.util.Arrays.asList; + import java.util.ArrayList; import java.util.List; import java.util.Optional; diff --git a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfObject.java b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfObject.java index d44e607407..cc208ab2ce 100644 --- a/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfObject.java +++ b/dkpro-core-io-lxf-asl/src/main/java/org/dkpro/core/io/lxf/internal/model/LxfObject.java @@ -17,7 +17,9 @@ */ package org.dkpro.core.io.lxf.internal.model; -import static org.dkpro.core.io.lxf.internal.model.LxfVocabulary.*; +import static org.dkpro.core.io.lxf.internal.model.LxfVocabulary.ENTITY_EDGE; +import static org.dkpro.core.io.lxf.internal.model.LxfVocabulary.ENTITY_NODE; +import static org.dkpro.core.io.lxf.internal.model.LxfVocabulary.ENTITY_REGION; import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; diff --git a/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderTest.java b/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderTest.java index 933fef37a3..e0621817ae 100644 --- a/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderTest.java +++ b/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderTest.java @@ -17,18 +17,17 @@ */ package org.dkpro.core.io.lxf; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertDependencies; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertDependencies; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; import static org.junit.Assert.assertEquals; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.JCasIterable; import org.apache.uima.jcas.JCas; -import org.dkpro.core.io.lxf.LxfReader; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; diff --git a/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderWriterDeltaTest.java b/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderWriterDeltaTest.java index f52e8018e9..f90ea37881 100644 --- a/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderWriterDeltaTest.java +++ b/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderWriterDeltaTest.java @@ -25,14 +25,11 @@ import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; -import org.dkpro.core.io.lxf.LxfReader; -import org.dkpro.core.io.lxf.LxfWriter; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class LxfReaderWriterDeltaTest { @Test diff --git a/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderWriterTest.java b/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderWriterTest.java index 58ec1866ff..4d1b5b1c43 100644 --- a/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderWriterTest.java +++ b/dkpro-core-io-lxf-asl/src/test/java/org/dkpro/core/io/lxf/LxfReaderWriterTest.java @@ -17,13 +17,12 @@ */ package org.dkpro.core.io.lxf; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class LxfReaderWriterTest { @Test diff --git a/dkpro-core-io-lxf-asl/src/test/resources/log4j.properties b/dkpro-core-io-lxf-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-io-lxf-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-io-lxf-asl/src/test/resources/log4j2.xml b/dkpro-core-io-lxf-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-io-lxf-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-negra-asl/pom.xml b/dkpro-core-io-negra-asl/pom.xml index 3fd5a92448..1b83114dd2 100644 --- a/dkpro-core-io-negra-asl/pom.xml +++ b/dkpro-core-io-negra-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.negra-asl + dkpro-core-io-negra-asl jar DKPro Core ASL - IO - NEGRA + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,32 +45,36 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.penntree-asl + org.dkpro.core + dkpro-core-io-penntree-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -77,8 +82,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-io-negra-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/negra/package-info.java b/dkpro-core-io-negra-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/negra/package-info.java deleted file mode 100644 index 6d9fb1f84a..0000000000 --- a/dkpro-core-io-negra-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/negra/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2010-2011 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for the NEGRA - * export format. - * - * @since 1.2.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.negra; diff --git a/dkpro-core-io-negra-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/negra/NegraExportReader.java b/dkpro-core-io-negra-asl/src/main/java/org/dkpro/core/io/negra/NegraExportReader.java similarity index 92% rename from dkpro-core-io-negra-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/negra/NegraExportReader.java rename to dkpro-core-io-negra-asl/src/main/java/org/dkpro/core/io/negra/NegraExportReader.java index 33cc241d7b..fd5de9fb48 100644 --- a/dkpro-core-io-negra-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/negra/NegraExportReader.java +++ b/dkpro-core-io-negra-asl/src/main/java/org/dkpro/core/io/negra/NegraExportReader.java @@ -15,11 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.negra; +package org.dkpro.core.io.negra; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.commons.lang3.StringUtils.startsWith; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -41,6 +43,7 @@ import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.jcas.JCas; @@ -50,28 +53,37 @@ import org.apache.uima.util.Level; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.penntree.PennTreeNode; +import org.dkpro.core.io.penntree.PennTreeUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeNode; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; /** * This CollectionReader reads a file which is formatted in the NEGRA export format. The texts and * add. information like constituent structure is reproduced in CASes, one CAS per text (article) . */ +@Component(value = OperationType.READER) +@ResourceMetaData(name = "NEGRA Export Format Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + NegraExportReader.PARAM_SOURCE_LOCATION }) @MimeTypeCapability({MimeTypes.APPLICATION_X_NEGRA3, MimeTypes.APPLICATION_X_NEGRA4}) @TypeCapability( outputs = { @@ -107,13 +119,12 @@ public static enum DocumentUnit * Character encoding of the input data. */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; /** * Write part-of-speech information. - * - * Default: {@code true} */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") @@ -121,8 +132,6 @@ public static enum DocumentUnit /** * Write lemma information. - * - * Default: {@code true} */ public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") @@ -132,17 +141,24 @@ public static enum DocumentUnit * Write Penn Treebank bracketed structure information. Mind this may not work with all tagsets, * in particular not with such that contain "(" or ")" in their tags. The tree is generated * using the original tag set in the corpus, not using the mapped tagset! - * - * Default: {@code false} */ public static final String PARAM_READ_PENN_TREE = ComponentParameters.PARAM_READ_PENN_TREE; @ConfigurationParameter(name = PARAM_READ_PENN_TREE, mandatory = true, defaultValue = "false") private boolean pennTreeEnabled; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String mappingPosLocation; @@ -156,7 +172,7 @@ public static enum DocumentUnit protected String posTagset; /** - * The collection ID to the written to the document meta data. (Default: none) + * The collection ID to the written to the document meta data. */ public static final String PARAM_COLLECTION_ID = "collectionId"; @ConfigurationParameter(name = PARAM_COLLECTION_ID, mandatory = false) @@ -164,7 +180,7 @@ public static enum DocumentUnit /** * If true, the unit IDs are used only to detect if a new document (CAS) needs to be created, - * but for the purpose of setting the document ID, a new ID is generated. (Default: false) + * but for the purpose of setting the document ID, a new ID is generated. */ public static final String PARAM_GENERATE_NEW_IDS = "generateNewIds"; @ConfigurationParameter(name = PARAM_GENERATE_NEW_IDS, mandatory = true, defaultValue = "false") @@ -173,8 +189,7 @@ public static enum DocumentUnit /** * What indicates if a new CAS should be started. E.g., if set to * {@link DocumentUnit#ORIGIN_NAME ORIGIN_NAME}, a new CAS is generated whenever the origin name - * of the current sentence differs from the origin name of the last sentence. (Default: - * ORIGIN_NAME) + * of the current sentence differs from the origin name of the last sentence. */ public static final String PARAM_DOCUMENT_UNIT = "documentUnit"; @ConfigurationParameter(name = PARAM_DOCUMENT_UNIT, mandatory = true, defaultValue = "ORIGIN_NAME") @@ -254,8 +269,8 @@ public void initialize(UimaContext aContext) throw new ResourceInitializationException(e); } - posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation, - posTagset, language); + posMappingProvider = createPosMappingProvider(this, mappingPosLocation, posTagset, + language); } @Override @@ -546,7 +561,7 @@ private void readSentence(JCas aJCas, JCasBuilder aBuilder, String aSentenceId, constituents.put("0", root); // Initialize dependency relations - Map> relations = new LinkedHashMap>(); + Map> relations = new LinkedHashMap<>(); // handle tokens String line; @@ -559,7 +574,7 @@ private void readSentence(JCas aJCas, JCasBuilder aBuilder, String aSentenceId, Token token = aBuilder.add(parts[TOKEN_TEXT], Token.class); sentEnd = token.getEnd(); aBuilder.add(" "); - aIdMap.put(aSentenceId+":"+id, token); + aIdMap.put(aSentenceId + ":" + id, token); // get/create parent Constituent parent = constituents.get(parts[TOKEN_PARENT_ID]); @@ -583,7 +598,8 @@ private void readSentence(JCas aJCas, JCasBuilder aBuilder, String aSentenceId, Type posTag = posMappingProvider.getTagType(parts[TOKEN_POS_TAG]); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - pos.setPosValue(parts[TOKEN_POS_TAG].intern()); + pos.setPosValue( + parts[TOKEN_POS_TAG] != null ? parts[TOKEN_POS_TAG].intern() : null); POSUtils.assignCoarseValue(pos); pos.addToIndexes(); token.setPos(pos); @@ -648,7 +664,7 @@ private void readSentence(JCas aJCas, JCasBuilder aBuilder, String aSentenceId, // add constituents at the end of the sentence for (Entry e : constituents.entrySet()) { e.getValue().addToIndexes(aJCas); - aIdMap.put(aSentenceId+":"+e.getKey(), e.getValue()); + aIdMap.put(aSentenceId + ":" + e.getKey(), e.getValue()); } } diff --git a/dkpro-core-io-negra-asl/src/main/java/org/dkpro/core/io/negra/package-info.java b/dkpro-core-io-negra-asl/src/main/java/org/dkpro/core/io/negra/package-info.java new file mode 100644 index 0000000000..70d6fe8f0a --- /dev/null +++ b/dkpro-core-io-negra-asl/src/main/java/org/dkpro/core/io/negra/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2010-2011 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for the NEGRA + * export format. + * + * @since 1.2.0 + */ +package org.dkpro.core.io.negra; diff --git a/dkpro-core-io-negra-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/negra/NegraExportReaderTest.java b/dkpro-core-io-negra-asl/src/test/java/org/dkpro/core/io/negra/NegraExportReaderTest.java similarity index 86% rename from dkpro-core-io-negra-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/negra/NegraExportReaderTest.java rename to dkpro-core-io-negra-asl/src/test/java/org/dkpro/core/io/negra/NegraExportReaderTest.java index ad7a54fd25..4e3868e9b7 100644 --- a/dkpro-core-io-negra-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/negra/NegraExportReaderTest.java +++ b/dkpro-core-io-negra-asl/src/test/java/org/dkpro/core/io/negra/NegraExportReaderTest.java @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.negra; +package org.dkpro.core.io.negra; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.*; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import org.dkpro.core.io.negra.NegraExportReader; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - /** * Sample is taken from * http://www.coli.uni-saarland.de/projects/sfb378/negra-corpus @@ -33,10 +33,10 @@ */ public class NegraExportReaderTest { - @Test - public void negraTest() - throws Exception - { + @Test + public void negraTest() + throws Exception + { testOneWay( createReaderDescription(NegraExportReader.class, NegraExportReader.PARAM_LANGUAGE, "de", @@ -44,12 +44,12 @@ public void negraTest() NegraExportReader.PARAM_READ_PENN_TREE, true), "sentence.export.dump", "sentence.export"); - } + } - @Test - public void negraTigerTest() - throws Exception - { + @Test + public void negraTigerTest() + throws Exception + { testOneWay( createReaderDescription(NegraExportReader.class, NegraExportReader.PARAM_LANGUAGE, "de", @@ -57,12 +57,12 @@ public void negraTigerTest() NegraExportReader.PARAM_READ_PENN_TREE, true), "tiger-sample.export.dump", "tiger-sample.export"); - } + } - @Test - public void tuebaTest() - throws Exception - { + @Test + public void tuebaTest() + throws Exception + { testOneWay( createReaderDescription(NegraExportReader.class, NegraExportReader.PARAM_LANGUAGE, "de", @@ -70,7 +70,7 @@ public void tuebaTest() NegraExportReader.PARAM_READ_PENN_TREE, true), "tueba-sample.export.dump", "tueba-sample.export"); - } + } @Test public void testFormat4WithCoref() diff --git a/dkpro-core-io-negra-asl/src/test/resources/format4-with-coref-sample.export.dump b/dkpro-core-io-negra-asl/src/test/resources/format4-with-coref-sample.export.dump index a4ab4582a3..642ed6ef75 100644 --- a/dkpro-core-io-negra-asl/src/test/resources/format4-with-coref-sample.export.dump +++ b/dkpro-core-io-negra-asl/src/test/resources/format4-with-coref-sample.export.dump @@ -22,7 +22,7 @@ PennTree sofa: _InitialView begin: 0 end: 14 - PennTree: "(ROOT (P3 C) (P5 E) (NX--- (P2 B) (NX-- (P1 A))) (NX--- (NX-HD (P4 D)) (NX-- (NX-HD (P6 F)) (NX-- (CARD 20)))))" + PennTree: "(ROOT (P3 C) (P5 E) (NX--- (P2 B) (NX-- (P1 A))) (NX--- (NX-HD (P4 D)) (NX-- (NX..." [A B C D E F 20] ROOT sofa: _InitialView @@ -45,12 +45,11 @@ Constituent children: FSArray syntacticFunction: "--" [A] -POS_X +POS sofa: _InitialView begin: 0 end: 1 PosValue: "P1" - coarseValue: "X" [A] Lemma sofa: _InitialView @@ -87,12 +86,12 @@ Token begin: 0 end: 1 value: "A" - pos: POS_X + pos: POS sofa: _InitialView begin: 0 end: 1 PosValue: "P1" - coarseValue: "X" + order: 0 [A] Constituent sofa: _InitialView @@ -115,12 +114,11 @@ Constituent children: FSArray syntacticFunction: "-" [B] -POS_X +POS sofa: _InitialView begin: 2 end: 3 PosValue: "P2" - coarseValue: "X" [B] Lemma sofa: _InitialView @@ -150,19 +148,18 @@ Token begin: 2 end: 3 value: "B" - pos: POS_X + pos: POS sofa: _InitialView begin: 2 end: 3 PosValue: "P2" - coarseValue: "X" + order: 0 [C] -POS_X +POS sofa: _InitialView begin: 4 end: 5 PosValue: "P3" - coarseValue: "X" [C] Lemma sofa: _InitialView @@ -185,12 +182,12 @@ Token begin: 4 end: 5 value: "C" - pos: POS_X + pos: POS sofa: _InitialView begin: 4 end: 5 PosValue: "P3" - coarseValue: "X" + order: 0 [D E F 20] Constituent sofa: _InitialView @@ -206,12 +203,11 @@ Constituent children: FSArray syntacticFunction: "--" [D] -POS_X +POS sofa: _InitialView begin: 6 end: 7 PosValue: "P4" - coarseValue: "X" [D] Lemma sofa: _InitialView @@ -248,12 +244,12 @@ Token begin: 6 end: 7 value: "D" - pos: POS_X + pos: POS sofa: _InitialView begin: 6 end: 7 PosValue: "P4" - coarseValue: "X" + order: 0 [D] Constituent sofa: _InitialView @@ -276,12 +272,11 @@ Constituent children: FSArray syntacticFunction: "HD" [E] -POS_X +POS sofa: _InitialView begin: 8 end: 9 PosValue: "P5" - coarseValue: "X" [E] Lemma sofa: _InitialView @@ -304,12 +299,12 @@ Token begin: 8 end: 9 value: "E" - pos: POS_X + pos: POS sofa: _InitialView begin: 8 end: 9 PosValue: "P5" - coarseValue: "X" + order: 0 [F 20] Constituent sofa: _InitialView @@ -332,12 +327,11 @@ Constituent children: FSArray syntacticFunction: "-" [F] -POS_X +POS sofa: _InitialView begin: 10 end: 11 PosValue: "P6" - coarseValue: "X" [F] Lemma sofa: _InitialView @@ -381,12 +375,12 @@ Token begin: 10 end: 11 value: "F" - pos: POS_X + pos: POS sofa: _InitialView begin: 10 end: 11 PosValue: "P6" - coarseValue: "X" + order: 0 [F] Constituent sofa: _InitialView @@ -471,6 +465,7 @@ Token end: 14 PosValue: "CARD" coarseValue: "NUM" + order: 0 [20] Constituent sofa: _InitialView @@ -501,6 +496,4 @@ Constituent syntacticFunction: "-" -------- View _InitialView end ---------------------------------- -======== CAS 0 end ================================== - - +======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-negra-asl/src/test/resources/log4j.properties b/dkpro-core-io-negra-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-negra-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-negra-asl/src/test/resources/log4j2.xml b/dkpro-core-io-negra-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-negra-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-negra-asl/src/test/resources/sentence.export.dump b/dkpro-core-io-negra-asl/src/test/resources/sentence.export.dump index 44a9ea4fdb..085d118e99 100644 --- a/dkpro-core-io-negra-asl/src/test/resources/sentence.export.dump +++ b/dkpro-core-io-negra-asl/src/test/resources/sentence.export.dump @@ -22,7 +22,7 @@ PennTree sofa: _InitialView begin: 0 end: 91 - PennTree: "(ROOT ($, ,) ($. .) (CS--- (S-CJ (PPER Sie) (VVFIN gehen) (PTKVZ ein) (CNP-OA (KON und) (NN Risiken) (NP-CJ (ADJA gewagte) (NN Verbindungen)))) (S-CJ (VVFIN versuchen) (VP-OC (VVIZU auszureizen) (NP-OA (PPOSAT ihre) (NN Möglichkeiten))))))" + PennTree: "(ROOT ($, ,) ($. .) (CS--- (S-CJ (PPER Sie) (VVFIN gehen) (PTKVZ ein) (CNP-OA (K..." [Sie gehen gewagte Verbindungen und Risiken ein , versuchen ihre Möglichkeiten auszureizen .] ROOT sofa: _InitialView @@ -103,6 +103,7 @@ Token end: 3 PosValue: "PPER" coarseValue: "PRON" + order: 0 [gehen] POS_VERB sofa: _InitialView @@ -141,6 +142,7 @@ Token end: 9 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [gewagte Verbindungen und Risiken] Constituent sofa: _InitialView @@ -256,6 +258,7 @@ Token end: 17 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Verbindungen] POS_NOUN sofa: _InitialView @@ -308,6 +311,7 @@ Token end: 30 PosValue: "NN" coarseValue: "NOUN" + order: 0 [und] POS_CONJ sofa: _InitialView @@ -353,6 +357,7 @@ Token end: 34 PosValue: "KON" coarseValue: "CONJ" + order: 0 [Risiken] POS_NOUN sofa: _InitialView @@ -398,6 +403,7 @@ Token end: 42 PosValue: "NN" coarseValue: "NOUN" + order: 0 [ein] POS_VERB sofa: _InitialView @@ -436,6 +442,7 @@ Token end: 46 PosValue: "PTKVZ" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -460,6 +467,7 @@ Token end: 48 PosValue: "$," coarseValue: "PUNCT" + order: 0 [versuchen ihre Möglichkeiten auszureizen] Constituent sofa: _InitialView @@ -519,6 +527,7 @@ Token end: 58 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [ihre Möglichkeiten auszureizen] Constituent sofa: _InitialView @@ -634,6 +643,7 @@ Token end: 63 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [Möglichkeiten] POS_NOUN sofa: _InitialView @@ -686,6 +696,7 @@ Token end: 77 PosValue: "NN" coarseValue: "NOUN" + order: 0 [auszureizen] POS_VERB sofa: _InitialView @@ -731,6 +742,7 @@ Token end: 89 PosValue: "VVIZU" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -755,6 +767,7 @@ Token end: 91 PosValue: "$." coarseValue: "PUNCT" + order: 0 -------- View _InitialView end ---------------------------------- ======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-negra-asl/src/test/resources/tiger-sample.export.dump b/dkpro-core-io-negra-asl/src/test/resources/tiger-sample.export.dump index a8c6234a33..f41465217e 100644 --- a/dkpro-core-io-negra-asl/src/test/resources/tiger-sample.export.dump +++ b/dkpro-core-io-negra-asl/src/test/resources/tiger-sample.export.dump @@ -22,7 +22,7 @@ PennTree sofa: _InitialView begin: 0 end: 56 - PennTree: "(ROOT ($( ``) ($( '') (S--- (VAFIN wäre) (ADV vielleicht) (PN-SB (NE Ross) (NE Perot)) (NP-PD (ART ein) (ADJA prächtiger) (NN Diktator))))" + PennTree: "(ROOT ($( ``) ($( '') (S--- (VAFIN wäre) (ADV vielleicht) (PN-SB (NE Ross) (NE P..." [`` Ross Perot wäre vielleicht ein prächtiger Diktator ''] ROOT sofa: _InitialView @@ -65,6 +65,7 @@ Token end: 2 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [Ross Perot wäre vielleicht ein prächtiger Diktator] Constituent sofa: _InitialView @@ -149,6 +150,7 @@ Token end: 7 PosValue: "NE" coarseValue: "PROPN" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -198,6 +200,7 @@ Token end: 13 PosValue: "NE" coarseValue: "PROPN" + order: 0 [wäre] POS_VERB sofa: _InitialView @@ -240,6 +243,7 @@ Token end: 18 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [vielleicht] POS_ADV sofa: _InitialView @@ -282,6 +286,7 @@ Token end: 29 PosValue: "ADV" coarseValue: "ADV" + order: 0 [ein prächtiger Diktator] Constituent sofa: _InitialView @@ -352,6 +357,7 @@ Token end: 33 PosValue: "ART" coarseValue: "DET" + order: 0 [prächtiger] POS_ADJ sofa: _InitialView @@ -401,6 +407,7 @@ Token end: 44 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Diktator] POS_NOUN sofa: _InitialView @@ -450,6 +457,7 @@ Token end: 53 PosValue: "NN" coarseValue: "NOUN" + order: 0 [''] POS_PUNCT sofa: _InitialView @@ -485,6 +493,7 @@ Token end: 56 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [Konzernchefs lehnen den Milliardär als US-Präsidenten ab /] Sentence sofa: _InitialView @@ -495,7 +504,7 @@ PennTree sofa: _InitialView begin: 57 end: 115 - PennTree: "(ROOT ($( /) (S--- (NN Konzernchefs) (VVFIN lehnen) (PTKVZ ab) (NP-OA (ART den) (NN Milliardär)) (PP-MO (APPR als) (NN US-Präsidenten))))" + PennTree: "(ROOT ($( /) (S--- (NN Konzernchefs) (VVFIN lehnen) (PTKVZ ab) (NP-OA (ART den) ..." [Konzernchefs lehnen den Milliardär als US-Präsidenten ab /] ROOT sofa: _InitialView @@ -559,6 +568,7 @@ Token end: 69 PosValue: "NN" coarseValue: "NOUN" + order: 0 [lehnen] POS_VERB sofa: _InitialView @@ -601,6 +611,7 @@ Token end: 76 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [den Milliardär] Constituent sofa: _InitialView @@ -671,6 +682,7 @@ Token end: 80 PosValue: "ART" coarseValue: "DET" + order: 0 [Milliardär] POS_NOUN sofa: _InitialView @@ -720,6 +732,7 @@ Token end: 91 PosValue: "NN" coarseValue: "NOUN" + order: 0 [als US-Präsidenten] Constituent sofa: _InitialView @@ -790,6 +803,7 @@ Token end: 95 PosValue: "APPR" coarseValue: "ADP" + order: 0 [US-Präsidenten] POS_NOUN sofa: _InitialView @@ -839,6 +853,7 @@ Token end: 110 PosValue: "NN" coarseValue: "NOUN" + order: 0 [ab] POS_VERB sofa: _InitialView @@ -881,6 +896,7 @@ Token end: 113 PosValue: "PTKVZ" coarseValue: "VERB" + order: 0 [/] POS_PUNCT sofa: _InitialView @@ -916,6 +932,7 @@ Token end: 115 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [Texaner gibt nur vage Auskunft über seine Wirtschaftspolitik] Sentence sofa: _InitialView @@ -926,7 +943,7 @@ PennTree sofa: _InitialView begin: 116 end: 176 - PennTree: "(ROOT (S--- (NN Texaner) (VVFIN gibt) (NP-OA (NN Auskunft) (AP-NK (ADV nur) (ADJA vage)) (PP-MNR (APPR über) (PPOSAT seine) (NN Wirtschaftspolitik)))))" + PennTree: "(ROOT (S--- (NN Texaner) (VVFIN gibt) (NP-OA (NN Auskunft) (AP-NK (ADV nur) (ADJ..." [Texaner gibt nur vage Auskunft über seine Wirtschaftspolitik] Constituent sofa: _InitialView @@ -990,6 +1007,7 @@ Token end: 123 PosValue: "NN" coarseValue: "NOUN" + order: 0 [gibt] POS_VERB sofa: _InitialView @@ -1032,6 +1050,7 @@ Token end: 128 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [nur vage Auskunft über seine Wirtschaftspolitik] Constituent sofa: _InitialView @@ -1137,6 +1156,7 @@ Token end: 132 PosValue: "ADV" coarseValue: "ADV" + order: 0 [vage] POS_ADJ sofa: _InitialView @@ -1193,6 +1213,7 @@ Token end: 137 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Auskunft] POS_NOUN sofa: _InitialView @@ -1242,6 +1263,7 @@ Token end: 146 PosValue: "NN" coarseValue: "NOUN" + order: 0 [über seine Wirtschaftspolitik] Constituent sofa: _InitialView @@ -1326,6 +1348,7 @@ Token end: 151 PosValue: "APPR" coarseValue: "ADP" + order: 0 [seine] POS_PRON sofa: _InitialView @@ -1382,6 +1405,7 @@ Token end: 157 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [Wirtschaftspolitik] POS_NOUN sofa: _InitialView @@ -1438,6 +1462,7 @@ Token end: 176 PosValue: "NN" coarseValue: "NOUN" + order: 0 [Der texanische Milliardär Ross Perot hat das politische Establishment in Washington aufgeschreckt .] Sentence sofa: _InitialView @@ -1448,7 +1473,7 @@ PennTree sofa: _InitialView begin: 177 end: 276 - PennTree: "(ROOT ($. .) (S--- (VAFIN hat) (NP-SB (ART Der) (ADJA texanische) (NN Milliardär) (PN-NK (NE Ross) (NE Perot))) (VP-OC (VVPP aufgeschreckt) (NP-OA (ART das) (ADJA politische) (NN Establishment) (PP-MNR (APPR in) (NE Washington))))))" + PennTree: "(ROOT ($. .) (S--- (VAFIN hat) (NP-SB (ART Der) (ADJA texanische) (NN Milliardär..." [Der texanische Milliardär Ross Perot hat das politische Establishment in Washington aufgeschreckt .] ROOT sofa: _InitialView @@ -1540,6 +1565,7 @@ Token end: 180 PosValue: "ART" coarseValue: "DET" + order: 0 [texanische] POS_ADJ sofa: _InitialView @@ -1589,6 +1615,7 @@ Token end: 191 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Milliardär] POS_NOUN sofa: _InitialView @@ -1638,6 +1665,7 @@ Token end: 202 PosValue: "NN" coarseValue: "NOUN" + order: 0 [Ross Perot] Constituent sofa: _InitialView @@ -1722,6 +1750,7 @@ Token end: 207 PosValue: "NE" coarseValue: "PROPN" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -1778,6 +1807,7 @@ Token end: 213 PosValue: "NE" coarseValue: "PROPN" + order: 0 [hat] POS_VERB sofa: _InitialView @@ -1820,6 +1850,7 @@ Token end: 217 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [das politische Establishment in Washington aufgeschreckt] Constituent sofa: _InitialView @@ -1925,6 +1956,7 @@ Token end: 221 PosValue: "ART" coarseValue: "DET" + order: 0 [politische] POS_ADJ sofa: _InitialView @@ -1981,6 +2013,7 @@ Token end: 232 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Establishment] POS_NOUN sofa: _InitialView @@ -2037,6 +2070,7 @@ Token end: 246 PosValue: "NN" coarseValue: "NOUN" + order: 0 [in Washington] Constituent sofa: _InitialView @@ -2135,6 +2169,7 @@ Token end: 249 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Washington] POS_PROPN sofa: _InitialView @@ -2198,6 +2233,7 @@ Token end: 260 PosValue: "NE" coarseValue: "PROPN" + order: 0 [aufgeschreckt] POS_VERB sofa: _InitialView @@ -2247,6 +2283,7 @@ Token end: 274 PosValue: "VVPP" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -2282,6 +2319,7 @@ Token end: 276 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Nach Meinungsumfragen liegt der parteilose Self-mademan gut im Rennen um den Chefsessel im Weißen Haus mit dem amtierenden Präsidenten George Bush und dem Demokraten Bill Clinton .] Sentence sofa: _InitialView @@ -2292,7 +2330,7 @@ PennTree sofa: _InitialView begin: 277 end: 457 - PennTree: "(ROOT ($. .) (S--- (VVFIN liegt) (ADJD gut) (PP-MO (APPR Nach) (NN Meinungsumfragen)) (NP-SB (ART der) (ADJA parteilose) (FM Self-mademan)) (PP-MO (APPRART im) (NN Rennen) (PP-OP (APPR um) (ART den) (NN Chefsessel) (PP-MNR (APPRART im) (PN-NK (NP-PNC (ADJA Weißen) (NN Haus))))) (PP-MNR (APPR mit) (CNP-NK (KON und) (NP-CJ (ART dem) (ADJA amtierenden) (NN Präsidenten) (PN-NK (NE George) (NE Bush))) (NP-CJ (ART dem) (NN Demokraten) (PN-NK (NE Bill) (NE Clinton))))))))" + PennTree: "(ROOT ($. .) (S--- (VVFIN liegt) (ADJD gut) (PP-MO (APPR Nach) (NN Meinungsumfra..." [Nach Meinungsumfragen liegt der parteilose Self-mademan gut im Rennen um den Chefsessel im Weißen Haus mit dem amtierenden Präsidenten George Bush und dem Demokraten Bill Clinton .] ROOT sofa: _InitialView @@ -2384,6 +2422,7 @@ Token end: 281 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Meinungsumfragen] POS_NOUN sofa: _InitialView @@ -2433,6 +2472,7 @@ Token end: 298 PosValue: "NN" coarseValue: "NOUN" + order: 0 [liegt] POS_VERB sofa: _InitialView @@ -2475,6 +2515,7 @@ Token end: 304 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [der parteilose Self-mademan] Constituent sofa: _InitialView @@ -2545,6 +2586,7 @@ Token end: 308 PosValue: "ART" coarseValue: "DET" + order: 0 [parteilose] POS_ADJ sofa: _InitialView @@ -2594,6 +2636,7 @@ Token end: 319 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Self-mademan] POS_X sofa: _InitialView @@ -2643,6 +2686,7 @@ Token end: 332 PosValue: "FM" coarseValue: "X" + order: 0 [gut] POS_ADJ sofa: _InitialView @@ -2685,6 +2729,7 @@ Token end: 336 PosValue: "ADJD" coarseValue: "ADJ" + order: 0 [im Rennen um den Chefsessel im Weißen Haus mit dem amtierenden Präsidenten George Bush und dem Demokraten Bill Clinton] Constituent sofa: _InitialView @@ -2755,6 +2800,7 @@ Token end: 339 PosValue: "APPRART" coarseValue: "ADP" + order: 0 [Rennen] POS_NOUN sofa: _InitialView @@ -2804,6 +2850,7 @@ Token end: 346 PosValue: "NN" coarseValue: "NOUN" + order: 0 [um den Chefsessel im Weißen Haus] Constituent sofa: _InitialView @@ -2888,6 +2935,7 @@ Token end: 349 PosValue: "APPR" coarseValue: "ADP" + order: 0 [den] POS_DET sofa: _InitialView @@ -2944,6 +2992,7 @@ Token end: 353 PosValue: "ART" coarseValue: "DET" + order: 0 [Chefsessel] POS_NOUN sofa: _InitialView @@ -3000,6 +3049,7 @@ Token end: 364 PosValue: "NN" coarseValue: "NOUN" + order: 0 [im Weißen Haus] Constituent sofa: _InitialView @@ -3098,6 +3148,7 @@ Token end: 367 PosValue: "APPRART" coarseValue: "ADP" + order: 0 [Weißen Haus] Constituent sofa: _InitialView @@ -3266,6 +3317,7 @@ Token end: 374 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Haus] POS_NOUN sofa: _InitialView @@ -3343,6 +3395,7 @@ Token end: 379 PosValue: "NN" coarseValue: "NOUN" + order: 0 [mit dem amtierenden Präsidenten George Bush und dem Demokraten Bill Clinton] Constituent sofa: _InitialView @@ -3427,6 +3480,7 @@ Token end: 383 PosValue: "APPR" coarseValue: "ADP" + order: 0 [dem amtierenden Präsidenten George Bush und dem Demokraten Bill Clinton] Constituent sofa: _InitialView @@ -3574,6 +3628,7 @@ Token end: 387 PosValue: "ART" coarseValue: "DET" + order: 0 [amtierenden] POS_ADJ sofa: _InitialView @@ -3644,6 +3699,7 @@ Token end: 399 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Präsidenten] POS_NOUN sofa: _InitialView @@ -3714,6 +3770,7 @@ Token end: 411 PosValue: "NN" coarseValue: "NOUN" + order: 0 [George Bush] Constituent sofa: _InitialView @@ -3840,6 +3897,7 @@ Token end: 418 PosValue: "NE" coarseValue: "PROPN" + order: 0 [Bush] POS_PROPN sofa: _InitialView @@ -3917,6 +3975,7 @@ Token end: 423 PosValue: "NE" coarseValue: "PROPN" + order: 0 [und] POS_CONJ sofa: _InitialView @@ -3980,6 +4039,7 @@ Token end: 427 PosValue: "KON" coarseValue: "CONJ" + order: 0 [dem Demokraten Bill Clinton] Constituent sofa: _InitialView @@ -4092,6 +4152,7 @@ Token end: 431 PosValue: "ART" coarseValue: "DET" + order: 0 [Demokraten] POS_NOUN sofa: _InitialView @@ -4162,6 +4223,7 @@ Token end: 442 PosValue: "NN" coarseValue: "NOUN" + order: 0 [Bill Clinton] Constituent sofa: _InitialView @@ -4288,6 +4350,7 @@ Token end: 447 PosValue: "NE" coarseValue: "PROPN" + order: 0 [Clinton] POS_PROPN sofa: _InitialView @@ -4365,6 +4428,7 @@ Token end: 455 PosValue: "NE" coarseValue: "PROPN" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -4400,6 +4464,7 @@ Token end: 457 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Daß Perot ein Unternehmen erfolgreich leiten kann , davon sind selbst seine Kritiker überzeugt .] Sentence sofa: _InitialView @@ -4410,7 +4475,7 @@ PennTree sofa: _InitialView begin: 458 end: 554 - PennTree: "(ROOT ($, ,) ($. .) (S--- (VAFIN sind) (NP-SB (ADV selbst) (PPOSAT seine) (NN Kritiker)) (VP-PD (VVPP überzeugt) (PP-OP (PROAV davon) (S-RE (KOUS Daß) (NE Perot) (VMFIN kann) (VP-OC (ADJD erfolgreich) (VVINF leiten) (NP-OA (ART ein) (NN Unternehmen))))))))" + PennTree: "(ROOT ($, ,) ($. .) (S--- (VAFIN sind) (NP-SB (ADV selbst) (PPOSAT seine) (NN Kr..." [Daß Perot ein Unternehmen erfolgreich leiten kann , davon sind selbst seine Kritiker überzeugt .] ROOT sofa: _InitialView @@ -4579,6 +4644,7 @@ Token end: 461 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -4642,6 +4708,7 @@ Token end: 467 PosValue: "NE" coarseValue: "PROPN" + order: 0 [ein Unternehmen erfolgreich leiten] Constituent sofa: _InitialView @@ -4810,6 +4877,7 @@ Token end: 471 PosValue: "ART" coarseValue: "DET" + order: 0 [Unternehmen] POS_NOUN sofa: _InitialView @@ -4887,6 +4955,7 @@ Token end: 483 PosValue: "NN" coarseValue: "NOUN" + order: 0 [erfolgreich] POS_ADJ sofa: _InitialView @@ -4957,6 +5026,7 @@ Token end: 495 PosValue: "ADJD" coarseValue: "ADJ" + order: 0 [leiten] POS_VERB sofa: _InitialView @@ -5027,6 +5097,7 @@ Token end: 502 PosValue: "VVINF" coarseValue: "VERB" + order: 0 [kann] POS_VERB sofa: _InitialView @@ -5090,6 +5161,7 @@ Token end: 507 PosValue: "VMFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -5125,6 +5197,7 @@ Token end: 509 PosValue: "$," coarseValue: "PUNCT" + order: 0 [davon] POS_ADV sofa: _InitialView @@ -5181,6 +5254,7 @@ Token end: 515 PosValue: "PROAV" coarseValue: "ADV" + order: 0 [sind] POS_VERB sofa: _InitialView @@ -5223,6 +5297,7 @@ Token end: 520 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [selbst seine Kritiker] Constituent sofa: _InitialView @@ -5293,6 +5368,7 @@ Token end: 527 PosValue: "ADV" coarseValue: "ADV" + order: 0 [seine] POS_PRON sofa: _InitialView @@ -5342,6 +5418,7 @@ Token end: 533 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [Kritiker] POS_NOUN sofa: _InitialView @@ -5391,6 +5468,7 @@ Token end: 542 PosValue: "NN" coarseValue: "NOUN" + order: 0 [überzeugt] POS_VERB sofa: _InitialView @@ -5440,6 +5518,7 @@ Token end: 552 PosValue: "VVPP" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -5475,6 +5554,7 @@ Token end: 554 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Ob diese Fähigkeiten aber ausreichen , um die größte Volkswirtschaft der Welt aus ihrer Krise zu führen , bezweifeln viele Ökonomen .] Sentence sofa: _InitialView @@ -5485,7 +5565,7 @@ PennTree sofa: _InitialView begin: 555 end: 688 - PennTree: "(ROOT ($, ,) ($, ,) ($. .) (S--- (VVFIN bezweifeln) (NP-SB (PIAT viele) (NN Ökonomen)) (S-OC (KOUS Ob) (ADV aber) (VVFIN ausreichen) (NP-SB (PDAT diese) (NN Fähigkeiten)) (VP-MO (KOUI um) (PP-MO (APPR aus) (PPOSAT ihrer) (NN Krise)) (VZ-HD (PTKZU zu) (VVINF führen)) (NP-OA (ART die) (ADJA größte) (NN Volkswirtschaft) (NP-AG (ART der) (NN Welt)))))))" + PennTree: "(ROOT ($, ,) ($, ,) ($. .) (S--- (VVFIN bezweifeln) (NP-SB (PIAT viele) (NN Ökon..." [Ob diese Fähigkeiten aber ausreichen , um die größte Volkswirtschaft der Welt aus ihrer Krise zu führen , bezweifeln viele Ökonomen .] ROOT sofa: _InitialView @@ -5577,6 +5657,7 @@ Token end: 557 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [diese Fähigkeiten] Constituent sofa: _InitialView @@ -5661,6 +5742,7 @@ Token end: 563 PosValue: "PDAT" coarseValue: "PRON" + order: 0 [Fähigkeiten] POS_NOUN sofa: _InitialView @@ -5717,6 +5799,7 @@ Token end: 575 PosValue: "NN" coarseValue: "NOUN" + order: 0 [aber] POS_ADV sofa: _InitialView @@ -5766,6 +5849,7 @@ Token end: 580 PosValue: "ADV" coarseValue: "ADV" + order: 0 [ausreichen] POS_VERB sofa: _InitialView @@ -5815,6 +5899,7 @@ Token end: 591 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -5850,6 +5935,7 @@ Token end: 593 PosValue: "$," coarseValue: "PUNCT" + order: 0 [um die größte Volkswirtschaft der Welt aus ihrer Krise zu führen] Constituent sofa: _InitialView @@ -5934,6 +6020,7 @@ Token end: 596 PosValue: "KOUI" coarseValue: "CONJ" + order: 0 [die größte Volkswirtschaft der Welt] Constituent sofa: _InitialView @@ -6032,6 +6119,7 @@ Token end: 600 PosValue: "ART" coarseValue: "DET" + order: 0 [größte] POS_ADJ sofa: _InitialView @@ -6095,6 +6183,7 @@ Token end: 607 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Volkswirtschaft] POS_NOUN sofa: _InitialView @@ -6158,6 +6247,7 @@ Token end: 623 PosValue: "NN" coarseValue: "NOUN" + order: 0 [der Welt] Constituent sofa: _InitialView @@ -6270,6 +6360,7 @@ Token end: 627 PosValue: "ART" coarseValue: "DET" + order: 0 [Welt] POS_NOUN sofa: _InitialView @@ -6340,6 +6431,7 @@ Token end: 632 PosValue: "NN" coarseValue: "NOUN" + order: 0 [aus ihrer Krise] Constituent sofa: _InitialView @@ -6438,6 +6530,7 @@ Token end: 636 PosValue: "APPR" coarseValue: "ADP" + order: 0 [ihrer] POS_PRON sofa: _InitialView @@ -6501,6 +6594,7 @@ Token end: 642 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [Krise] POS_NOUN sofa: _InitialView @@ -6564,6 +6658,7 @@ Token end: 648 PosValue: "NN" coarseValue: "NOUN" + order: 0 [zu führen] Constituent sofa: _InitialView @@ -6662,6 +6757,7 @@ Token end: 651 PosValue: "PTKZU" coarseValue: "X" + order: 0 [führen] POS_VERB sofa: _InitialView @@ -6725,6 +6821,7 @@ Token end: 658 PosValue: "VVINF" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -6760,6 +6857,7 @@ Token end: 660 PosValue: "$," coarseValue: "PUNCT" + order: 0 [bezweifeln] POS_VERB sofa: _InitialView @@ -6802,6 +6900,7 @@ Token end: 671 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [viele Ökonomen] Constituent sofa: _InitialView @@ -6872,6 +6971,7 @@ Token end: 677 PosValue: "PIAT" coarseValue: "PRON" + order: 0 [Ökonomen] POS_NOUN sofa: _InitialView @@ -6921,6 +7021,7 @@ Token end: 686 PosValue: "NN" coarseValue: "NOUN" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -6956,6 +7057,7 @@ Token end: 688 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Und auch die Konzernchefs in den USA halten nicht viel von dem 62jährigen .] Sentence sofa: _InitialView @@ -6966,7 +7068,7 @@ PennTree sofa: _InitialView begin: 689 end: 764 - PennTree: "(ROOT ($. .) (S--- (KON Und) (VVFIN halten) (NP-OA (PTKNEG nicht) (PIS viel)) (PP-OP (APPR von) (ART dem) (NN 62jährigen)) (NP-SB (ADV auch) (ART die) (NN Konzernchefs) (PP-MNR (APPR in) (ART den) (NE USA)))))" + PennTree: "(ROOT ($. .) (S--- (KON Und) (VVFIN halten) (NP-OA (PTKNEG nicht) (PIS viel)) (P..." [Und auch die Konzernchefs in den USA halten nicht viel von dem 62jährigen .] ROOT sofa: _InitialView @@ -7030,6 +7132,7 @@ Token end: 692 PosValue: "KON" coarseValue: "CONJ" + order: 0 [auch die Konzernchefs in den USA] Constituent sofa: _InitialView @@ -7100,6 +7203,7 @@ Token end: 697 PosValue: "ADV" coarseValue: "ADV" + order: 0 [die] POS_DET sofa: _InitialView @@ -7149,6 +7253,7 @@ Token end: 701 PosValue: "ART" coarseValue: "DET" + order: 0 [Konzernchefs] POS_NOUN sofa: _InitialView @@ -7198,6 +7303,7 @@ Token end: 714 PosValue: "NN" coarseValue: "NOUN" + order: 0 [in den USA] Constituent sofa: _InitialView @@ -7282,6 +7388,7 @@ Token end: 717 PosValue: "APPR" coarseValue: "ADP" + order: 0 [den] POS_DET sofa: _InitialView @@ -7338,6 +7445,7 @@ Token end: 721 PosValue: "ART" coarseValue: "DET" + order: 0 [USA] POS_PROPN sofa: _InitialView @@ -7394,6 +7502,7 @@ Token end: 725 PosValue: "NE" coarseValue: "PROPN" + order: 0 [halten] POS_VERB sofa: _InitialView @@ -7436,6 +7545,7 @@ Token end: 732 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [nicht viel] Constituent sofa: _InitialView @@ -7506,6 +7616,7 @@ Token end: 738 PosValue: "PTKNEG" coarseValue: "X" + order: 0 [viel] POS_PRON sofa: _InitialView @@ -7555,6 +7666,7 @@ Token end: 743 PosValue: "PIS" coarseValue: "PRON" + order: 0 [von dem 62jährigen] Constituent sofa: _InitialView @@ -7625,6 +7737,7 @@ Token end: 747 PosValue: "APPR" coarseValue: "ADP" + order: 0 [dem] POS_DET sofa: _InitialView @@ -7674,6 +7787,7 @@ Token end: 751 PosValue: "ART" coarseValue: "DET" + order: 0 [62jährigen] POS_NOUN sofa: _InitialView @@ -7723,6 +7837,7 @@ Token end: 762 PosValue: "NN" coarseValue: "NOUN" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -7758,6 +7873,7 @@ Token end: 764 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Zwar können sich die meisten Topmanager durchaus einen Unternehmer als Präsidenten vorstellen - nur nicht ausgerechnet Perot .] Sentence sofa: _InitialView @@ -7768,7 +7884,7 @@ PennTree sofa: _InitialView begin: 765 end: 891 - PennTree: "(ROOT ($( -) ($. .) (CS--- (S-CJ (ADV Zwar) (VMFIN können) (ADV durchaus) (NP-SB (ART die) (PIAT meisten) (NN Topmanager)) (VP-OC (PRF sich) (VVINF vorstellen) (NP-OA (ART einen) (NN Unternehmer)) (PP-MO (APPR als) (NN Präsidenten)))) (S-CJ (ADV nur) (PTKNEG nicht) (VP-OC (NP-OA (ADV ausgerechnet) (NE Perot))))))" + PennTree: "(ROOT ($( -) ($. .) (CS--- (S-CJ (ADV Zwar) (VMFIN können) (ADV durchaus) (NP-SB..." [Zwar können sich die meisten Topmanager durchaus einen Unternehmer als Präsidenten vorstellen - nur nicht ausgerechnet Perot .] ROOT sofa: _InitialView @@ -7860,6 +7976,7 @@ Token end: 769 PosValue: "ADV" coarseValue: "ADV" + order: 0 [können] POS_VERB sofa: _InitialView @@ -7909,6 +8026,7 @@ Token end: 776 PosValue: "VMFIN" coarseValue: "VERB" + order: 0 [sich die meisten Topmanager durchaus einen Unternehmer als Präsidenten vorstellen] Constituent sofa: _InitialView @@ -7993,6 +8111,7 @@ Token end: 781 PosValue: "PRF" coarseValue: "PRON" + order: 0 [die meisten Topmanager] Constituent sofa: _InitialView @@ -8077,6 +8196,7 @@ Token end: 785 PosValue: "ART" coarseValue: "DET" + order: 0 [meisten] POS_PRON sofa: _InitialView @@ -8133,6 +8253,7 @@ Token end: 793 PosValue: "PIAT" coarseValue: "PRON" + order: 0 [Topmanager] POS_NOUN sofa: _InitialView @@ -8189,6 +8310,7 @@ Token end: 804 PosValue: "NN" coarseValue: "NOUN" + order: 0 [durchaus] POS_ADV sofa: _InitialView @@ -8238,6 +8360,7 @@ Token end: 813 PosValue: "ADV" coarseValue: "ADV" + order: 0 [einen Unternehmer] Constituent sofa: _InitialView @@ -8336,6 +8459,7 @@ Token end: 819 PosValue: "ART" coarseValue: "DET" + order: 0 [Unternehmer] POS_NOUN sofa: _InitialView @@ -8399,6 +8523,7 @@ Token end: 831 PosValue: "NN" coarseValue: "NOUN" + order: 0 [als Präsidenten] Constituent sofa: _InitialView @@ -8497,6 +8622,7 @@ Token end: 835 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Präsidenten] POS_NOUN sofa: _InitialView @@ -8560,6 +8686,7 @@ Token end: 847 PosValue: "NN" coarseValue: "NOUN" + order: 0 [vorstellen] POS_VERB sofa: _InitialView @@ -8616,6 +8743,7 @@ Token end: 858 PosValue: "VVINF" coarseValue: "VERB" + order: 0 [-] POS_PUNCT sofa: _InitialView @@ -8651,6 +8779,7 @@ Token end: 860 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [nur nicht ausgerechnet Perot] Constituent sofa: _InitialView @@ -8721,6 +8850,7 @@ Token end: 864 PosValue: "ADV" coarseValue: "ADV" + order: 0 [nicht] POS_X sofa: _InitialView @@ -8770,6 +8900,7 @@ Token end: 870 PosValue: "PTKNEG" coarseValue: "X" + order: 0 [ausgerechnet Perot] Constituent sofa: _InitialView @@ -8896,6 +9027,7 @@ Token end: 883 PosValue: "ADV" coarseValue: "ADV" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -8959,6 +9091,7 @@ Token end: 889 PosValue: "NE" coarseValue: "PROPN" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -8994,6 +9127,7 @@ Token end: 891 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Nach einer Umfrage des Wirtschaftsmagazins Fortune unter den Bossen von 500 Großunternehmen wünschten im Mai nur elf Prozent `` Ross for President '' , während 78 Prozent sich für Bush und vier Prozent für Clinton aussprachen .] Sentence sofa: _InitialView @@ -9004,7 +9138,7 @@ PennTree sofa: _InitialView begin: 892 end: 1119 - PennTree: "(ROOT ($( ``) ($( '') ($, ,) ($. .) (S--- (VVFIN wünschten) (NE Ross) (PP-MO (APPRART im) (NN Mai)) (CH-MO (FM for) (FM President)) (NP-SB (NN Prozent) (AP-NK (ADV nur) (CARD elf))) (PP-MO (APPR Nach) (ART einer) (NN Umfrage) (NP-AG (ART des) (NN Wirtschaftsmagazins) (NE Fortune)) (PP-MNR (APPR unter) (ART den) (NN Bossen) (PP-PG (APPR von) (CARD 500) (NN Großunternehmen)))) (CS-MO (KON und) (S-CJ (KOUS während) (PRF sich) (NP-SB (CARD 78) (NN Prozent)) (PP-MO (APPR für) (NE Bush))) (S-CJ (VVFIN aussprachen) (NP-SB (CARD vier) (NN Prozent)) (PP-MO (APPR für) (NE Clinton))))))" + PennTree: "(ROOT ($( ``) ($( '') ($, ,) ($. .) (S--- (VVFIN wünschten) (NE Ross) (PP-MO (AP..." [Nach einer Umfrage des Wirtschaftsmagazins Fortune unter den Bossen von 500 Großunternehmen wünschten im Mai nur elf Prozent `` Ross for President '' , während 78 Prozent sich für Bush und vier Prozent für Clinton aussprachen .] ROOT sofa: _InitialView @@ -9096,6 +9230,7 @@ Token end: 896 PosValue: "APPR" coarseValue: "ADP" + order: 0 [einer] POS_DET sofa: _InitialView @@ -9145,6 +9280,7 @@ Token end: 902 PosValue: "ART" coarseValue: "DET" + order: 0 [Umfrage] POS_NOUN sofa: _InitialView @@ -9194,6 +9330,7 @@ Token end: 910 PosValue: "NN" coarseValue: "NOUN" + order: 0 [des Wirtschaftsmagazins Fortune] Constituent sofa: _InitialView @@ -9278,6 +9415,7 @@ Token end: 914 PosValue: "ART" coarseValue: "DET" + order: 0 [Wirtschaftsmagazins] POS_NOUN sofa: _InitialView @@ -9334,6 +9472,7 @@ Token end: 934 PosValue: "NN" coarseValue: "NOUN" + order: 0 [Fortune] POS_PROPN sofa: _InitialView @@ -9390,6 +9529,7 @@ Token end: 942 PosValue: "NE" coarseValue: "PROPN" + order: 0 [unter den Bossen von 500 Großunternehmen] Constituent sofa: _InitialView @@ -9474,6 +9614,7 @@ Token end: 948 PosValue: "APPR" coarseValue: "ADP" + order: 0 [den] POS_DET sofa: _InitialView @@ -9530,6 +9671,7 @@ Token end: 952 PosValue: "ART" coarseValue: "DET" + order: 0 [Bossen] POS_NOUN sofa: _InitialView @@ -9586,6 +9728,7 @@ Token end: 959 PosValue: "NN" coarseValue: "NOUN" + order: 0 [von 500 Großunternehmen] Constituent sofa: _InitialView @@ -9684,6 +9827,7 @@ Token end: 963 PosValue: "APPR" coarseValue: "ADP" + order: 0 [500] POS_NUM sofa: _InitialView @@ -9747,6 +9891,7 @@ Token end: 967 PosValue: "CARD" coarseValue: "NUM" + order: 0 [Großunternehmen] POS_NOUN sofa: _InitialView @@ -9810,6 +9955,7 @@ Token end: 983 PosValue: "NN" coarseValue: "NOUN" + order: 0 [wünschten] POS_VERB sofa: _InitialView @@ -9852,6 +9998,7 @@ Token end: 993 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [im Mai] Constituent sofa: _InitialView @@ -9922,6 +10069,7 @@ Token end: 996 PosValue: "APPRART" coarseValue: "ADP" + order: 0 [Mai] POS_NOUN sofa: _InitialView @@ -9971,6 +10119,7 @@ Token end: 1000 PosValue: "NN" coarseValue: "NOUN" + order: 0 [nur elf Prozent] Constituent sofa: _InitialView @@ -10076,6 +10225,7 @@ Token end: 1004 PosValue: "ADV" coarseValue: "ADV" + order: 0 [elf] POS_NUM sofa: _InitialView @@ -10132,6 +10282,7 @@ Token end: 1008 PosValue: "CARD" coarseValue: "NUM" + order: 0 [Prozent] POS_NOUN sofa: _InitialView @@ -10181,6 +10332,7 @@ Token end: 1016 PosValue: "NN" coarseValue: "NOUN" + order: 0 [``] POS_PUNCT sofa: _InitialView @@ -10216,6 +10368,7 @@ Token end: 1019 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [Ross] POS_PROPN sofa: _InitialView @@ -10258,6 +10411,7 @@ Token end: 1024 PosValue: "NE" coarseValue: "PROPN" + order: 0 [for President] Constituent sofa: _InitialView @@ -10328,6 +10482,7 @@ Token end: 1028 PosValue: "FM" coarseValue: "X" + order: 0 [President] POS_X sofa: _InitialView @@ -10377,6 +10532,7 @@ Token end: 1038 PosValue: "FM" coarseValue: "X" + order: 0 [''] POS_PUNCT sofa: _InitialView @@ -10412,6 +10568,7 @@ Token end: 1041 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -10447,6 +10604,7 @@ Token end: 1043 PosValue: "$," coarseValue: "PUNCT" + order: 0 [während 78 Prozent sich für Bush und vier Prozent für Clinton aussprachen] Constituent sofa: _InitialView @@ -10552,6 +10710,7 @@ Token end: 1051 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [78 Prozent] Constituent sofa: _InitialView @@ -10650,6 +10809,7 @@ Token end: 1054 PosValue: "CARD" coarseValue: "NUM" + order: 0 [Prozent] POS_NOUN sofa: _InitialView @@ -10713,6 +10873,7 @@ Token end: 1062 PosValue: "NN" coarseValue: "NOUN" + order: 0 [sich] POS_PRON sofa: _InitialView @@ -10769,6 +10930,7 @@ Token end: 1067 PosValue: "PRF" coarseValue: "PRON" + order: 0 [für Bush] Constituent sofa: _InitialView @@ -10867,6 +11029,7 @@ Token end: 1071 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Bush] POS_PROPN sofa: _InitialView @@ -10930,6 +11093,7 @@ Token end: 1076 PosValue: "NE" coarseValue: "PROPN" + order: 0 [und] POS_CONJ sofa: _InitialView @@ -10979,6 +11143,7 @@ Token end: 1080 PosValue: "KON" coarseValue: "CONJ" + order: 0 [vier Prozent für Clinton aussprachen] Constituent sofa: _InitialView @@ -11105,6 +11270,7 @@ Token end: 1085 PosValue: "CARD" coarseValue: "NUM" + order: 0 [Prozent] POS_NOUN sofa: _InitialView @@ -11168,6 +11334,7 @@ Token end: 1093 PosValue: "NN" coarseValue: "NOUN" + order: 0 [für Clinton] Constituent sofa: _InitialView @@ -11266,6 +11433,7 @@ Token end: 1097 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Clinton] POS_PROPN sofa: _InitialView @@ -11329,6 +11497,7 @@ Token end: 1105 PosValue: "NE" coarseValue: "PROPN" + order: 0 [aussprachen] POS_VERB sofa: _InitialView @@ -11385,6 +11554,7 @@ Token end: 1117 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -11420,6 +11590,7 @@ Token end: 1119 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Allerdings glaubt fast die Hälfte der Chief Executives , daß Perot durchaus Chancen habe , die Wahl im November zu gewinnen , wenn er denn kandidiert .] Sentence sofa: _InitialView @@ -11430,7 +11601,7 @@ PennTree sofa: _InitialView begin: 1120 end: 1271 - PennTree: "(ROOT ($, ,) ($, ,) ($, ,) ($. .) (S--- (ADV Allerdings) (VVFIN glaubt) (NP-SB (ADV fast) (ART die) (NN Hälfte) (NP-AG (ART der) (FM Chief) (FM Executives))) (S-OC (KOUS daß) (NE Perot) (ADV durchaus) (VAFIN habe) (S-MO (KOUS wenn) (PPER er) (ADV denn) (VVFIN kandidiert)) (NP-OA (NN Chancen) (VP-OC (VZ-HD (PTKZU zu) (VVINF gewinnen)) (NP-OA (ART die) (NN Wahl) (PP-MNR (APPRART im) (NN November))))))))" + PennTree: "(ROOT ($, ,) ($, ,) ($, ,) ($. .) (S--- (ADV Allerdings) (VVFIN glaubt) (NP-SB (..." [Allerdings glaubt fast die Hälfte der Chief Executives , daß Perot durchaus Chancen habe , die Wahl im November zu gewinnen , wenn er denn kandidiert .] ROOT sofa: _InitialView @@ -11494,6 +11665,7 @@ Token end: 1130 PosValue: "ADV" coarseValue: "ADV" + order: 0 [glaubt] POS_VERB sofa: _InitialView @@ -11536,6 +11708,7 @@ Token end: 1137 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [fast die Hälfte der Chief Executives] Constituent sofa: _InitialView @@ -11606,6 +11779,7 @@ Token end: 1142 PosValue: "ADV" coarseValue: "ADV" + order: 0 [die] POS_DET sofa: _InitialView @@ -11655,6 +11829,7 @@ Token end: 1146 PosValue: "ART" coarseValue: "DET" + order: 0 [Hälfte] POS_NOUN sofa: _InitialView @@ -11704,6 +11879,7 @@ Token end: 1153 PosValue: "NN" coarseValue: "NOUN" + order: 0 [der Chief Executives] Constituent sofa: _InitialView @@ -11788,6 +11964,7 @@ Token end: 1157 PosValue: "ART" coarseValue: "DET" + order: 0 [Chief] POS_X sofa: _InitialView @@ -11844,6 +12021,7 @@ Token end: 1163 PosValue: "FM" coarseValue: "X" + order: 0 [Executives] POS_X sofa: _InitialView @@ -11900,6 +12078,7 @@ Token end: 1174 PosValue: "FM" coarseValue: "X" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -11935,6 +12114,7 @@ Token end: 1176 PosValue: "$," coarseValue: "PUNCT" + order: 0 [daß Perot durchaus Chancen habe , die Wahl im November zu gewinnen , wenn er denn kandidiert] Constituent sofa: _InitialView @@ -12005,6 +12185,7 @@ Token end: 1180 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -12054,6 +12235,7 @@ Token end: 1186 PosValue: "NE" coarseValue: "PROPN" + order: 0 [durchaus] POS_ADV sofa: _InitialView @@ -12103,6 +12285,7 @@ Token end: 1195 PosValue: "ADV" coarseValue: "ADV" + order: 0 [Chancen habe , die Wahl im November zu gewinnen] Constituent sofa: _InitialView @@ -12187,6 +12370,7 @@ Token end: 1203 PosValue: "NN" coarseValue: "NOUN" + order: 0 [habe] POS_VERB sofa: _InitialView @@ -12236,6 +12420,7 @@ Token end: 1208 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -12271,6 +12456,7 @@ Token end: 1210 PosValue: "$," coarseValue: "PUNCT" + order: 0 [die Wahl im November zu gewinnen] Constituent sofa: _InitialView @@ -12418,6 +12604,7 @@ Token end: 1214 PosValue: "ART" coarseValue: "DET" + order: 0 [Wahl] POS_NOUN sofa: _InitialView @@ -12488,6 +12675,7 @@ Token end: 1219 PosValue: "NN" coarseValue: "NOUN" + order: 0 [im November] Constituent sofa: _InitialView @@ -12614,6 +12802,7 @@ Token end: 1222 PosValue: "APPRART" coarseValue: "ADP" + order: 0 [November] POS_NOUN sofa: _InitialView @@ -12691,6 +12880,7 @@ Token end: 1231 PosValue: "NN" coarseValue: "NOUN" + order: 0 [zu gewinnen] Constituent sofa: _InitialView @@ -12803,6 +12993,7 @@ Token end: 1234 PosValue: "PTKZU" coarseValue: "X" + order: 0 [gewinnen] POS_VERB sofa: _InitialView @@ -12873,6 +13064,7 @@ Token end: 1243 PosValue: "VVINF" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -12908,6 +13100,7 @@ Token end: 1245 PosValue: "$," coarseValue: "PUNCT" + order: 0 [wenn er denn kandidiert] Constituent sofa: _InitialView @@ -12992,6 +13185,7 @@ Token end: 1250 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [er] POS_PRON sofa: _InitialView @@ -13048,6 +13242,7 @@ Token end: 1253 PosValue: "PPER" coarseValue: "PRON" + order: 0 [denn] POS_ADV sofa: _InitialView @@ -13104,6 +13299,7 @@ Token end: 1258 PosValue: "ADV" coarseValue: "ADV" + order: 0 [kandidiert] POS_VERB sofa: _InitialView @@ -13160,6 +13356,7 @@ Token end: 1269 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -13195,6 +13392,7 @@ Token end: 1271 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Als größte Schwäche des Texaners nennen die Befragten seinen Mangel an Erfahrung auf dem politischen Parkett .] Sentence sofa: _InitialView @@ -13205,7 +13403,7 @@ PennTree sofa: _InitialView begin: 1272 end: 1382 - PennTree: "(ROOT ($. .) (S--- (VVFIN nennen) (NP-SB (ART die) (NN Befragten)) (PP-MO (APPR Als) (ADJA größte) (NN Schwäche) (NP-AG (ART des) (NN Texaners))) (NP-OA (PPOSAT seinen) (NN Mangel) (PP-OP (APPR an) (NN Erfahrung) (PP-MNR (APPR auf) (ART dem) (ADJA politischen) (NN Parkett))))))" + PennTree: "(ROOT ($. .) (S--- (VVFIN nennen) (NP-SB (ART die) (NN Befragten)) (PP-MO (APPR ..." [Als größte Schwäche des Texaners nennen die Befragten seinen Mangel an Erfahrung auf dem politischen Parkett .] ROOT sofa: _InitialView @@ -13297,6 +13495,7 @@ Token end: 1275 PosValue: "APPR" coarseValue: "ADP" + order: 0 [größte] POS_ADJ sofa: _InitialView @@ -13346,6 +13545,7 @@ Token end: 1282 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Schwäche] POS_NOUN sofa: _InitialView @@ -13395,6 +13595,7 @@ Token end: 1291 PosValue: "NN" coarseValue: "NOUN" + order: 0 [des Texaners] Constituent sofa: _InitialView @@ -13479,6 +13680,7 @@ Token end: 1295 PosValue: "ART" coarseValue: "DET" + order: 0 [Texaners] POS_NOUN sofa: _InitialView @@ -13535,6 +13737,7 @@ Token end: 1304 PosValue: "NN" coarseValue: "NOUN" + order: 0 [nennen] POS_VERB sofa: _InitialView @@ -13577,6 +13780,7 @@ Token end: 1311 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [die Befragten] Constituent sofa: _InitialView @@ -13647,6 +13851,7 @@ Token end: 1315 PosValue: "ART" coarseValue: "DET" + order: 0 [Befragten] POS_NOUN sofa: _InitialView @@ -13696,6 +13901,7 @@ Token end: 1325 PosValue: "NN" coarseValue: "NOUN" + order: 0 [seinen Mangel an Erfahrung auf dem politischen Parkett] Constituent sofa: _InitialView @@ -13766,6 +13972,7 @@ Token end: 1332 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [Mangel] POS_NOUN sofa: _InitialView @@ -13815,6 +14022,7 @@ Token end: 1339 PosValue: "NN" coarseValue: "NOUN" + order: 0 [an Erfahrung auf dem politischen Parkett] Constituent sofa: _InitialView @@ -13899,6 +14107,7 @@ Token end: 1342 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Erfahrung] POS_NOUN sofa: _InitialView @@ -13955,6 +14164,7 @@ Token end: 1352 PosValue: "NN" coarseValue: "NOUN" + order: 0 [auf dem politischen Parkett] Constituent sofa: _InitialView @@ -14053,6 +14263,7 @@ Token end: 1356 PosValue: "APPR" coarseValue: "ADP" + order: 0 [dem] POS_DET sofa: _InitialView @@ -14116,6 +14327,7 @@ Token end: 1360 PosValue: "ART" coarseValue: "DET" + order: 0 [politischen] POS_ADJ sofa: _InitialView @@ -14179,6 +14391,7 @@ Token end: 1372 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Parkett] POS_NOUN sofa: _InitialView @@ -14242,6 +14455,7 @@ Token end: 1380 PosValue: "NN" coarseValue: "NOUN" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -14277,6 +14491,7 @@ Token end: 1382 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Viele meinen , daß Perot mit seinem Befehlston auf dem Capitol gegen eine Wand laufen würde .] Sentence sofa: _InitialView @@ -14287,7 +14502,7 @@ PennTree sofa: _InitialView begin: 1383 end: 1476 - PennTree: "(ROOT ($, ,) ($. .) (S--- (PIS Viele) (VVFIN meinen) (S-OC (KOUS daß) (NE Perot) (VAFIN würde) (VP-OC (VVINF laufen) (PP-MO (APPR mit) (PPOSAT seinem) (NN Befehlston)) (PP-MO (APPR auf) (ART dem) (NE Capitol)) (PP-MO (APPR gegen) (ART eine) (NN Wand))))))" + PennTree: "(ROOT ($, ,) ($. .) (S--- (PIS Viele) (VVFIN meinen) (S-OC (KOUS daß) (NE Perot)..." [Viele meinen , daß Perot mit seinem Befehlston auf dem Capitol gegen eine Wand laufen würde .] ROOT sofa: _InitialView @@ -14351,6 +14566,7 @@ Token end: 1388 PosValue: "PIS" coarseValue: "PRON" + order: 0 [meinen] POS_VERB sofa: _InitialView @@ -14393,6 +14609,7 @@ Token end: 1395 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -14428,6 +14645,7 @@ Token end: 1397 PosValue: "$," coarseValue: "PUNCT" + order: 0 [daß Perot mit seinem Befehlston auf dem Capitol gegen eine Wand laufen würde] Constituent sofa: _InitialView @@ -14498,6 +14716,7 @@ Token end: 1401 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -14547,6 +14766,7 @@ Token end: 1407 PosValue: "NE" coarseValue: "PROPN" + order: 0 [mit seinem Befehlston auf dem Capitol gegen eine Wand laufen] Constituent sofa: _InitialView @@ -14673,6 +14893,7 @@ Token end: 1411 PosValue: "APPR" coarseValue: "ADP" + order: 0 [seinem] POS_PRON sofa: _InitialView @@ -14736,6 +14957,7 @@ Token end: 1418 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [Befehlston] POS_NOUN sofa: _InitialView @@ -14799,6 +15021,7 @@ Token end: 1429 PosValue: "NN" coarseValue: "NOUN" + order: 0 [auf dem Capitol] Constituent sofa: _InitialView @@ -14897,6 +15120,7 @@ Token end: 1433 PosValue: "APPR" coarseValue: "ADP" + order: 0 [dem] POS_DET sofa: _InitialView @@ -14960,6 +15184,7 @@ Token end: 1437 PosValue: "ART" coarseValue: "DET" + order: 0 [Capitol] POS_PROPN sofa: _InitialView @@ -15023,6 +15248,7 @@ Token end: 1445 PosValue: "NE" coarseValue: "PROPN" + order: 0 [gegen eine Wand] Constituent sofa: _InitialView @@ -15121,6 +15347,7 @@ Token end: 1451 PosValue: "APPR" coarseValue: "ADP" + order: 0 [eine] POS_DET sofa: _InitialView @@ -15184,6 +15411,7 @@ Token end: 1456 PosValue: "ART" coarseValue: "DET" + order: 0 [Wand] POS_NOUN sofa: _InitialView @@ -15247,6 +15475,7 @@ Token end: 1461 PosValue: "NN" coarseValue: "NOUN" + order: 0 [laufen] POS_VERB sofa: _InitialView @@ -15303,6 +15532,7 @@ Token end: 1468 PosValue: "VVINF" coarseValue: "VERB" + order: 0 [würde] POS_VERB sofa: _InitialView @@ -15352,6 +15582,7 @@ Token end: 1474 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -15387,6 +15618,7 @@ Token end: 1476 PosValue: "$." coarseValue: "PUNCT" + order: 0 [So erklärt etwa Edward Brandon von dem Unternehmen National City in Ohio :] Sentence sofa: _InitialView @@ -15397,7 +15629,7 @@ PennTree sofa: _InitialView begin: 1477 end: 1551 - PennTree: "(ROOT ($. :) (S--- (ADV So) (VVFIN erklärt) (NP-SB (ADV etwa) (PN-NK (NE Edward) (NE Brandon)) (PP-MNR (APPR von) (ART dem) (NN Unternehmen) (PN-NK (NE National) (NE City)) (PP-MNR (APPR in) (NE Ohio))))))" + PennTree: "(ROOT ($. :) (S--- (ADV So) (VVFIN erklärt) (NP-SB (ADV etwa) (PN-NK (NE Edward)..." [So erklärt etwa Edward Brandon von dem Unternehmen National City in Ohio :] ROOT sofa: _InitialView @@ -15461,6 +15693,7 @@ Token end: 1479 PosValue: "ADV" coarseValue: "ADV" + order: 0 [erklärt] POS_VERB sofa: _InitialView @@ -15503,6 +15736,7 @@ Token end: 1487 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [etwa Edward Brandon von dem Unternehmen National City in Ohio] Constituent sofa: _InitialView @@ -15573,6 +15807,7 @@ Token end: 1492 PosValue: "ADV" coarseValue: "ADV" + order: 0 [Edward Brandon] Constituent sofa: _InitialView @@ -15657,6 +15892,7 @@ Token end: 1499 PosValue: "NE" coarseValue: "PROPN" + order: 0 [Brandon] POS_PROPN sofa: _InitialView @@ -15713,6 +15949,7 @@ Token end: 1507 PosValue: "NE" coarseValue: "PROPN" + order: 0 [von dem Unternehmen National City in Ohio] Constituent sofa: _InitialView @@ -15797,6 +16034,7 @@ Token end: 1511 PosValue: "APPR" coarseValue: "ADP" + order: 0 [dem] POS_DET sofa: _InitialView @@ -15853,6 +16091,7 @@ Token end: 1515 PosValue: "ART" coarseValue: "DET" + order: 0 [Unternehmen] POS_NOUN sofa: _InitialView @@ -15909,6 +16148,7 @@ Token end: 1527 PosValue: "NN" coarseValue: "NOUN" + order: 0 [National City] Constituent sofa: _InitialView @@ -16007,6 +16247,7 @@ Token end: 1536 PosValue: "NE" coarseValue: "PROPN" + order: 0 [City] POS_PROPN sofa: _InitialView @@ -16070,6 +16311,7 @@ Token end: 1541 PosValue: "NE" coarseValue: "PROPN" + order: 0 [in Ohio] Constituent sofa: _InitialView @@ -16168,6 +16410,7 @@ Token end: 1544 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Ohio] POS_PROPN sofa: _InitialView @@ -16231,6 +16474,7 @@ Token end: 1549 PosValue: "NE" coarseValue: "PROPN" + order: 0 [:] POS_PUNCT sofa: _InitialView @@ -16266,6 +16510,7 @@ Token end: 1551 PosValue: "$." coarseValue: "PUNCT" + order: 0 [`` Ich glaube kaum , daß mit seinem , naja , etwas undiplomatischen Stil im Weißen Haus dem Land ein Gefallen getan wäre .] Sentence sofa: _InitialView @@ -16276,7 +16521,7 @@ PennTree sofa: _InitialView begin: 1552 end: 1674 - PennTree: "(ROOT ($( ``) ($, ,) ($, ,) ($, ,) ($. .) (S--- (PPER Ich) (VVFIN glaube) (ADV kaum) (S-OC (KOUS daß) (VAFIN wäre) (NP-SB (ART ein) (NN Gefallen)) (VP-OC (VVPP getan) (NP-DA (ART dem) (NN Land)) (PP-MO (APPR mit) (PPOSAT seinem) (NN Stil) (AP-NK (ITJ naja) (ADV etwas) (ADJA undiplomatischen)) (PP-MNR (APPRART im) (PN-NK (NP-PNC (ADJA Weißen) (NN Haus)))))))))" + PennTree: "(ROOT ($( ``) ($, ,) ($, ,) ($, ,) ($. .) (S--- (PPER Ich) (VVFIN glaube) (ADV k..." [`` Ich glaube kaum , daß mit seinem , naja , etwas undiplomatischen Stil im Weißen Haus dem Land ein Gefallen getan wäre .] ROOT sofa: _InitialView @@ -16319,6 +16564,7 @@ Token end: 1554 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [Ich glaube kaum , daß mit seinem , naja , etwas undiplomatischen Stil im Weißen Haus dem Land ein Gefallen getan wäre] Constituent sofa: _InitialView @@ -16375,6 +16621,7 @@ Token end: 1558 PosValue: "PPER" coarseValue: "PRON" + order: 0 [glaube] POS_VERB sofa: _InitialView @@ -16417,6 +16664,7 @@ Token end: 1565 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [kaum] POS_ADV sofa: _InitialView @@ -16459,6 +16707,7 @@ Token end: 1570 PosValue: "ADV" coarseValue: "ADV" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -16494,6 +16743,7 @@ Token end: 1572 PosValue: "$," coarseValue: "PUNCT" + order: 0 [daß mit seinem , naja , etwas undiplomatischen Stil im Weißen Haus dem Land ein Gefallen getan wäre] Constituent sofa: _InitialView @@ -16564,6 +16814,7 @@ Token end: 1576 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [mit seinem , naja , etwas undiplomatischen Stil im Weißen Haus dem Land ein Gefallen getan] Constituent sofa: _InitialView @@ -16690,6 +16941,7 @@ Token end: 1580 PosValue: "APPR" coarseValue: "ADP" + order: 0 [seinem] POS_PRON sofa: _InitialView @@ -16753,6 +17005,7 @@ Token end: 1587 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -16788,6 +17041,7 @@ Token end: 1589 PosValue: "$," coarseValue: "PUNCT" + order: 0 [naja , etwas undiplomatischen] Constituent sofa: _InitialView @@ -16900,6 +17154,7 @@ Token end: 1594 PosValue: "ITJ" coarseValue: "X" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -16935,6 +17190,7 @@ Token end: 1596 PosValue: "$," coarseValue: "PUNCT" + order: 0 [etwas] POS_ADV sofa: _InitialView @@ -17005,6 +17261,7 @@ Token end: 1602 PosValue: "ADV" coarseValue: "ADV" + order: 0 [undiplomatischen] POS_ADJ sofa: _InitialView @@ -17075,6 +17332,7 @@ Token end: 1619 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Stil] POS_NOUN sofa: _InitialView @@ -17138,6 +17396,7 @@ Token end: 1624 PosValue: "NN" coarseValue: "NOUN" + order: 0 [im Weißen Haus] Constituent sofa: _InitialView @@ -17250,6 +17509,7 @@ Token end: 1627 PosValue: "APPRART" coarseValue: "ADP" + order: 0 [Weißen Haus] Constituent sofa: _InitialView @@ -17439,6 +17699,7 @@ Token end: 1634 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Haus] POS_NOUN sofa: _InitialView @@ -17523,6 +17784,7 @@ Token end: 1639 PosValue: "NN" coarseValue: "NOUN" + order: 0 [dem Land] Constituent sofa: _InitialView @@ -17621,6 +17883,7 @@ Token end: 1643 PosValue: "ART" coarseValue: "DET" + order: 0 [Land] POS_NOUN sofa: _InitialView @@ -17684,6 +17947,7 @@ Token end: 1648 PosValue: "NN" coarseValue: "NOUN" + order: 0 [ein Gefallen] Constituent sofa: _InitialView @@ -17768,6 +18032,7 @@ Token end: 1652 PosValue: "ART" coarseValue: "DET" + order: 0 [Gefallen] POS_NOUN sofa: _InitialView @@ -17824,6 +18089,7 @@ Token end: 1661 PosValue: "NN" coarseValue: "NOUN" + order: 0 [getan] POS_VERB sofa: _InitialView @@ -17880,6 +18146,7 @@ Token end: 1667 PosValue: "VVPP" coarseValue: "VERB" + order: 0 [wäre] POS_VERB sofa: _InitialView @@ -17929,6 +18196,7 @@ Token end: 1672 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -17964,6 +18232,7 @@ Token end: 1674 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Er wäre vielleicht ein prächtiger Diktator - aber das ist nicht unser System . ''] Sentence sofa: _InitialView @@ -17974,7 +18243,7 @@ PennTree sofa: _InitialView begin: 1675 end: 1756 - PennTree: "(ROOT ($( -) ($. .) ($( '') (CS--- (KON aber) (S-CJ (PPER Er) (VAFIN wäre) (ADV vielleicht) (NP-PD (ART ein) (ADJA prächtiger) (NN Diktator))) (S-CJ (PDS das) (VAFIN ist) (PTKNEG nicht) (NP-PD (PPOSAT unser) (NN System)))))" + PennTree: "(ROOT ($( -) ($. .) ($( '') (CS--- (KON aber) (S-CJ (PPER Er) (VAFIN wäre) (ADV ..." [Er wäre vielleicht ein prächtiger Diktator - aber das ist nicht unser System . ''] ROOT sofa: _InitialView @@ -18066,6 +18335,7 @@ Token end: 1677 PosValue: "PPER" coarseValue: "PRON" + order: 0 [wäre] POS_VERB sofa: _InitialView @@ -18115,6 +18385,7 @@ Token end: 1682 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [vielleicht] POS_ADV sofa: _InitialView @@ -18164,6 +18435,7 @@ Token end: 1693 PosValue: "ADV" coarseValue: "ADV" + order: 0 [ein prächtiger Diktator] Constituent sofa: _InitialView @@ -18248,6 +18520,7 @@ Token end: 1697 PosValue: "ART" coarseValue: "DET" + order: 0 [prächtiger] POS_ADJ sofa: _InitialView @@ -18304,6 +18577,7 @@ Token end: 1708 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Diktator] POS_NOUN sofa: _InitialView @@ -18360,6 +18634,7 @@ Token end: 1717 PosValue: "NN" coarseValue: "NOUN" + order: 0 [-] POS_PUNCT sofa: _InitialView @@ -18395,6 +18670,7 @@ Token end: 1719 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [aber] POS_CONJ sofa: _InitialView @@ -18437,6 +18713,7 @@ Token end: 1724 PosValue: "KON" coarseValue: "CONJ" + order: 0 [das ist nicht unser System] Constituent sofa: _InitialView @@ -18507,6 +18784,7 @@ Token end: 1728 PosValue: "PDS" coarseValue: "PRON" + order: 0 [ist] POS_VERB sofa: _InitialView @@ -18556,6 +18834,7 @@ Token end: 1732 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [nicht] POS_X sofa: _InitialView @@ -18605,6 +18884,7 @@ Token end: 1738 PosValue: "PTKNEG" coarseValue: "X" + order: 0 [unser System] Constituent sofa: _InitialView @@ -18689,6 +18969,7 @@ Token end: 1744 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [System] POS_NOUN sofa: _InitialView @@ -18745,6 +19026,7 @@ Token end: 1751 PosValue: "NN" coarseValue: "NOUN" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -18780,6 +19062,7 @@ Token end: 1753 PosValue: "$." coarseValue: "PUNCT" + order: 0 [''] POS_PUNCT sofa: _InitialView @@ -18815,6 +19098,7 @@ Token end: 1756 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [Und ein anderer Manager vermutet , daß sich `` ein Dogmatiker wie Perot in Washington schwer tun würde , es sei denn er schafft den Kongreß ab '' .] Sentence sofa: _InitialView @@ -18825,7 +19109,7 @@ PennTree sofa: _InitialView begin: 1757 end: 1904 - PennTree: "(ROOT ($, ,) ($( ``) ($, ,) ($( '') ($. .) (S--- (KON Und) (VVFIN vermutet) (NP-SB (ART ein) (ADJA anderer) (NN Manager)) (S-OC (KOUS daß) (VAFIN würde) (NP-SB (ART ein) (NN Dogmatiker) (NP-CC (KOKOM wie) (NE Perot))) (VP-OC (PRF sich) (ADJD schwer) (VVINF tun) (PP-MO (APPR in) (NE Washington)) (S-MO (PPER es) (VAFIN sei) (ADV denn) (S-OC (PPER er) (VVFIN schafft) (PTKVZ ab) (NP-OA (ART den) (NN Kongreß))))))))" + PennTree: "(ROOT ($, ,) ($( ``) ($, ,) ($( '') ($. .) (S--- (KON Und) (VVFIN vermutet) (NP-..." [Und ein anderer Manager vermutet , daß sich `` ein Dogmatiker wie Perot in Washington schwer tun würde , es sei denn er schafft den Kongreß ab '' .] ROOT sofa: _InitialView @@ -18889,6 +19173,7 @@ Token end: 1760 PosValue: "KON" coarseValue: "CONJ" + order: 0 [ein anderer Manager] Constituent sofa: _InitialView @@ -18959,6 +19244,7 @@ Token end: 1764 PosValue: "ART" coarseValue: "DET" + order: 0 [anderer] POS_ADJ sofa: _InitialView @@ -19008,6 +19294,7 @@ Token end: 1772 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Manager] POS_NOUN sofa: _InitialView @@ -19057,6 +19344,7 @@ Token end: 1780 PosValue: "NN" coarseValue: "NOUN" + order: 0 [vermutet] POS_VERB sofa: _InitialView @@ -19099,6 +19387,7 @@ Token end: 1789 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -19134,6 +19423,7 @@ Token end: 1791 PosValue: "$," coarseValue: "PUNCT" + order: 0 [daß sich `` ein Dogmatiker wie Perot in Washington schwer tun würde , es sei denn er schafft den Kongreß ab] Constituent sofa: _InitialView @@ -19204,6 +19494,7 @@ Token end: 1795 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [sich `` ein Dogmatiker wie Perot in Washington schwer tun würde , es sei denn er schafft den Kongreß ab] Constituent sofa: _InitialView @@ -19288,6 +19579,7 @@ Token end: 1800 PosValue: "PRF" coarseValue: "PRON" + order: 0 [``] POS_PUNCT sofa: _InitialView @@ -19323,6 +19615,7 @@ Token end: 1803 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [ein Dogmatiker wie Perot] Constituent sofa: _InitialView @@ -19407,6 +19700,7 @@ Token end: 1807 PosValue: "ART" coarseValue: "DET" + order: 0 [Dogmatiker] POS_NOUN sofa: _InitialView @@ -19463,6 +19757,7 @@ Token end: 1818 PosValue: "NN" coarseValue: "NOUN" + order: 0 [wie Perot] Constituent sofa: _InitialView @@ -19561,6 +19856,7 @@ Token end: 1822 PosValue: "KOKOM" coarseValue: "CONJ" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -19624,6 +19920,7 @@ Token end: 1828 PosValue: "NE" coarseValue: "PROPN" + order: 0 [in Washington] Constituent sofa: _InitialView @@ -19722,6 +20019,7 @@ Token end: 1831 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Washington] POS_PROPN sofa: _InitialView @@ -19785,6 +20083,7 @@ Token end: 1842 PosValue: "NE" coarseValue: "PROPN" + order: 0 [schwer] POS_ADJ sofa: _InitialView @@ -19841,6 +20140,7 @@ Token end: 1849 PosValue: "ADJD" coarseValue: "ADJ" + order: 0 [tun] POS_VERB sofa: _InitialView @@ -19897,6 +20197,7 @@ Token end: 1853 PosValue: "VVINF" coarseValue: "VERB" + order: 0 [würde] POS_VERB sofa: _InitialView @@ -19946,6 +20247,7 @@ Token end: 1859 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -19981,6 +20283,7 @@ Token end: 1861 PosValue: "$," coarseValue: "PUNCT" + order: 0 [es sei denn er schafft den Kongreß ab] Constituent sofa: _InitialView @@ -20079,6 +20382,7 @@ Token end: 1864 PosValue: "PPER" coarseValue: "PRON" + order: 0 [sei] POS_VERB sofa: _InitialView @@ -20142,6 +20446,7 @@ Token end: 1868 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [denn] POS_ADV sofa: _InitialView @@ -20205,6 +20510,7 @@ Token end: 1873 PosValue: "ADV" coarseValue: "ADV" + order: 0 [er schafft den Kongreß ab] Constituent sofa: _InitialView @@ -20317,6 +20623,7 @@ Token end: 1876 PosValue: "PPER" coarseValue: "PRON" + order: 0 [schafft] POS_VERB sofa: _InitialView @@ -20387,6 +20694,7 @@ Token end: 1884 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [den Kongreß] Constituent sofa: _InitialView @@ -20513,6 +20821,7 @@ Token end: 1888 PosValue: "ART" coarseValue: "DET" + order: 0 [Kongreß] POS_NOUN sofa: _InitialView @@ -20590,6 +20899,7 @@ Token end: 1896 PosValue: "NN" coarseValue: "NOUN" + order: 0 [ab] POS_VERB sofa: _InitialView @@ -20660,6 +20970,7 @@ Token end: 1899 PosValue: "PTKVZ" coarseValue: "VERB" + order: 0 [''] POS_PUNCT sofa: _InitialView @@ -20695,6 +21006,7 @@ Token end: 1902 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -20730,6 +21042,7 @@ Token end: 1904 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Ein ehemaliger Geschäftsführer , der heute in fünf Konzernen im Aufsichtsgremium sitzt , `` kennt niemanden , der nicht glaubt , daß Perot als Präsident eine absolute Katastrophe wäre '' .] Sentence sofa: _InitialView @@ -20740,7 +21053,7 @@ PennTree sofa: _InitialView begin: 1905 end: 2093 - PennTree: "(ROOT ($, ,) ($, ,) ($( ``) ($, ,) ($, ,) ($( '') ($. .) (S--- (VVFIN kennt) (NP-SB (ART Ein) (ADJA ehemaliger) (NN Geschäftsführer) (S-RC (PRELS der) (ADV heute) (VVFIN sitzt) (PP-MO (APPR in) (CARD fünf) (NN Konzernen)) (PP-MO (APPRART im) (NN Aufsichtsgremium)))) (NP-OA (PIS niemanden) (S-RC (PRELS der) (PTKNEG nicht) (VVFIN glaubt) (S-OC (KOUS daß) (NE Perot) (VAFIN wäre) (PP-MO (APPR als) (NN Präsident)) (NP-PD (ART eine) (ADJA absolute) (NN Katastrophe)))))))" + PennTree: "(ROOT ($, ,) ($, ,) ($( ``) ($, ,) ($, ,) ($( '') ($. .) (S--- (VVFIN kennt) (NP..." [Ein ehemaliger Geschäftsführer , der heute in fünf Konzernen im Aufsichtsgremium sitzt , `` kennt niemanden , der nicht glaubt , daß Perot als Präsident eine absolute Katastrophe wäre '' .] ROOT sofa: _InitialView @@ -20832,6 +21145,7 @@ Token end: 1908 PosValue: "ART" coarseValue: "DET" + order: 0 [ehemaliger] POS_ADJ sofa: _InitialView @@ -20881,6 +21195,7 @@ Token end: 1919 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Geschäftsführer] POS_NOUN sofa: _InitialView @@ -20930,6 +21245,7 @@ Token end: 1935 PosValue: "NN" coarseValue: "NOUN" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -20965,6 +21281,7 @@ Token end: 1937 PosValue: "$," coarseValue: "PUNCT" + order: 0 [der heute in fünf Konzernen im Aufsichtsgremium sitzt] Constituent sofa: _InitialView @@ -21049,6 +21366,7 @@ Token end: 1941 PosValue: "PRELS" coarseValue: "PRON" + order: 0 [heute] POS_ADV sofa: _InitialView @@ -21105,6 +21423,7 @@ Token end: 1947 PosValue: "ADV" coarseValue: "ADV" + order: 0 [in fünf Konzernen] Constituent sofa: _InitialView @@ -21203,6 +21522,7 @@ Token end: 1950 PosValue: "APPR" coarseValue: "ADP" + order: 0 [fünf] POS_NUM sofa: _InitialView @@ -21266,6 +21586,7 @@ Token end: 1955 PosValue: "CARD" coarseValue: "NUM" + order: 0 [Konzernen] POS_NOUN sofa: _InitialView @@ -21329,6 +21650,7 @@ Token end: 1965 PosValue: "NN" coarseValue: "NOUN" + order: 0 [im Aufsichtsgremium] Constituent sofa: _InitialView @@ -21427,6 +21749,7 @@ Token end: 1968 PosValue: "APPRART" coarseValue: "ADP" + order: 0 [Aufsichtsgremium] POS_NOUN sofa: _InitialView @@ -21490,6 +21813,7 @@ Token end: 1985 PosValue: "NN" coarseValue: "NOUN" + order: 0 [sitzt] POS_VERB sofa: _InitialView @@ -21546,6 +21870,7 @@ Token end: 1991 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -21581,6 +21906,7 @@ Token end: 1993 PosValue: "$," coarseValue: "PUNCT" + order: 0 [``] POS_PUNCT sofa: _InitialView @@ -21616,6 +21942,7 @@ Token end: 1996 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [kennt] POS_VERB sofa: _InitialView @@ -21658,6 +21985,7 @@ Token end: 2002 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [niemanden , der nicht glaubt , daß Perot als Präsident eine absolute Katastrophe wäre] Constituent sofa: _InitialView @@ -21728,6 +22056,7 @@ Token end: 2012 PosValue: "PIS" coarseValue: "PRON" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -21763,6 +22092,7 @@ Token end: 2014 PosValue: "$," coarseValue: "PUNCT" + order: 0 [der nicht glaubt , daß Perot als Präsident eine absolute Katastrophe wäre] Constituent sofa: _InitialView @@ -21847,6 +22177,7 @@ Token end: 2018 PosValue: "PRELS" coarseValue: "PRON" + order: 0 [nicht] POS_X sofa: _InitialView @@ -21903,6 +22234,7 @@ Token end: 2024 PosValue: "PTKNEG" coarseValue: "X" + order: 0 [glaubt] POS_VERB sofa: _InitialView @@ -21959,6 +22291,7 @@ Token end: 2031 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -21994,6 +22327,7 @@ Token end: 2033 PosValue: "$," coarseValue: "PUNCT" + order: 0 [daß Perot als Präsident eine absolute Katastrophe wäre] Constituent sofa: _InitialView @@ -22092,6 +22426,7 @@ Token end: 2037 PosValue: "KOUS" coarseValue: "CONJ" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -22155,6 +22490,7 @@ Token end: 2043 PosValue: "NE" coarseValue: "PROPN" + order: 0 [als Präsident] Constituent sofa: _InitialView @@ -22267,6 +22603,7 @@ Token end: 2047 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Präsident] POS_NOUN sofa: _InitialView @@ -22337,6 +22674,7 @@ Token end: 2057 PosValue: "NN" coarseValue: "NOUN" + order: 0 [eine absolute Katastrophe] Constituent sofa: _InitialView @@ -22449,6 +22787,7 @@ Token end: 2062 PosValue: "ART" coarseValue: "DET" + order: 0 [absolute] POS_ADJ sofa: _InitialView @@ -22519,6 +22858,7 @@ Token end: 2071 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Katastrophe] POS_NOUN sofa: _InitialView @@ -22589,6 +22929,7 @@ Token end: 2083 PosValue: "NN" coarseValue: "NOUN" + order: 0 [wäre] POS_VERB sofa: _InitialView @@ -22652,6 +22993,7 @@ Token end: 2088 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [''] POS_PUNCT sofa: _InitialView @@ -22687,6 +23029,7 @@ Token end: 2091 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -22722,6 +23065,7 @@ Token end: 2093 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Allerdings gibt es dem Magazin zufolge in kleinen und mittleren Firmen viele Unternehmer , die meinen , Perot sei einer von ihnen , und die den Texaner unterstützen .] Sentence sofa: _InitialView @@ -22732,7 +23076,7 @@ PennTree sofa: _InitialView begin: 2094 end: 2260 - PennTree: "(ROOT ($, ,) ($, ,) ($, ,) ($. .) (S--- (ADV Allerdings) (VVFIN gibt) (PPER es) (PP-MO (ART dem) (NN Magazin) (APPO zufolge)) (PP-MO (APPR in) (NN Firmen) (CAP-NK (ADJA kleinen) (KON und) (ADJA mittleren))) (NP-OA (PIAT viele) (NN Unternehmer) (CS-RC (KON und) (S-CJ (PRELS die) (VVFIN unterstützen) (NP-OA (ART den) (NN Texaner))) (S-CJ (PRELS die) (VVFIN meinen) (S-OC (NE Perot) (VAFIN sei) (NP-PD (PIS einer) (PP-PG (APPR von) (PPER ihnen)))))))))" + PennTree: "(ROOT ($, ,) ($, ,) ($, ,) ($. .) (S--- (ADV Allerdings) (VVFIN gibt) (PPER es) ..." [Allerdings gibt es dem Magazin zufolge in kleinen und mittleren Firmen viele Unternehmer , die meinen , Perot sei einer von ihnen , und die den Texaner unterstützen .] ROOT sofa: _InitialView @@ -22796,6 +23140,7 @@ Token end: 2104 PosValue: "ADV" coarseValue: "ADV" + order: 0 [gibt] POS_VERB sofa: _InitialView @@ -22838,6 +23183,7 @@ Token end: 2109 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [es] POS_PRON sofa: _InitialView @@ -22880,6 +23226,7 @@ Token end: 2112 PosValue: "PPER" coarseValue: "PRON" + order: 0 [dem Magazin zufolge] Constituent sofa: _InitialView @@ -22950,6 +23297,7 @@ Token end: 2116 PosValue: "ART" coarseValue: "DET" + order: 0 [Magazin] POS_NOUN sofa: _InitialView @@ -22999,6 +23347,7 @@ Token end: 2124 PosValue: "NN" coarseValue: "NOUN" + order: 0 [zufolge] POS_ADP sofa: _InitialView @@ -23048,6 +23397,7 @@ Token end: 2132 PosValue: "APPO" coarseValue: "ADP" + order: 0 [in kleinen und mittleren Firmen] Constituent sofa: _InitialView @@ -23118,6 +23468,7 @@ Token end: 2135 PosValue: "APPR" coarseValue: "ADP" + order: 0 [kleinen und mittleren] Constituent sofa: _InitialView @@ -23202,6 +23553,7 @@ Token end: 2143 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [und] POS_CONJ sofa: _InitialView @@ -23258,6 +23610,7 @@ Token end: 2147 PosValue: "KON" coarseValue: "CONJ" + order: 0 [mittleren] POS_ADJ sofa: _InitialView @@ -23314,6 +23667,7 @@ Token end: 2157 PosValue: "ADJA" coarseValue: "ADJ" + order: 0 [Firmen] POS_NOUN sofa: _InitialView @@ -23363,6 +23717,7 @@ Token end: 2164 PosValue: "NN" coarseValue: "NOUN" + order: 0 [viele Unternehmer , die meinen , Perot sei einer von ihnen , und die den Texaner unterstützen] Constituent sofa: _InitialView @@ -23433,6 +23788,7 @@ Token end: 2170 PosValue: "PIAT" coarseValue: "PRON" + order: 0 [Unternehmer] POS_NOUN sofa: _InitialView @@ -23482,6 +23838,7 @@ Token end: 2182 PosValue: "NN" coarseValue: "NOUN" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -23517,6 +23874,7 @@ Token end: 2184 PosValue: "$," coarseValue: "PUNCT" + order: 0 [die meinen , Perot sei einer von ihnen , und die den Texaner unterstützen] Constituent sofa: _InitialView @@ -23643,6 +24001,7 @@ Token end: 2188 PosValue: "PRELS" coarseValue: "PRON" + order: 0 [meinen] POS_VERB sofa: _InitialView @@ -23706,6 +24065,7 @@ Token end: 2195 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -23741,6 +24101,7 @@ Token end: 2197 PosValue: "$," coarseValue: "PUNCT" + order: 0 [Perot sei einer von ihnen] Constituent sofa: _InitialView @@ -23853,6 +24214,7 @@ Token end: 2203 PosValue: "NE" coarseValue: "PROPN" + order: 0 [sei] POS_VERB sofa: _InitialView @@ -23923,6 +24285,7 @@ Token end: 2207 PosValue: "VAFIN" coarseValue: "VERB" + order: 0 [einer von ihnen] Constituent sofa: _InitialView @@ -24049,6 +24412,7 @@ Token end: 2213 PosValue: "PIS" coarseValue: "PRON" + order: 0 [von ihnen] Constituent sofa: _InitialView @@ -24189,6 +24553,7 @@ Token end: 2217 PosValue: "APPR" coarseValue: "ADP" + order: 0 [ihnen] POS_PRON sofa: _InitialView @@ -24273,6 +24638,7 @@ Token end: 2223 PosValue: "PPER" coarseValue: "PRON" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -24308,6 +24674,7 @@ Token end: 2225 PosValue: "$," coarseValue: "PUNCT" + order: 0 [und] POS_CONJ sofa: _InitialView @@ -24364,6 +24731,7 @@ Token end: 2229 PosValue: "KON" coarseValue: "CONJ" + order: 0 [die den Texaner unterstützen] Constituent sofa: _InitialView @@ -24462,6 +24830,7 @@ Token end: 2233 PosValue: "PRELS" coarseValue: "PRON" + order: 0 [den Texaner] Constituent sofa: _InitialView @@ -24574,6 +24943,7 @@ Token end: 2237 PosValue: "ART" coarseValue: "DET" + order: 0 [Texaner] POS_NOUN sofa: _InitialView @@ -24644,6 +25014,7 @@ Token end: 2245 PosValue: "NN" coarseValue: "NOUN" + order: 0 [unterstützen] POS_VERB sofa: _InitialView @@ -24707,6 +25078,7 @@ Token end: 2258 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -24742,6 +25114,7 @@ Token end: 2260 PosValue: "$." coarseValue: "PUNCT" + order: 0 [Zwei Themen , die Perot immer wieder anspricht , Rezession und Bürokratie , machen ihnen besonders zu schaffen .] Sentence sofa: _InitialView @@ -24752,7 +25125,7 @@ PennTree sofa: _InitialView begin: 2261 end: 2373 - PennTree: "(ROOT ($, ,) ($, ,) ($, ,) ($. .) (S--- (VVFIN machen) (VP-OC (PPER ihnen) (ADV besonders) (VZ-HD (PTKZU zu) (VVINF schaffen))) (NP-SB (CARD Zwei) (NN Themen) (CNP-APP (NN Rezession) (KON und) (NN Bürokratie)) (S-RC (PRELS die) (NE Perot) (VVFIN anspricht) (AVP-MO (ADV immer) (ADV wieder))))))" + PennTree: "(ROOT ($, ,) ($, ,) ($, ,) ($. .) (S--- (VVFIN machen) (VP-OC (PPER ihnen) (ADV ..." [Zwei Themen , die Perot immer wieder anspricht , Rezession und Bürokratie , machen ihnen besonders zu schaffen .] ROOT sofa: _InitialView @@ -24844,6 +25217,7 @@ Token end: 2265 PosValue: "CARD" coarseValue: "NUM" + order: 0 [Themen] POS_NOUN sofa: _InitialView @@ -24893,6 +25267,7 @@ Token end: 2272 PosValue: "NN" coarseValue: "NOUN" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -24928,6 +25303,7 @@ Token end: 2274 PosValue: "$," coarseValue: "PUNCT" + order: 0 [die Perot immer wieder anspricht] Constituent sofa: _InitialView @@ -25012,6 +25388,7 @@ Token end: 2278 PosValue: "PRELS" coarseValue: "PRON" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -25068,6 +25445,7 @@ Token end: 2284 PosValue: "NE" coarseValue: "PROPN" + order: 0 [immer wieder] Constituent sofa: _InitialView @@ -25166,6 +25544,7 @@ Token end: 2290 PosValue: "ADV" coarseValue: "ADV" + order: 0 [wieder] POS_ADV sofa: _InitialView @@ -25229,6 +25608,7 @@ Token end: 2297 PosValue: "ADV" coarseValue: "ADV" + order: 0 [anspricht] POS_VERB sofa: _InitialView @@ -25285,6 +25665,7 @@ Token end: 2307 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -25320,6 +25701,7 @@ Token end: 2309 PosValue: "$," coarseValue: "PUNCT" + order: 0 [Rezession und Bürokratie] Constituent sofa: _InitialView @@ -25404,6 +25786,7 @@ Token end: 2319 PosValue: "NN" coarseValue: "NOUN" + order: 0 [und] POS_CONJ sofa: _InitialView @@ -25460,6 +25843,7 @@ Token end: 2323 PosValue: "KON" coarseValue: "CONJ" + order: 0 [Bürokratie] POS_NOUN sofa: _InitialView @@ -25516,6 +25900,7 @@ Token end: 2334 PosValue: "NN" coarseValue: "NOUN" + order: 0 [,] POS_PUNCT sofa: _InitialView @@ -25551,6 +25936,7 @@ Token end: 2336 PosValue: "$," coarseValue: "PUNCT" + order: 0 [machen] POS_VERB sofa: _InitialView @@ -25593,6 +25979,7 @@ Token end: 2343 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [ihnen besonders zu schaffen] Constituent sofa: _InitialView @@ -25663,6 +26050,7 @@ Token end: 2349 PosValue: "PPER" coarseValue: "PRON" + order: 0 [besonders] POS_ADV sofa: _InitialView @@ -25712,6 +26100,7 @@ Token end: 2359 PosValue: "ADV" coarseValue: "ADV" + order: 0 [zu schaffen] Constituent sofa: _InitialView @@ -25796,6 +26185,7 @@ Token end: 2362 PosValue: "PTKZU" coarseValue: "X" + order: 0 [schaffen] POS_VERB sofa: _InitialView @@ -25852,6 +26242,7 @@ Token end: 2371 PosValue: "VVINF" coarseValue: "VERB" + order: 0 [.] POS_PUNCT sofa: _InitialView @@ -25887,6 +26278,7 @@ Token end: 2373 PosValue: "$." coarseValue: "PUNCT" + order: 0 -------- View _InitialView end ---------------------------------- ======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-negra-asl/src/test/resources/tueba-sample.export.dump b/dkpro-core-io-negra-asl/src/test/resources/tueba-sample.export.dump index 545b936cfd..bcd7acfe34 100644 --- a/dkpro-core-io-negra-asl/src/test/resources/tueba-sample.export.dump +++ b/dkpro-core-io-negra-asl/src/test/resources/tueba-sample.export.dump @@ -22,7 +22,7 @@ PennTree sofa: _InitialView begin: 0 end: 92 - PennTree: "(ROOT ($, ,) ($. .) (SIMPX--- (SIMPX-KONJ (VF-- (NX-OA (ART den) (NN AWO-Posten))) (MF-- (ADVX-MOD (PTKNEG nicht)))) (SIMPX-KONJ (LK-- (VXFIN-HD (VMFIN mußte))) (MF-- (NX-ON (PPER er)) (ADVX-V-MOD (ADV damals))) (VC-- (VXINF-OV (VVINF niederlegen))) (VF-- (NX-OA (NX-HD (PPOSAT Seine) (NN Position)) (PX-- (APPR bei) (NX-HD (ART der) (NN Prüfgesellschaft))))))))" + PennTree: "(ROOT ($, ,) ($. .) (SIMPX--- (SIMPX-KONJ (VF-- (NX-OA (ART den) (NN AWO-Posten)..." [Seine Position bei der Prüfgesellschaft mußte er damals niederlegen , den AWO-Posten nicht .] ROOT sofa: _InitialView @@ -229,6 +229,7 @@ Token end: 5 PosValue: "PPOSAT" coarseValue: "PRON" + order: 0 [Position] POS_NOUN sofa: _InitialView @@ -288,6 +289,7 @@ Token end: 14 PosValue: "NN" coarseValue: "NOUN" + order: 0 [bei der Prüfgesellschaft] Constituent sofa: _InitialView @@ -389,6 +391,7 @@ Token end: 18 PosValue: "APPR" coarseValue: "ADP" + order: 0 [der Prüfgesellschaft] Constituent sofa: _InitialView @@ -504,6 +507,7 @@ Token end: 22 PosValue: "ART" coarseValue: "DET" + order: 0 [Prüfgesellschaft] POS_NOUN sofa: _InitialView @@ -570,6 +574,7 @@ Token end: 39 PosValue: "NN" coarseValue: "NOUN" + order: 0 [mußte] POS_VERB sofa: _InitialView @@ -622,6 +627,7 @@ Token end: 45 PosValue: "VMFIN" coarseValue: "VERB" + order: 0 [mußte] Constituent sofa: _InitialView @@ -765,6 +771,7 @@ Token end: 48 PosValue: "PPER" coarseValue: "PRON" + order: 0 [er] Constituent sofa: _InitialView @@ -852,6 +859,7 @@ Token end: 55 PosValue: "ADV" coarseValue: "ADV" + order: 0 [damals] Constituent sofa: _InitialView @@ -939,6 +947,7 @@ Token end: 67 PosValue: "VVINF" coarseValue: "VERB" + order: 0 [niederlegen] Constituent sofa: _InitialView @@ -1026,6 +1035,7 @@ Token end: 69 PosValue: "$," coarseValue: "PUNCT" + order: 0 [den AWO-Posten nicht] Constituent sofa: _InitialView @@ -1162,6 +1172,7 @@ Token end: 73 PosValue: "ART" coarseValue: "DET" + order: 0 [AWO-Posten] POS_NOUN sofa: _InitialView @@ -1214,6 +1225,7 @@ Token end: 84 PosValue: "NN" coarseValue: "NOUN" + order: 0 [nicht] POS_X sofa: _InitialView @@ -1266,6 +1278,7 @@ Token end: 90 PosValue: "PTKNEG" coarseValue: "X" + order: 0 [nicht] Constituent sofa: _InitialView @@ -1353,6 +1366,7 @@ Token end: 92 PosValue: "$." coarseValue: "PUNCT" + order: 0 [K. W.] Sentence sofa: _InitialView @@ -1444,6 +1458,7 @@ Token end: 95 PosValue: "NE" coarseValue: "PROPN" + order: 0 [W.] POS_PROPN sofa: _InitialView @@ -1482,6 +1497,7 @@ Token end: 98 PosValue: "NE" coarseValue: "PROPN" + order: 0 -------- View _InitialView end ---------------------------------- ======== CAS 0 end ================================== @@ -1511,7 +1527,7 @@ PennTree sofa: _InitialView begin: 0 end: 36 - PennTree: "(ROOT ($( /) ($( /) (NX--- (NX-HD (NX-KONJ (NE SPD)) (NX-KONJ (NE CDU)) (NX-KONJ (NE AfB))) (PX-- (APPR für) (NX-HD (NN Daewoo-Millionen)))))" + PennTree: "(ROOT ($( /) ($( /) (NX--- (NX-HD (NX-KONJ (NE SPD)) (NX-KONJ (NE CDU)) (NX-KONJ..." [SPD / CDU / AfB für Daewoo-Millionen] Constituent sofa: _InitialView @@ -1599,6 +1615,7 @@ Token end: 3 PosValue: "NE" coarseValue: "PROPN" + order: 0 [SPD] Constituent sofa: _InitialView @@ -1651,6 +1668,7 @@ Token end: 5 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [CDU] POS_PROPN sofa: _InitialView @@ -1696,6 +1714,7 @@ Token end: 9 PosValue: "NE" coarseValue: "PROPN" + order: 0 [CDU] Constituent sofa: _InitialView @@ -1748,6 +1767,7 @@ Token end: 11 PosValue: "$(" coarseValue: "PUNCT" + order: 0 [AfB] POS_PROPN sofa: _InitialView @@ -1793,6 +1813,7 @@ Token end: 15 PosValue: "NE" coarseValue: "PROPN" + order: 0 [AfB] Constituent sofa: _InitialView @@ -1880,6 +1901,7 @@ Token end: 19 PosValue: "APPR" coarseValue: "ADP" + order: 0 [Daewoo-Millionen] POS_NOUN sofa: _InitialView @@ -1925,6 +1947,7 @@ Token end: 36 PosValue: "NN" coarseValue: "NOUN" + order: 0 [Daewoo-Millionen] Constituent sofa: _InitialView @@ -1963,7 +1986,7 @@ PennTree sofa: _InitialView begin: 37 end: 94 - PennTree: "(ROOT (SIMPX--- (KOORD-- (KON Aber)) (LK-- (VXFIN-HD (VVFIN fordert))) (MF-- (ADVX-V-MOD (ADV jetzt)) (NX-OA (NN Untersuchungsausschuß))) (VF-- (NX-ON (NE AfB) (NX-- (NE Bremerhavens))))))" + PennTree: "(ROOT (SIMPX--- (KOORD-- (KON Aber)) (LK-- (VXFIN-HD (VVFIN fordert))) (MF-- (AD..." [Aber Bremerhavens AfB fordert jetzt Untersuchungsausschuß] Constituent sofa: _InitialView @@ -2023,6 +2046,7 @@ Token end: 41 PosValue: "KON" coarseValue: "CONJ" + order: 0 [Aber] Constituent sofa: _InitialView @@ -2145,6 +2169,7 @@ Token end: 54 PosValue: "NE" coarseValue: "PROPN" + order: 0 [Bremerhavens] Constituent sofa: _InitialView @@ -2225,6 +2250,7 @@ Token end: 58 PosValue: "NE" coarseValue: "PROPN" + order: 0 [fordert] POS_VERB sofa: _InitialView @@ -2270,6 +2296,7 @@ Token end: 66 PosValue: "VVFIN" coarseValue: "VERB" + order: 0 [fordert] Constituent sofa: _InitialView @@ -2385,6 +2412,7 @@ Token end: 72 PosValue: "ADV" coarseValue: "ADV" + order: 0 [jetzt] Constituent sofa: _InitialView @@ -2458,6 +2486,7 @@ Token end: 94 PosValue: "NN" coarseValue: "NOUN" + order: 0 [Untersuchungsausschuß] Constituent sofa: _InitialView diff --git a/dkpro-core-io-nif-asl/pom.xml b/dkpro-core-io-nif-asl/pom.xml index 71fa02596b..050474fda3 100644 --- a/dkpro-core-io-nif-asl/pom.xml +++ b/dkpro-core-io-nif-asl/pom.xml @@ -18,24 +18,24 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - org.dkpro.core dkpro-core-io-nif-asl DKPro Core ASL - IO - NIF + https://dkpro.github.io/dkpro-core/ org.apache.jena jena-core - 3.5.0 + ${jena.version} org.apache.jena jena-arq - 3.5.0 + ${jena.version} org.apache.commons @@ -54,32 +54,36 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-ner-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -87,8 +91,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/NifReader.java b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/NifReader.java index 2b2383f55b..0ef5a3af86 100644 --- a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/NifReader.java +++ b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/NifReader.java @@ -17,6 +17,8 @@ */ package org.dkpro.core.io.nif; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + import java.io.IOException; import java.io.InputStream; @@ -36,21 +38,22 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; import org.dkpro.core.io.nif.internal.NIF; import org.dkpro.core.io.nif.internal.Nif2DKPro; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reader for the NLP Interchange Format (NIF). The file format (e.g. TURTLE, etc.) is automatically * chosen depending on the name of the file(s) being read. Compressed files are supported. */ -@ResourceMetaData(name="NLP Interchange Format (NIF) Reader") +@ResourceMetaData(name = "NLP Interchange Format (NIF) Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_NIF_TURTLE}) @TypeCapability( outputs = { @@ -75,11 +78,20 @@ public class NifReader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) private String posTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) private String posMappingLocation; @@ -96,8 +108,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); // Seek first article try { @@ -109,18 +121,17 @@ public void initialize(UimaContext aContext) } @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException + public void getNext(JCas aJCas) throws IOException, CollectionException { - try{ + try { posMappingProvider.configure(aJCas.getCas()); } - catch(AnalysisEngineProcessException e){ + catch (AnalysisEngineProcessException e) { throw new IOException(e); } - + initCas(aJCas, res); - + // FIXME The reader is already designed in such a way that multiple documents per NIF file // are supported. However, presently adding a qualifier to initCas would generate document // URIs in the NIF with two fragments, e.g. "urn:a01-cooked.ttl#0#offset_0_1234". @@ -131,10 +142,10 @@ public void getNext(JCas aJCas) Nif2DKPro converter = new Nif2DKPro(); converter.setPosMappingProvider(posMappingProvider); converter.convert(context, aJCas); - + inFileCount++; step(); - } + } private void closeAll() { @@ -194,5 +205,4 @@ private void step() throws IOException closeAll(); } } - } diff --git a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/NifWriter.java b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/NifWriter.java index c652f7b9f0..0c6efce05f 100644 --- a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/NifWriter.java +++ b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/NifWriter.java @@ -28,20 +28,22 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; import org.dkpro.core.io.nif.internal.DKPro2Nif; import org.dkpro.core.io.nif.internal.ITS; import org.dkpro.core.io.nif.internal.NIF; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writer for the NLP Interchange Format (NIF). * * @see NIF 2.0 Core Ontology */ -@ResourceMetaData(name="NLP Interchange Format (NIF) Writer") +@ResourceMetaData(name = "NLP Interchange Format (NIF) Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_NIF_TURTLE}) @TypeCapability( inputs = { @@ -63,7 +65,8 @@ public class NifWriter * * @see RDFLanguages */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".ttl") private String filenameSuffix; @@ -78,7 +81,8 @@ public void process(JCas aJCas) DKPro2Nif.convert(aJCas, model); try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { - RDFDataMgr.write(docOS, model, RDFLanguages.fileExtToLang(filenameSuffix)); + RDFDataMgr.write(docOS, model.getBaseModel(), + RDFLanguages.fileExtToLang(filenameSuffix)); } catch (Exception e) { throw new AnalysisEngineProcessException(e); diff --git a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/DKPro2Nif.java b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/DKPro2Nif.java index 631cde04b4..49bdd35a19 100644 --- a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/DKPro2Nif.java +++ b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/DKPro2Nif.java @@ -22,11 +22,11 @@ import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import org.apache.commons.lang3.StringUtils; import org.apache.jena.ontology.Individual; import org.apache.jena.ontology.OntModel; import org.apache.jena.rdf.model.Property; import org.apache.jena.rdf.model.Resource; +import org.apache.jena.riot.system.IRIResolver; import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; @@ -50,6 +50,7 @@ public static void convert(JCas aJCas, OntModel aTarget) final Resource tTitle = m.createResource(NIF.TYPE_TITLE); final Resource tParagraph = m.createResource(NIF.TYPE_PARAGRAPH); final Resource tEntityOccurrence = m.createResource(NIF.TYPE_ENTITY_OCCURRENCE); + final Resource tOffsetBasedString = m.createResource(NIF.TYPE_OFFSET_BASED_STRING); final Property pReferenceContext = m.createProperty(NIF.PROP_REFERENCE_CONTEXT); final Property pIsString = m.createProperty(NIF.PROP_IS_STRING); @@ -79,6 +80,7 @@ public static void convert(JCas aJCas, OntModel aTarget) String uri = String.format("%s#offset_%d_%d", docuri, 0, aJCas.getDocumentText().length()); context = m.createIndividual(uri, tContext); + context.addRDFType(tOffsetBasedString); context.addLiteral(pIsString, m.createTypedLiteral(aJCas.getDocumentText(), XSDstring)); context.addLiteral(pBeginIndex, @@ -92,6 +94,7 @@ public static void convert(JCas aJCas, OntModel aTarget) String headingUri = String.format("%s#offset_%d_%d", docuri, uimaHeading.getBegin(), uimaHeading.getEnd()); Individual nifTitle = m.createIndividual(headingUri, tTitle); + nifTitle.addRDFType(tOffsetBasedString); nifTitle.addProperty(pReferenceContext, context); nifTitle.addLiteral(pAnchorOf, uimaHeading.getCoveredText()); nifTitle.addLiteral(pBeginIndex, @@ -105,6 +108,7 @@ public static void convert(JCas aJCas, OntModel aTarget) String paragraphUri = String.format("%s#offset_%d_%d", docuri, uimaParagraph.getBegin(), uimaParagraph.getEnd()); Individual nifParagraph = m.createIndividual(paragraphUri, tParagraph); + nifParagraph.addRDFType(tOffsetBasedString); nifParagraph.addProperty(pReferenceContext, context); nifParagraph.addLiteral(pAnchorOf, uimaParagraph.getCoveredText()); nifParagraph.addLiteral(pBeginIndex, @@ -119,6 +123,7 @@ public static void convert(JCas aJCas, OntModel aTarget) String sentenceUri = String.format("%s#offset_%d_%d", docuri, uimaSentence.getBegin(), uimaSentence.getEnd()); Individual nifSentence = m.createIndividual(sentenceUri, tSentence); + nifSentence.addRDFType(tOffsetBasedString); nifSentence.addProperty(pReferenceContext, context); nifSentence.addLiteral(pAnchorOf, uimaSentence.getCoveredText()); nifSentence.addLiteral(pBeginIndex, @@ -139,6 +144,7 @@ public static void convert(JCas aJCas, OntModel aTarget) String wordUri = String.format("%s#offset_%d_%d", docuri, uimaToken.getBegin(), uimaToken.getEnd()); Individual nifWord = m.createIndividual(wordUri, tWord); + nifWord.addRDFType(tOffsetBasedString); nifWord.addProperty(pReferenceContext, context); nifWord.addLiteral(pAnchorOf, uimaToken.getText()); nifWord.addLiteral(pBeginIndex, @@ -188,13 +194,12 @@ public static void convert(JCas aJCas, OntModel aTarget) // not have the concept of a NE category. for (NamedEntity uimaNamedEntity : select(aJCas, NamedEntity.class)) { String neClass = uimaNamedEntity.getValue(); - String neIdentifier = uimaNamedEntity.getValue(); + String neIdentifier = uimaNamedEntity.getIdentifier(); - boolean neClassIsUri = StringUtils.startsWith(neClass, "http://"); - boolean neIdentifierIsUri = StringUtils.startsWith(neIdentifier, "http://"); + // checkIRI returns true if there are violations, so we need to negate it + boolean neClassIsUri = neClass != null && !IRIResolver.checkIRI(neClass); + boolean neIdentifierIsUri = neIdentifier != null && !IRIResolver.checkIRI(neIdentifier); - // The crudest form of checking for a URI, but since "http://" appears to be the default - // prefix in the semantic web, let's just stick with it for the moment. if (!neClassIsUri && !neIdentifierIsUri) { continue; } @@ -202,6 +207,7 @@ public static void convert(JCas aJCas, OntModel aTarget) String neUri = String.format("%s#offset_%d_%d", docuri, uimaNamedEntity.getBegin(), uimaNamedEntity.getEnd()); Individual nifNamedEntity = m.createIndividual(neUri, tEntityOccurrence); + nifNamedEntity.addRDFType(tOffsetBasedString); nifNamedEntity.addProperty(pReferenceContext, context); nifNamedEntity.addLiteral(pAnchorOf, uimaNamedEntity.getCoveredText()); nifNamedEntity.addLiteral(pBeginIndex, @@ -214,7 +220,7 @@ public static void convert(JCas aJCas, OntModel aTarget) } if (neIdentifierIsUri) { - nifNamedEntity.addProperty(pTaClassRef, m.createResource(neIdentifier)); + nifNamedEntity.addProperty(pTaIdentRef, m.createResource(neIdentifier)); } } } diff --git a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/ITS.java b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/ITS.java index bb25a0518d..693dbdbe1f 100644 --- a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/ITS.java +++ b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/ITS.java @@ -29,5 +29,6 @@ public class ITS public static final String NS_ITS = "http://www.w3.org/2005/11/its/rdf#"; public static final String PROP_TA_IDENT_REF = NS_ITS + "taIdentRef"; + public static final String PROP_TA_CLASS_REF = NS_ITS + "taClassRef"; } diff --git a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/NIF.java b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/NIF.java index a0bccbde7b..f9c760bd8f 100644 --- a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/NIF.java +++ b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/NIF.java @@ -19,8 +19,13 @@ /** * NIF vocabulary. + *

+ * JavaDoc in this class was sourced from the NIF 2.0 and 2.1 Core Ontologies and which are licensed + * under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) and CC-BY + * (http://creativecommons.org/licenses/by/3.0/). * - * @see NIF 2.0 Core Ontology + * @see NIF + * 2.0 Core Ontology */ public class NIF { @@ -28,27 +33,253 @@ public class NIF public static final String NS_NIF = "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"; + /** + * The begin index of a character range as defined in + * RFC 5147 Section 2.2.1 and + * RFC 5147 Section 2.2.2, + * measured as the gap between two characters, starting to count from 0 (the position before the + * first character of a text). + */ public static final String PROP_BEGIN_INDEX = NS_NIF + "beginIndex"; + + /** + * The end index of a character range as defined in + * RFC 5147 Section 2.2.1 and + * RFC 5147 Section 2.2.2, + * measured as the gap between two characters, starting to count from 0 (the position before the + * first character of a text). + */ public static final String PROP_END_INDEX = NS_NIF + "endIndex"; + + /** + * Links a URI of a string to its reference context of type nif:Context. The reference context + * determines the calculation of begin and end index + * + * Each String that is not an instance of nif:Context MUST have exactly one reference context. + */ public static final String PROP_REFERENCE_CONTEXT = NS_NIF + "referenceContext"; + + /** + * The reference text as rdf:Literal for this nif:Context resource. + * + * NIF requires that the reference text (i.e. the context) is always included in the RDF as an + * rdf:Literal. + * + * Note, that the isString property is the place to keep the string itself in RDF. + * + * All other nif:Strings and nif:URISchemes relate to the text of this property to calculate + * character position and indices. + */ public static final String PROP_IS_STRING = NS_NIF + "isString"; + + /** + * The string, which the URI is representing as an RDF Literal. Some use cases require this + * property, as it is necessary for certain sparql queries. + */ public static final String PROP_ANCHOR_OF = NS_NIF + "anchorOf"; + + /** + * This property links sentences to their words. + */ public static final String PROP_WORD = NS_NIF + "word"; + + /** + * See nif:nextSentence + */ public static final String PROP_NEXT_WORD = NS_NIF + "nextWord"; + + /** + * see nif:nextSentence + */ public static final String PROP_PREVIOUS_WORD = NS_NIF + "previousWord"; + + /** + * This property links words and other structures to their sentence. + */ public static final String PROP_SENTENCE = NS_NIF + "sentence"; + + /** + * This property (and nif:previousSentence, nif:nextWord, nif:previousWord and their transitive + * extension) can be used to make resources of nif:Sentence and nif:Word traversable, it can not + * be assumed that no gaps or whitespaces between sentences or words exist, i.e. string + * adjacency is not mandatory. The transitivity axioms are included in nif-core-inf.ttl and need + * to be included separately to keep a low reasoning profile. They are modeled after + * skos:broader and skos:broaderTransitive. + */ public static final String PROP_NEXT_SENTENCE = NS_NIF + "nextSentence"; + + /** + * see nif:nextSentence + */ public static final String PROP_PREVIOUS_SENTENCE = NS_NIF + "previousSentence"; + + /** + * The lemma(s) of the nif:String. + */ public static final String PROP_LEMMA = NS_NIF + "lemma"; + + /** + * The stem(s) of the nif:String. + */ public static final String PROP_STEM = NS_NIF + "stem"; + + /** + * To include the pos tag as it comes out of the NLP tool as RDF Literal. This property is + * discouraged to use alone, please use oliaLink and oliaCategory. We included it, because some + * people might still want it and will even create their own property, if the string variant is + * missing. + * + * @deprecated Use oliaLink and oliaCategory. + */ + @Deprecated public static final String PROP_POS_TAG = NS_NIF + "posTag"; + + /** + * The confidence of an annotation as decimal between 0 and 1. + */ + public static final String PROP_CONFIDENCE = NS_NIF + "confidence"; - public static final String TYPE_ENTITY_OCCURRENCE = NS_NIF + "EntityOccurrence"; + /** + * This property marks the most specific class from itsrdf:taClassRef. The rule is: from the set + * S of itsrdf:taClassRef attached to this resource taMscRef points to the one that does not + * have any subclasses in the set S except itself. So if taClassRef is owl:Thing, dbo:Agent, + * dbo:Person, dbp:Actor taMsClassRef is dbo:Actor + * + * @see NIF + * 2.0 Core Ontology + */ + public static final String PROP_TA_MS_CLASS_REF = NS_NIF + "taMsClassRef"; + + /** + * A title within a text. + * + * @see NIF + * 2.0 Core Ontology + */ public static final String TYPE_TITLE = NS_NIF + "Title"; + + /** + * A paragraph. + * + * @see NIF + * 2.0 Core Ontology + */ public static final String TYPE_PARAGRAPH = NS_NIF + "Paragraph"; + + /** + * The Word class represents strings that are tokens or words. A string is a Word, if it is a + * word. We don't nitpic about whether it is a a pronoun, a name, a punctuation mark or an + * apostrophe or whether it is separated by white space from another Word or something else. The + * string 'He enters the room.' for example has 5 words. Words are assigned by a tokenizer NIF + * Implementation. Single word phrases might be tagged as nif:Word and nif:Phrase. + * + * Example 1: "The White House" are three Words separated by whitespace + * + * Comment 1: We adopted the definition style from foaf:Person, see here: + * http://xmlns.com/foaf/spec/#term_Person We are well aware that the world out there is much + * more complicated, but we are ignorant about it, for the following reasons: + * + * Comment 2: + * + * 1. NIF has a client-server and the client has the ability to dictate the tokenization to the + * server (i.e. the NIF Implementation) by sending properly tokenized NIF annotated with + * nif:Word. All NIF Implementations are supposed to honor and respect the current assignment of + * the Word class. Thus the client should decide which NIF Implementation should create the + * tokenization. Therefore this class is not descriptive, but prescriptive. + * + * 2. The client may choose to send an existing tokenization to a NIF Implementation, with the + * capability to change (for better or for worse) the tokenization. + * + * The class has not been named 'Token' as the NLP definition of 'token' is descriptive (and not + * well-defined), while the assignment of what is a Word and what not is prescriptive, e.g. + * "can't" could be described as one, two or three tokens or defined as being one, two or three + * words. For further reading, we refer the reader to: By all these lovely tokens... Merging + * conflicting tokenizations by Christian Chiarcos, Julia Ritz, and Manfred Stede. Language + * Resources and Evaluation 46(1):53-74 (2012) or the short form: + * http://www.aclweb.org/anthology/W09-3005 + * + * There the task at hand is to merge two tokenization T_1 and T_2 which is normally not the + * case in the NIF world as tokenization is prescribed, i.e. given as a baseline (Note that this + * ideal state might not be achieved by all implementations.) + * + * @see NIF + * 2.0 Core Ontology + */ public static final String TYPE_WORD = NS_NIF + "Word"; + + /** + * A sentence. + * + * @see NIF 2.0 Core Ontology + */ public static final String TYPE_SENTENCE = NS_NIF + "Sentence"; + + /** + * The string that serves as a context for its substrings. The Unicode String given in the + * nif:isString property must be used to calculate the begin and endIndex for all nif:Strings + * that have a nif:referenceContext property to this URI. For further information, see + * http://svn.aksw.org/papers/2013/ISWC_NIF/public.pdf + * + * @see NIF + * 2.0 Core Ontology + */ public static final String TYPE_CONTEXT = NS_NIF + "Context"; + + /** + * Individuals of this class are a string, i.e. Unicode characters, who have been given a URI + * and are used in the subject of an RDF statement. + * + * This class is abstract and should not be serialized. + * + * NIF-Stanbol (nif-stanbol.ttl): + * + * subclassOf nifs:Annotation because it "annotates" strings for example with begin and end + * index. The class is similar to fise:TextAnnotation + * + * @see NIF + * 2.0 Core Ontology + */ public static final String TYPE_STRING = NS_NIF + "String"; + + /** + * A nif:Phrase can be a nif:String, that is a chunk of several words or a word itself (e.g. a + * NounPhrase as a Named Entity). The term is underspecified and can be compatible with many + * defintitions of phrase. Please subClass it to specify the meaning (e.g. for Chunking or + * Phrase Structure Grammar). Example: ((My dog)(also)(likes)(eating (sausage))) + * + * @see NIF + * 2.0 Core Ontology + */ + public static final String TYPE_PHRASE = NS_NIF + "Phrase"; + + /** + * cf. Linked-Data Aware URI Schemes for Referencing Text Fragments by Sebastian Hellmann, Jens + * Lehmann und Sören Auer in EKAW 2012 http://jens-lehmann.org/files/2012/ekaw_nif.pdf + * + * requires the existence of begin, endIndex and referenceContext + * + * @see NIF + * 2.0 Core Ontology + */ public static final String TYPE_OFFSET_BASED_STRING = NS_NIF + "OffsetBasedString"; + + /** + * Text span annotation denoting that a word or phrase has been detected as occurrence of a + * named entity. (Use this without further annotation property assertions if you just want to + * express the detection of the occurrence when neither the mentioned entity nor its category + * was identified.) + * + * @see NIF + * 2.1 Core Ontology + */ + public static final String TYPE_ENTITY_OCCURRENCE = NS_NIF + "EntityOccurrence"; } diff --git a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/Nif2DKPro.java b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/Nif2DKPro.java index 008d76e4d0..cd20ed34e4 100644 --- a/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/Nif2DKPro.java +++ b/dkpro-core-io-nif-asl/src/main/java/org/dkpro/core/io/nif/internal/Nif2DKPro.java @@ -31,11 +31,11 @@ import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; @@ -68,6 +68,7 @@ public void convert(Statement aContext, JCas aJCas) final Property pLemma = m.createProperty(NIF.PROP_LEMMA); final Property pStem = m.createProperty(NIF.PROP_STEM); final Property pPosTag = m.createProperty(NIF.PROP_POS_TAG); + final Property pTaMsClassRef = m.createProperty(NIF.PROP_TA_MS_CLASS_REF); final Property pTaIdentRef = m.createProperty(ITS.PROP_TA_IDENT_REF); final Property pTaClassRef = m.createProperty(ITS.PROP_TA_CLASS_REF); @@ -157,7 +158,7 @@ public void convert(Statement aContext, JCas aJCas) Type posTag = posMappingProvider.getTagType(tag); POS uimaPos = (POS) aJCas.getCas().createAnnotation(posTag, uimaToken.getBegin(), uimaToken.getEnd()); - uimaPos.setPosValue(tag.intern()); + uimaPos.setPosValue(tag != null ? tag.intern() : null); POSUtils.assignCoarseValue(uimaPos); uimaPos.addToIndexes(); uimaToken.setPos(uimaPos); @@ -183,27 +184,42 @@ public void convert(Statement aContext, JCas aJCas) // // [1] http://nif.readthedocs.io/en/2.1-rc/prov-and-conf.html // [2] https://datahub.io/dataset/kore-50-nif-ner-corpus - Set nifNamedEntities1 = m + Set nifNamedEntitiesTaIdentRef = m .listResourcesWithProperty(pTaIdentRef) .filterKeep(res -> res.getProperty( pReferenceContext).getResource().equals(aContext.getSubject())) .toSet(); - Set nifNamedEntities2 = m - .listResourcesWithProperty(pTaIdentRef) + Set nifNamedEntitiesTaClassRef = m + .listResourcesWithProperty(pTaClassRef) + .filterKeep(res -> res.getProperty( + pReferenceContext).getResource().equals(aContext.getSubject())) + .toSet(); + Set nifNamedEntitiesTaMsClassRef = m + .listResourcesWithProperty(pTaMsClassRef) .filterKeep(res -> res.getProperty( pReferenceContext).getResource().equals(aContext.getSubject())) .toSet(); Set nifNamedEntities = new HashSet(); - nifNamedEntities.addAll(nifNamedEntities1); - nifNamedEntities.addAll(nifNamedEntities2); + nifNamedEntities.addAll(nifNamedEntitiesTaIdentRef); + nifNamedEntities.addAll(nifNamedEntitiesTaClassRef); + nifNamedEntities.addAll(nifNamedEntitiesTaMsClassRef); for (Resource nifNamedEntity : nifNamedEntities) { int begin = nifNamedEntity.getProperty(pBeginIndex).getInt(); int end = nifNamedEntity.getProperty(pEndIndex).getInt(); NamedEntity uimaNamedEntity = new NamedEntity(aJCas, begin, end); - if (nifNamedEntity.hasProperty(pTaClassRef)) { + + // If there is a most-specific class, then we use that + if (nifNamedEntity.hasProperty(pTaMsClassRef)) { + uimaNamedEntity + .setValue(nifNamedEntity.getProperty(pTaMsClassRef).getResource().getURI()); + } + // ... else, we use some class + else if (nifNamedEntity.hasProperty(pTaClassRef)) { uimaNamedEntity .setValue(nifNamedEntity.getProperty(pTaClassRef).getResource().getURI()); } + + // If the entity is linked, then we keep the identifier if (nifNamedEntity.hasProperty(pTaIdentRef)) { uimaNamedEntity.setIdentifier( nifNamedEntity.getProperty(pTaIdentRef).getResource().getURI()); @@ -229,5 +245,5 @@ private static boolean assertSanity(Resource aNif, Annotation aUima) assert aUima.getEnd() >= 0 && aUima.getEnd() <= docLength; return true; - } + } } diff --git a/dkpro-core-io-nif-asl/src/test/java/org/dkpro/core/io/nif/NifReaderWriterTest.java b/dkpro-core-io-nif-asl/src/test/java/org/dkpro/core/io/nif/NifReaderWriterTest.java index 7d66fa5d22..443df9d7f4 100644 --- a/dkpro-core-io-nif-asl/src/test/java/org/dkpro/core/io/nif/NifReaderWriterTest.java +++ b/dkpro-core-io-nif-asl/src/test/java/org/dkpro/core/io/nif/NifReaderWriterTest.java @@ -17,7 +17,7 @@ */ package org.dkpro.core.io.nif; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; import static org.junit.Assert.assertEquals; import java.io.File; @@ -32,10 +32,9 @@ import org.apache.jena.ontology.OntModel; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.ModelFactory; +import org.dkpro.core.testing.TestOptions; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.TestOptions; - public class NifReaderWriterTest { // This is not a test method - just a development utility to convert the Python-like TTL @@ -46,6 +45,7 @@ public void convert() { convert("src/test/resources/nif/brown/a01.ttl", "src/test/resources/nif/brown/a01-cooked.ttl"); convert("src/test/resources/nif/kore50/kore50.ttl", "src/test/resources/nif/kore50/kore50-cooked.ttl"); + convert("src/test/resources/nif/freme/freme.ttl", "src/test/resources/nif/freme/freme-cooked.ttl"); } @Test @@ -58,7 +58,8 @@ public void testBrown() testOneWay( NifReader.class, // the reader NifWriter.class, // the writer - "nif/brown/ref.ttl", "nif/brown/a01-cooked.ttl", + "nif/brown/a01-cooked-ref.ttl", + "nif/brown/a01-cooked.ttl", new TestOptions().resultAssertor(this::assertModelEquals)); } @@ -72,10 +73,35 @@ public void testKore50() testOneWay( NifReader.class, // the reader NifWriter.class, // the writer - "nif/kore50/ref.ttl", "nif/kore50/kore50-cooked.ttl", + "nif/kore50/kore50-cooked-ref.ttl", + "nif/kore50/kore50-cooked.ttl", new TestOptions().resultAssertor(this::assertModelEquals)); } + @Test + public void testFreme() + throws Exception + { + testOneWay( + NifReader.class, // the reader + NifWriter.class, // the writer + "nif/freme/freme-cooked-ref.ttl", + "nif/freme/freme-cooked.ttl", + new TestOptions().resultAssertor(this::assertModelEquals)); + } + + @Test + public void testPyNif() + throws Exception + { + testOneWay( + NifReader.class, // the reader + NifWriter.class, // the writer + "nif/pynif/pynif-example-ref.ttl", + "nif/pynif/pynif-example.ttl", + new TestOptions().resultAssertor(this::assertModelEquals)); + } + private void assertModelEquals(File expected, File actual) { Model mExpected = ModelFactory.createDefaultModel(); diff --git a/dkpro-core-io-nif-asl/src/test/resources/log4j.properties b/dkpro-core-io-nif-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-io-nif-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-io-nif-asl/src/test/resources/log4j2.xml b/dkpro-core-io-nif-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/brown/a01-cooked-ref.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/brown/a01-cooked-ref.ttl new file mode 100644 index 0000000000..3cfca2b6c8 --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/nif/brown/a01-cooked-ref.ttl @@ -0,0 +1,23207 @@ +@prefix rdf: . +@prefix owl: . +@prefix xsd: . +@prefix itsrdf: . +@prefix nif: . +@prefix rdfs: . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "6859"^^xsd:nonNegativeInteger ; + nif:endIndex "6862"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "both" ; + nif:beginIndex "1282"^^xsd:nonNegativeInteger ; + nif:endIndex "1286"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Allen" ; + nif:beginIndex "5289"^^xsd:nonNegativeInteger ; + nif:endIndex "5294"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "public" ; + nif:beginIndex "6807"^^xsd:nonNegativeInteger ; + nif:endIndex "6813"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "5416"^^xsd:nonNegativeInteger ; + nif:endIndex "5417"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "distribution" ; + nif:beginIndex "2404"^^xsd:nonNegativeInteger ; + nif:endIndex "2416"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "It listed his wife's age as 74 and place of birth as Opelika, Ala.." ; + nif:beginIndex "4769"^^xsd:nonNegativeInteger ; + nif:endIndex "4836"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "primary" ; + nif:beginIndex "78"^^xsd:nonNegativeInteger ; + nif:endIndex "85"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "7489"^^xsd:nonNegativeInteger ; + nif:endIndex "7491"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6733"^^xsd:nonNegativeInteger ; + nif:endIndex "6736"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "8394"^^xsd:nonNegativeInteger ; + nif:endIndex "8399"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "marked" ; + nif:beginIndex "11460"^^xsd:nonNegativeInteger ; + nif:endIndex "11466"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "cost" ; + nif:beginIndex "1437"^^xsd:nonNegativeInteger ; + nif:endIndex "1441"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Felix" ; + nif:beginIndex "12097"^^xsd:nonNegativeInteger ; + nif:endIndex "12102"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The jurors said they realize ``a proportionate distribution of these funds might disable this program in our less populous counties''." ; + nif:beginIndex "2357"^^xsd:nonNegativeInteger ; + nif:endIndex "2491"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "himself" ; + nif:beginIndex "12023"^^xsd:nonNegativeInteger ; + nif:endIndex "12030"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "8160"^^xsd:nonNegativeInteger ; + nif:endIndex "8162"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "received" ; + nif:beginIndex "10069"^^xsd:nonNegativeInteger ; + nif:endIndex "10077"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "problem" ; + nif:beginIndex "1652"^^xsd:nonNegativeInteger ; + nif:endIndex "1659"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "outgoing" ; + nif:beginIndex "1738"^^xsd:nonNegativeInteger ; + nif:endIndex "1746"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2605"^^xsd:nonNegativeInteger ; + nif:endIndex "2608"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "10238"^^xsd:nonNegativeInteger ; + nif:endIndex "10243"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bit" ; + nif:beginIndex "12228"^^xsd:nonNegativeInteger ; + nif:endIndex "12231"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "8547"^^xsd:nonNegativeInteger ; + nif:endIndex "8548"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "controversy" ; + nif:beginIndex "11470"^^xsd:nonNegativeInteger ; + nif:endIndex "11481"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "4"^^xsd:nonNegativeInteger ; + nif:endIndex "10"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "6113"^^xsd:nonNegativeInteger ; + nif:endIndex "6114"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "worth" ; + nif:beginIndex "7859"^^xsd:nonNegativeInteger ; + nif:endIndex "7864"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "303"^^xsd:nonNegativeInteger ; + nif:endIndex "306"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "interest" ; + nif:beginIndex "714"^^xsd:nonNegativeInteger ; + nif:endIndex "722"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "7389"^^xsd:nonNegativeInteger ; + nif:endIndex "7391"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "10282"^^xsd:nonNegativeInteger ; + nif:endIndex "10284"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf ")" ; + nif:beginIndex "3936"^^xsd:nonNegativeInteger ; + nif:endIndex "3937"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4104"^^xsd:nonNegativeInteger ; + nif:endIndex "4105"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "6466"^^xsd:nonNegativeInteger ; + nif:endIndex "6468"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "4482"^^xsd:nonNegativeInteger ; + nif:endIndex "4485"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "4819"^^xsd:nonNegativeInteger ; + nif:endIndex "4821"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "swipe" ; + nif:beginIndex "1929"^^xsd:nonNegativeInteger ; + nif:endIndex "1934"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "protect" ; + nif:beginIndex "3165"^^xsd:nonNegativeInteger ; + nif:endIndex "3172"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "1298"^^xsd:nonNegativeInteger ; + nif:endIndex "1300"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "1927"^^xsd:nonNegativeInteger ; + nif:endIndex "1928"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Republicans" ; + nif:beginIndex "6165"^^xsd:nonNegativeInteger ; + nif:endIndex "6176"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "3068"^^xsd:nonNegativeInteger ; + nif:endIndex "3069"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Atlanta" ; + nif:beginIndex "5077"^^xsd:nonNegativeInteger ; + nif:endIndex "5084"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "11692"^^xsd:nonNegativeInteger ; + nif:endIndex "11695"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "research" ; + nif:beginIndex "9055"^^xsd:nonNegativeInteger ; + nif:endIndex "9063"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "back" ; + nif:beginIndex "9824"^^xsd:nonNegativeInteger ; + nif:endIndex "9828"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "receives" ; + nif:beginIndex "2328"^^xsd:nonNegativeInteger ; + nif:endIndex "2336"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "6139"^^xsd:nonNegativeInteger ; + nif:endIndex "6140"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "6573"^^xsd:nonNegativeInteger ; + nif:endIndex "6575"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "inadequate" ; + nif:beginIndex "888"^^xsd:nonNegativeInteger ; + nif:endIndex "898"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10774"^^xsd:nonNegativeInteger ; + nif:endIndex "10775"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7370"^^xsd:nonNegativeInteger ; + nif:endIndex "7372"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "voters" ; + nif:beginIndex "754"^^xsd:nonNegativeInteger ; + nif:endIndex "760"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "assistance" ; + nif:beginIndex "2109"^^xsd:nonNegativeInteger ; + nif:endIndex "2119"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "7757"^^xsd:nonNegativeInteger ; + nif:endIndex "7758"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "into" ; + nif:beginIndex "10418"^^xsd:nonNegativeInteger ; + nif:endIndex "10422"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "days" ; + nif:beginIndex "11642"^^xsd:nonNegativeInteger ; + nif:endIndex "11646"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "grand" ; + nif:beginIndex "1058"^^xsd:nonNegativeInteger ; + nif:endIndex "1063"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "goes" ; + nif:beginIndex "5158"^^xsd:nonNegativeInteger ; + nif:endIndex "5162"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "must" ; + nif:beginIndex "6461"^^xsd:nonNegativeInteger ; + nif:endIndex "6465"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Davis received 1,119 votes in Saturday's election, and Bush got 402." ; + nif:beginIndex "11018"^^xsd:nonNegativeInteger ; + nif:endIndex "11086"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1938"^^xsd:nonNegativeInteger ; + nif:endIndex "1941"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "9075"^^xsd:nonNegativeInteger ; + nif:endIndex "9077"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "one" ; + nif:beginIndex "5104"^^xsd:nonNegativeInteger ; + nif:endIndex "5107"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "recommended" ; + nif:beginIndex "925"^^xsd:nonNegativeInteger ; + nif:endIndex "936"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "each" ; + nif:beginIndex "6509"^^xsd:nonNegativeInteger ; + nif:endIndex "6513"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "1054"^^xsd:nonNegativeInteger ; + nif:endIndex "1057"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "5084"^^xsd:nonNegativeInteger ; + nif:endIndex "5085"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "night" ; + nif:beginIndex "9039"^^xsd:nonNegativeInteger ; + nif:endIndex "9044"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7921"^^xsd:nonNegativeInteger ; + nif:endIndex "7923"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "L." ; + nif:beginIndex "4986"^^xsd:nonNegativeInteger ; + nif:endIndex "4988"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "4285"^^xsd:nonNegativeInteger ; + nif:endIndex "4288"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Rural" ; + nif:beginIndex "8349"^^xsd:nonNegativeInteger ; + nif:endIndex "8354"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "clerical" ; + nif:beginIndex "1538"^^xsd:nonNegativeInteger ; + nif:endIndex "1546"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "mayor's" ; + nif:beginIndex "5213"^^xsd:nonNegativeInteger ; + nif:endIndex "5220"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Robert" ; + nif:beginIndex "5552"^^xsd:nonNegativeInteger ; + nif:endIndex "5558"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "brought" ; + nif:beginIndex "5638"^^xsd:nonNegativeInteger ; + nif:endIndex "5645"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "5783"^^xsd:nonNegativeInteger ; + nif:endIndex "5785"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "1091"^^xsd:nonNegativeInteger ; + nif:endIndex "1093"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "State" ; + nif:beginIndex "2154"^^xsd:nonNegativeInteger ; + nif:endIndex "2159"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "10299"^^xsd:nonNegativeInteger ; + nif:endIndex "10303"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "cent" ; + nif:beginIndex "6487"^^xsd:nonNegativeInteger ; + nif:endIndex "6491"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "administration" ; + nif:beginIndex "4127"^^xsd:nonNegativeInteger ; + nif:endIndex "4141"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "got" ; + nif:beginIndex "11078"^^xsd:nonNegativeInteger ; + nif:endIndex "11081"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Grady" ; + nif:beginIndex "4270"^^xsd:nonNegativeInteger ; + nif:endIndex "4275"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "public" ; + nif:beginIndex "10023"^^xsd:nonNegativeInteger ; + nif:endIndex "10029"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "title" ; + nif:beginIndex "1700"^^xsd:nonNegativeInteger ; + nif:endIndex "1705"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Savannah" ; + nif:beginIndex "5802"^^xsd:nonNegativeInteger ; + nif:endIndex "5810"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Opelika" ; + nif:beginIndex "4822"^^xsd:nonNegativeInteger ; + nif:endIndex "4829"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "I" ; + nif:beginIndex "11213"^^xsd:nonNegativeInteger ; + nif:endIndex "11214"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "allow" ; + nif:beginIndex "9678"^^xsd:nonNegativeInteger ; + nif:endIndex "9683"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "priority" ; + nif:beginIndex "7714"^^xsd:nonNegativeInteger ; + nif:endIndex "7722"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "5780"^^xsd:nonNegativeInteger ; + nif:endIndex "5782"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Implementation of Georgia's automobile title law was also recommended by the outgoing jury." ; + nif:beginIndex "1661"^^xsd:nonNegativeInteger ; + nif:endIndex "1752"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11315"^^xsd:nonNegativeInteger ; + nif:endIndex "11316"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "term" ; + nif:beginIndex "6251"^^xsd:nonNegativeInteger ; + nif:endIndex "6255"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "won" ; + nif:beginIndex "574"^^xsd:nonNegativeInteger ; + nif:endIndex "577"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Barber" ; + nif:beginIndex "10109"^^xsd:nonNegativeInteger ; + nif:endIndex "10115"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3070"^^xsd:nonNegativeInteger ; + nif:endIndex "3073"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "similar" ; + nif:beginIndex "9430"^^xsd:nonNegativeInteger ; + nif:endIndex "9437"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "one" ; + nif:beginIndex "6430"^^xsd:nonNegativeInteger ; + nif:endIndex "6433"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "into" ; + nif:beginIndex "2988"^^xsd:nonNegativeInteger ; + nif:endIndex "2992"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Vandiver" ; + nif:beginIndex "7604"^^xsd:nonNegativeInteger ; + nif:endIndex "7612"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "federal" ; + nif:beginIndex "1981"^^xsd:nonNegativeInteger ; + nif:endIndex "1988"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "amicable" ; + nif:beginIndex "4640"^^xsd:nonNegativeInteger ; + nif:endIndex "4648"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "4559"^^xsd:nonNegativeInteger ; + nif:endIndex "4560"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "rescind" ; + nif:beginIndex "9566"^^xsd:nonNegativeInteger ; + nif:endIndex "9573"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "however" ; + nif:beginIndex "8740"^^xsd:nonNegativeInteger ; + nif:endIndex "8747"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "2342"^^xsd:nonNegativeInteger ; + nif:endIndex "2344"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "since" ; + nif:beginIndex "5125"^^xsd:nonNegativeInteger ; + nif:endIndex "5130"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "prison" ; + nif:beginIndex "4256"^^xsd:nonNegativeInteger ; + nif:endIndex "4262"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "permit" ; + nif:beginIndex "4023"^^xsd:nonNegativeInteger ; + nif:endIndex "4029"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "5123"^^xsd:nonNegativeInteger ; + nif:endIndex "5124"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Friday" ; + nif:beginIndex "10440"^^xsd:nonNegativeInteger ; + nif:endIndex "10446"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ivan" ; + nif:beginIndex "596"^^xsd:nonNegativeInteger ; + nif:endIndex "600"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "repair" ; + nif:beginIndex "8051"^^xsd:nonNegativeInteger ; + nif:endIndex "8057"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Legislature" ; + nif:beginIndex "1776"^^xsd:nonNegativeInteger ; + nif:endIndex "1787"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "His" ; + nif:beginIndex "5137"^^xsd:nonNegativeInteger ; + nif:endIndex "5140"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Bush" ; + nif:beginIndex "11073"^^xsd:nonNegativeInteger ; + nif:endIndex "11077"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "3743"^^xsd:nonNegativeInteger ; + nif:endIndex "3744"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Roads" ; + nif:beginIndex "8230"^^xsd:nonNegativeInteger ; + nif:endIndex "8235"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "10711"^^xsd:nonNegativeInteger ; + nif:endIndex "10714"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "trouble" ; + nif:beginIndex "11378"^^xsd:nonNegativeInteger ; + nif:endIndex "11385"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "8149"^^xsd:nonNegativeInteger ; + nif:endIndex "8154"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4226"^^xsd:nonNegativeInteger ; + nif:endIndex "4227"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3018"^^xsd:nonNegativeInteger ; + nif:endIndex "3021"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "9342"^^xsd:nonNegativeInteger ; + nif:endIndex "9347"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "$4" ; + nif:beginIndex "8204"^^xsd:nonNegativeInteger ; + nif:endIndex "8206"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "rural" ; + nif:beginIndex "7430"^^xsd:nonNegativeInteger ; + nif:endIndex "7435"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "rural" ; + nif:beginIndex "8256"^^xsd:nonNegativeInteger ; + nif:endIndex "8261"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1009"^^xsd:nonNegativeInteger ; + nif:endIndex "1012"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "279"^^xsd:nonNegativeInteger ; + nif:endIndex "280"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "begin" ; + nif:beginIndex "8023"^^xsd:nonNegativeInteger ; + nif:endIndex "8028"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "5326"^^xsd:nonNegativeInteger ; + nif:endIndex "5329"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "2846"^^xsd:nonNegativeInteger ; + nif:endIndex "2847"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The jury did not elaborate, but it added that ``there should be periodic surveillance of the pricing practices of the concessionaires for the purpose of keeping the prices reasonable''." ; + nif:beginIndex "3523"^^xsd:nonNegativeInteger ; + nif:endIndex "3708"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "816"^^xsd:nonNegativeInteger ; + nif:endIndex "820"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "urban" ; + nif:beginIndex "7440"^^xsd:nonNegativeInteger ; + nif:endIndex "7445"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "3657"^^xsd:nonNegativeInteger ; + nif:endIndex "3660"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "audience" ; + nif:beginIndex "5917"^^xsd:nonNegativeInteger ; + nif:endIndex "5925"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "10090"^^xsd:nonNegativeInteger ; + nif:endIndex "10092"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7974"^^xsd:nonNegativeInteger ; + nif:endIndex "7977"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8929"^^xsd:nonNegativeInteger ; + nif:endIndex "8932"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "This" ; + nif:beginIndex "2049"^^xsd:nonNegativeInteger ; + nif:endIndex "2053"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "J." ; + nif:beginIndex "4576"^^xsd:nonNegativeInteger ; + nif:endIndex "4578"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "an" ; + nif:beginIndex "7018"^^xsd:nonNegativeInteger ; + nif:endIndex "7020"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "went" ; + nif:beginIndex "12173"^^xsd:nonNegativeInteger ; + nif:endIndex "12177"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2228"^^xsd:nonNegativeInteger ; + nif:endIndex "2231"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "10776"^^xsd:nonNegativeInteger ; + nif:endIndex "10778"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "9473"^^xsd:nonNegativeInteger ; + nif:endIndex "9474"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Blue" ; + nif:beginIndex "5893"^^xsd:nonNegativeInteger ; + nif:endIndex "5897"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "1877"^^xsd:nonNegativeInteger ; + nif:endIndex "1879"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Griffin" ; + nif:beginIndex "4594"^^xsd:nonNegativeInteger ; + nif:endIndex "4601"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "City" ; + nif:beginIndex "212"^^xsd:nonNegativeInteger ; + nif:endIndex "216"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "candidates" ; + nif:beginIndex "6602"^^xsd:nonNegativeInteger ; + nif:endIndex "6612"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "12158"^^xsd:nonNegativeInteger ; + nif:endIndex "12159"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "11992"^^xsd:nonNegativeInteger ; + nif:endIndex "11994"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "after" ; + nif:beginIndex "5347"^^xsd:nonNegativeInteger ; + nif:endIndex "5352"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Vandiver opened his race for governor in 1958 with a battle in the Legislature against the issuance of $50 million worth of additional rural roads bonds proposed by then Gov. Marvin Griffin." ; + nif:beginIndex "8496"^^xsd:nonNegativeInteger ; + nif:endIndex "8686"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "3519"^^xsd:nonNegativeInteger ; + nif:endIndex "3521"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Mrs." ; + nif:beginIndex "4571"^^xsd:nonNegativeInteger ; + nif:endIndex "4575"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "5976"^^xsd:nonNegativeInteger ; + nif:endIndex "5981"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "employed" ; + nif:beginIndex "3883"^^xsd:nonNegativeInteger ; + nif:endIndex "3891"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "adjournment" ; + nif:beginIndex "7591"^^xsd:nonNegativeInteger ; + nif:endIndex "7602"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "4750"^^xsd:nonNegativeInteger ; + nif:endIndex "4753"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "George" ; + nif:beginIndex "11587"^^xsd:nonNegativeInteger ; + nif:endIndex "11593"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "8018"^^xsd:nonNegativeInteger ; + nif:endIndex "8022"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2524"^^xsd:nonNegativeInteger ; + nif:endIndex "2527"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "voted" ; + nif:beginIndex "9407"^^xsd:nonNegativeInteger ; + nif:endIndex "9412"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "department" ; + nif:beginIndex "8307"^^xsd:nonNegativeInteger ; + nif:endIndex "8317"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "4669"^^xsd:nonNegativeInteger ; + nif:endIndex "4672"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Tower" ; + nif:beginIndex "5850"^^xsd:nonNegativeInteger ; + nif:endIndex "5855"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Felix" ; + nif:beginIndex "10921"^^xsd:nonNegativeInteger ; + nif:endIndex "10926"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "largest" ; + nif:beginIndex "6321"^^xsd:nonNegativeInteger ; + nif:endIndex "6328"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "previous" ; + nif:beginIndex "3047"^^xsd:nonNegativeInteger ; + nif:endIndex "3055"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "6599"^^xsd:nonNegativeInteger ; + nif:endIndex "6601"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "1762"^^xsd:nonNegativeInteger ; + nif:endIndex "1766"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "phone" ; + nif:beginIndex "11502"^^xsd:nonNegativeInteger ; + nif:endIndex "11507"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7774"^^xsd:nonNegativeInteger ; + nif:endIndex "7777"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "medical" ; + nif:beginIndex "3855"^^xsd:nonNegativeInteger ; + nif:endIndex "3862"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "10220"^^xsd:nonNegativeInteger ; + nif:endIndex "10222"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "10493"^^xsd:nonNegativeInteger ; + nif:endIndex "10495"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "became" ; + nif:beginIndex "5304"^^xsd:nonNegativeInteger ; + nif:endIndex "5310"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "prices" ; + nif:beginIndex "3688"^^xsd:nonNegativeInteger ; + nif:endIndex "3694"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "he" ; + nif:beginIndex "11850"^^xsd:nonNegativeInteger ; + nif:endIndex "11852"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "6492"^^xsd:nonNegativeInteger ; + nif:endIndex "6494"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "taken" ; + nif:beginIndex "6469"^^xsd:nonNegativeInteger ; + nif:endIndex "6474"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "highway" ; + nif:beginIndex "7650"^^xsd:nonNegativeInteger ; + nif:endIndex "7657"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "5026"^^xsd:nonNegativeInteger ; + nif:endIndex "5029"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Republicans" ; + nif:beginIndex "5426"^^xsd:nonNegativeInteger ; + nif:endIndex "5437"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Court" ; + nif:beginIndex "4431"^^xsd:nonNegativeInteger ; + nif:endIndex "4436"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "443"^^xsd:nonNegativeInteger ; + nif:endIndex "449"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "city" ; + nif:beginIndex "5187"^^xsd:nonNegativeInteger ; + nif:endIndex "5191"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1734"^^xsd:nonNegativeInteger ; + nif:endIndex "1737"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "some" ; + nif:beginIndex "8185"^^xsd:nonNegativeInteger ; + nif:endIndex "8189"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "up" ; + nif:beginIndex "7192"^^xsd:nonNegativeInteger ; + nif:endIndex "7194"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "dissents" ; + nif:beginIndex "6307"^^xsd:nonNegativeInteger ; + nif:endIndex "6315"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "3827"^^xsd:nonNegativeInteger ; + nif:endIndex "3833"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "like" ; + nif:beginIndex "11295"^^xsd:nonNegativeInteger ; + nif:endIndex "11299"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Only" ; + nif:beginIndex "614"^^xsd:nonNegativeInteger ; + nif:endIndex "618"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "5762"^^xsd:nonNegativeInteger ; + nif:endIndex "5765"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Sam Caldwell, State Highway Department public relations director, resigned Tuesday to work for Lt. Gov. Garland Byrd's campaign." ; + nif:beginIndex "6768"^^xsd:nonNegativeInteger ; + nif:endIndex "6896"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "consulted" ; + nif:beginIndex "8776"^^xsd:nonNegativeInteger ; + nif:endIndex "8785"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "11514"^^xsd:nonNegativeInteger ; + nif:endIndex "11517"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "During" ; + nif:beginIndex "11717"^^xsd:nonNegativeInteger ; + nif:endIndex "11723"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "It urged that the next Legislature ``provide enabling funds and re-set the effective date so that an orderly implementation of the law may be effected''." ; + nif:beginIndex "1753"^^xsd:nonNegativeInteger ; + nif:endIndex "1906"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Williams" ; + nif:beginIndex "11836"^^xsd:nonNegativeInteger ; + nif:endIndex "11844"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "services" ; + nif:beginIndex "2021"^^xsd:nonNegativeInteger ; + nif:endIndex "2029"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "10453"^^xsd:nonNegativeInteger ; + nif:endIndex "10455"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "yet" ; + nif:beginIndex "8786"^^xsd:nonNegativeInteger ; + nif:endIndex "8789"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "itself" ; + nif:beginIndex "8967"^^xsd:nonNegativeInteger ; + nif:endIndex "8973"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "long" ; + nif:beginIndex "10674"^^xsd:nonNegativeInteger ; + nif:endIndex "10678"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The jury praised the administration and operation of the Atlanta Police Department, the Fulton Tax Commissioner's Office, the Bellwood and Alpharetta prison farms, Grady Hospital and the Fulton Health Department." ; + nif:beginIndex "4106"^^xsd:nonNegativeInteger ; + nif:endIndex "4318"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "These" ; + nif:beginIndex "3135"^^xsd:nonNegativeInteger ; + nif:endIndex "3140"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "foster" ; + nif:beginIndex "2033"^^xsd:nonNegativeInteger ; + nif:endIndex "2039"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "1178"^^xsd:nonNegativeInteger ; + nif:endIndex "1180"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "lived" ; + nif:beginIndex "4879"^^xsd:nonNegativeInteger ; + nif:endIndex "4884"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "race" ; + nif:beginIndex "5515"^^xsd:nonNegativeInteger ; + nif:endIndex "5519"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "enter" ; + nif:beginIndex "6133"^^xsd:nonNegativeInteger ; + nif:endIndex "6138"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Rd." ; + nif:beginIndex "4972"^^xsd:nonNegativeInteger ; + nif:endIndex "4975"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "5590"^^xsd:nonNegativeInteger ; + nif:endIndex "5594"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "registered" ; + nif:beginIndex "6063"^^xsd:nonNegativeInteger ; + nif:endIndex "6073"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "subjected" ; + nif:beginIndex "11863"^^xsd:nonNegativeInteger ; + nif:endIndex "11872"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8796"^^xsd:nonNegativeInteger ; + nif:endIndex "8799"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "5059"^^xsd:nonNegativeInteger ; + nif:endIndex "5062"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "4412"^^xsd:nonNegativeInteger ; + nif:endIndex "4414"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1767"^^xsd:nonNegativeInteger ; + nif:endIndex "1770"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "He" ; + nif:beginIndex "6953"^^xsd:nonNegativeInteger ; + nif:endIndex "6955"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Pelham" ; + nif:beginIndex "8867"^^xsd:nonNegativeInteger ; + nif:endIndex "8873"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "11306"^^xsd:nonNegativeInteger ; + nif:endIndex "11308"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "provided" ; + nif:beginIndex "10041"^^xsd:nonNegativeInteger ; + nif:endIndex "10049"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "9515"^^xsd:nonNegativeInteger ; + nif:endIndex "9518"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "new" ; + nif:beginIndex "8400"^^xsd:nonNegativeInteger ; + nif:endIndex "8403"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Attorneys for the mayor said that an amicable property settlement has been agreed upon." ; + nif:beginIndex "4603"^^xsd:nonNegativeInteger ; + nif:endIndex "4690"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "10448"^^xsd:nonNegativeInteger ; + nif:endIndex "10452"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6045"^^xsd:nonNegativeInteger ; + nif:endIndex "6048"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "fire" ; + nif:beginIndex "2791"^^xsd:nonNegativeInteger ; + nif:endIndex "2795"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "2357"^^xsd:nonNegativeInteger ; + nif:endIndex "2360"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "city" ; + nif:beginIndex "1617"^^xsd:nonNegativeInteger ; + nif:endIndex "1621"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3443"^^xsd:nonNegativeInteger ; + nif:endIndex "3446"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "4737"^^xsd:nonNegativeInteger ; + nif:endIndex "4739"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "6224"^^xsd:nonNegativeInteger ; + nif:endIndex "6226"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Bush" ; + nif:beginIndex "11774"^^xsd:nonNegativeInteger ; + nif:endIndex "11778"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "costs" ; + nif:beginIndex "3224"^^xsd:nonNegativeInteger ; + nif:endIndex "3229"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced ``no evidence'' that any irregularities took place." ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "155"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "both" ; + nif:beginIndex "7557"^^xsd:nonNegativeInteger ; + nif:endIndex "7561"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "generally" ; + nif:beginIndex "1217"^^xsd:nonNegativeInteger ; + nif:endIndex "1226"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4159"^^xsd:nonNegativeInteger ; + nif:endIndex "4162"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ever" ; + nif:beginIndex "10804"^^xsd:nonNegativeInteger ; + nif:endIndex "10808"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Gov." ; + nif:beginIndex "6867"^^xsd:nonNegativeInteger ; + nif:endIndex "6871"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "4901"^^xsd:nonNegativeInteger ; + nif:endIndex "4904"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "school" ; + nif:beginIndex "10839"^^xsd:nonNegativeInteger ; + nif:endIndex "10845"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "6718"^^xsd:nonNegativeInteger ; + nif:endIndex "6719"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department" ; + nif:beginIndex "7792"^^xsd:nonNegativeInteger ; + nif:endIndex "7802"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "4734"^^xsd:nonNegativeInteger ; + nif:endIndex "4736"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf ")" ; + nif:beginIndex "10274"^^xsd:nonNegativeInteger ; + nif:endIndex "10275"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "sessions" ; + nif:beginIndex "9741"^^xsd:nonNegativeInteger ; + nif:endIndex "9749"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Monday" ; + nif:beginIndex "7212"^^xsd:nonNegativeInteger ; + nif:endIndex "7218"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "allowed" ; + nif:beginIndex "6576"^^xsd:nonNegativeInteger ; + nif:endIndex "6583"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "had" ; + nif:beginIndex "6920"^^xsd:nonNegativeInteger ; + nif:endIndex "6923"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "an" ; + nif:beginIndex "4637"^^xsd:nonNegativeInteger ; + nif:endIndex "4639"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "approve" ; + nif:beginIndex "7373"^^xsd:nonNegativeInteger ; + nif:endIndex "7380"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bonds" ; + nif:beginIndex "7891"^^xsd:nonNegativeInteger ; + nif:endIndex "7896"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "8722"^^xsd:nonNegativeInteger ; + nif:endIndex "8725"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "7308"^^xsd:nonNegativeInteger ; + nif:endIndex "7309"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "follow" ; + nif:beginIndex "7294"^^xsd:nonNegativeInteger ; + nif:endIndex "7300"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1961" ; + nif:beginIndex "7199"^^xsd:nonNegativeInteger ; + nif:endIndex "7203"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "788"^^xsd:nonNegativeInteger ; + nif:endIndex "789"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "being" ; + nif:beginIndex "9064"^^xsd:nonNegativeInteger ; + nif:endIndex "9069"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "effect" ; + nif:beginIndex "3188"^^xsd:nonNegativeInteger ; + nif:endIndex "3194"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9944"^^xsd:nonNegativeInteger ; + nif:endIndex "9947"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Colquitt" ; + nif:beginIndex "11227"^^xsd:nonNegativeInteger ; + nif:endIndex "11235"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Monday" ; + nif:beginIndex "10470"^^xsd:nonNegativeInteger ; + nif:endIndex "10476"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "whether" ; + nif:beginIndex "9078"^^xsd:nonNegativeInteger ; + nif:endIndex "9085"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "6727"^^xsd:nonNegativeInteger ; + nif:endIndex "6732"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Committee" ; + nif:beginIndex "11007"^^xsd:nonNegativeInteger ; + nif:endIndex "11016"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "1510"^^xsd:nonNegativeInteger ; + nif:endIndex "1512"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "exception" ; + nif:beginIndex "5091"^^xsd:nonNegativeInteger ; + nif:endIndex "5100"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "1935"^^xsd:nonNegativeInteger ; + nif:endIndex "1937"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Executive" ; + nif:beginIndex "10997"^^xsd:nonNegativeInteger ; + nif:endIndex "11006"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "5913"^^xsd:nonNegativeInteger ; + nif:endIndex "5916"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "3820"^^xsd:nonNegativeInteger ; + nif:endIndex "3826"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11762"^^xsd:nonNegativeInteger ; + nif:endIndex "11763"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "weekend" ; + nif:beginIndex "3906"^^xsd:nonNegativeInteger ; + nif:endIndex "3913"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "2386"^^xsd:nonNegativeInteger ; + nif:endIndex "2388"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "him" ; + nif:beginIndex "10353"^^xsd:nonNegativeInteger ; + nif:endIndex "10356"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Jan." ; + nif:beginIndex "3436"^^xsd:nonNegativeInteger ; + nif:endIndex "3440"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "saw" ; + nif:beginIndex "11220"^^xsd:nonNegativeInteger ; + nif:endIndex "11223"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "2622"^^xsd:nonNegativeInteger ; + nif:endIndex "2624"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "attended" ; + nif:beginIndex "6181"^^xsd:nonNegativeInteger ; + nif:endIndex "6189"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "He will be succeeded by Rob Ledford of Gainesville, who has been an assistant more than three years." ; + nif:beginIndex "6953"^^xsd:nonNegativeInteger ; + nif:endIndex "7053"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "339"^^xsd:nonNegativeInteger ; + nif:endIndex "342"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Barber" ; + nif:beginIndex "9915"^^xsd:nonNegativeInteger ; + nif:endIndex "9921"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "2632"^^xsd:nonNegativeInteger ; + nif:endIndex "2634"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Georgia" ; + nif:beginIndex "9610"^^xsd:nonNegativeInteger ; + nif:endIndex "9617"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "7219"^^xsd:nonNegativeInteger ; + nif:endIndex "7222"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department" ; + nif:beginIndex "8126"^^xsd:nonNegativeInteger ; + nif:endIndex "8136"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "10127"^^xsd:nonNegativeInteger ; + nif:endIndex "10130"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "received" ; + nif:beginIndex "11791"^^xsd:nonNegativeInteger ; + nif:endIndex "11799"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "9853"^^xsd:nonNegativeInteger ; + nif:endIndex "9854"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "1131"^^xsd:nonNegativeInteger ; + nif:endIndex "1134"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "listed" ; + nif:beginIndex "5000"^^xsd:nonNegativeInteger ; + nif:endIndex "5006"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "two" ; + nif:beginIndex "3043"^^xsd:nonNegativeInteger ; + nif:endIndex "3046"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "an" ; + nif:beginIndex "12068"^^xsd:nonNegativeInteger ; + nif:endIndex "12070"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "he" ; + nif:beginIndex "10290"^^xsd:nonNegativeInteger ; + nif:endIndex "10292"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "1846"^^xsd:nonNegativeInteger ; + nif:endIndex "1850"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "8410"^^xsd:nonNegativeInteger ; + nif:endIndex "8415"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "petition" ; + nif:beginIndex "5014"^^xsd:nonNegativeInteger ; + nif:endIndex "5022"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "platform" ; + nif:beginIndex "6758"^^xsd:nonNegativeInteger ; + nif:endIndex "6766"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Highway" ; + nif:beginIndex "8691"^^xsd:nonNegativeInteger ; + nif:endIndex "8698"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "more" ; + nif:beginIndex "7031"^^xsd:nonNegativeInteger ; + nif:endIndex "7035"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "place" ; + nif:beginIndex "4804"^^xsd:nonNegativeInteger ; + nif:endIndex "4809"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Carey" ; + nif:beginIndex "11096"^^xsd:nonNegativeInteger ; + nif:endIndex "11101"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "but" ; + nif:beginIndex "2146"^^xsd:nonNegativeInteger ; + nif:endIndex "2149"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "petition" ; + nif:beginIndex "4695"^^xsd:nonNegativeInteger ; + nif:endIndex "4703"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Pelham" ; + nif:beginIndex "4965"^^xsd:nonNegativeInteger ; + nif:endIndex "4971"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "widespread" ; + nif:beginIndex "703"^^xsd:nonNegativeInteger ; + nif:endIndex "713"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "It" ; + nif:beginIndex "10478"^^xsd:nonNegativeInteger ; + nif:endIndex "10480"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "1635"^^xsd:nonNegativeInteger ; + nif:endIndex "1637"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "work" ; + nif:beginIndex "7579"^^xsd:nonNegativeInteger ; + nif:endIndex "7583"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Pelham" ; + nif:beginIndex "9586"^^xsd:nonNegativeInteger ; + nif:endIndex "9592"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "6206"^^xsd:nonNegativeInteger ; + nif:endIndex "6209"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "6315"^^xsd:nonNegativeInteger ; + nif:endIndex "6316"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "property" ; + nif:beginIndex "4649"^^xsd:nonNegativeInteger ; + nif:endIndex "4657"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "have" ; + nif:beginIndex "11364"^^xsd:nonNegativeInteger ; + nif:endIndex "11368"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "2" ; + nif:beginIndex "4510"^^xsd:nonNegativeInteger ; + nif:endIndex "4511"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "chambers" ; + nif:beginIndex "7562"^^xsd:nonNegativeInteger ; + nif:endIndex "7570"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11635"^^xsd:nonNegativeInteger ; + nif:endIndex "11636"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "10107"^^xsd:nonNegativeInteger ; + nif:endIndex "10108"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "His petition charged mental cruelty." ; + nif:beginIndex "4445"^^xsd:nonNegativeInteger ; + nif:endIndex "4481"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "12191"^^xsd:nonNegativeInteger ; + nif:endIndex "12192"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "2770"^^xsd:nonNegativeInteger ; + nif:endIndex "2775"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "consistently" ; + nif:beginIndex "9872"^^xsd:nonNegativeInteger ; + nif:endIndex "9884"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Georgia" ; + nif:beginIndex "7162"^^xsd:nonNegativeInteger ; + nif:endIndex "7169"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "obtain" ; + nif:beginIndex "6038"^^xsd:nonNegativeInteger ; + nif:endIndex "6044"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "7701"^^xsd:nonNegativeInteger ; + nif:endIndex "7703"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10742"^^xsd:nonNegativeInteger ; + nif:endIndex "10743"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "are" ; + nif:beginIndex "872"^^xsd:nonNegativeInteger ; + nif:endIndex "875"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "1463"^^xsd:nonNegativeInteger ; + nif:endIndex "1466"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "featured" ; + nif:beginIndex "5868"^^xsd:nonNegativeInteger ; + nif:endIndex "5876"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "proportionate" ; + nif:beginIndex "2390"^^xsd:nonNegativeInteger ; + nif:endIndex "2403"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "hard-fought" ; + nif:beginIndex "544"^^xsd:nonNegativeInteger ; + nif:endIndex "555"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Williams" ; + nif:beginIndex "4391"^^xsd:nonNegativeInteger ; + nif:endIndex "4399"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "9265"^^xsd:nonNegativeInteger ; + nif:endIndex "9269"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "shortly" ; + nif:beginIndex "7301"^^xsd:nonNegativeInteger ; + nif:endIndex "7308"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4410"^^xsd:nonNegativeInteger ; + nif:endIndex "4411"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7532"^^xsd:nonNegativeInteger ; + nif:endIndex "7535"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4835"^^xsd:nonNegativeInteger ; + nif:endIndex "4836"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2150"^^xsd:nonNegativeInteger ; + nif:endIndex "2153"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "6951"^^xsd:nonNegativeInteger ; + nif:endIndex "6952"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Caldwell" ; + nif:beginIndex "6772"^^xsd:nonNegativeInteger ; + nif:endIndex "6780"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ridge" ; + nif:beginIndex "5898"^^xsd:nonNegativeInteger ; + nif:endIndex "5903"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "fully" ; + nif:beginIndex "9297"^^xsd:nonNegativeInteger ; + nif:endIndex "9302"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "actions" ; + nif:beginIndex "3141"^^xsd:nonNegativeInteger ; + nif:endIndex "3148"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "4691"^^xsd:nonNegativeInteger ; + nif:endIndex "4694"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "program" ; + nif:beginIndex "2451"^^xsd:nonNegativeInteger ; + nif:endIndex "2458"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "State Party Chairman James W. Dorsey added that enthusiasm was picking up for a state rally to be held Sept. 8 in Savannah at which newly elected Texas Sen. John Tower will be the featured speaker." ; + nif:beginIndex "5688"^^xsd:nonNegativeInteger ; + nif:endIndex "5885"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "petition" ; + nif:beginIndex "4449"^^xsd:nonNegativeInteger ; + nif:endIndex "4457"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Caldwell" ; + nif:beginIndex "7094"^^xsd:nonNegativeInteger ; + nif:endIndex "7102"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "succeeded" ; + nif:beginIndex "5271"^^xsd:nonNegativeInteger ; + nif:endIndex "5280"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "order" ; + nif:beginIndex "11165"^^xsd:nonNegativeInteger ; + nif:endIndex "11170"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "promised" ; + nif:beginIndex "12059"^^xsd:nonNegativeInteger ; + nif:endIndex "12067"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Commerce" ; + nif:beginIndex "9925"^^xsd:nonNegativeInteger ; + nif:endIndex "9933"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "should" ; + nif:beginIndex "2549"^^xsd:nonNegativeInteger ; + nif:endIndex "2555"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Party" ; + nif:beginIndex "5694"^^xsd:nonNegativeInteger ; + nif:endIndex "5699"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Williams" ; + nif:beginIndex "11250"^^xsd:nonNegativeInteger ; + nif:endIndex "11258"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "portion" ; + nif:beginIndex "8439"^^xsd:nonNegativeInteger ; + nif:endIndex "8446"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "1181"^^xsd:nonNegativeInteger ; + nif:endIndex "1185"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "improving" ; + nif:beginIndex "1036"^^xsd:nonNegativeInteger ; + nif:endIndex "1045"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "were" ; + nif:beginIndex "9288"^^xsd:nonNegativeInteger ; + nif:endIndex "9292"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "9177"^^xsd:nonNegativeInteger ; + nif:endIndex "9179"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "asking" ; + nif:beginIndex "9937"^^xsd:nonNegativeInteger ; + nif:endIndex "9943"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Many local citizens feared that there would be irregularities at the polls, and Williams got himself a permit to carry a gun and promised an orderly election." ; + nif:beginIndex "11930"^^xsd:nonNegativeInteger ; + nif:endIndex "12088"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "8887"^^xsd:nonNegativeInteger ; + nif:endIndex "8897"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "quiet" ; + nif:beginIndex "11437"^^xsd:nonNegativeInteger ; + nif:endIndex "11442"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "stood" ; + nif:beginIndex "11133"^^xsd:nonNegativeInteger ; + nif:endIndex "11138"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "pass" ; + nif:beginIndex "3987"^^xsd:nonNegativeInteger ; + nif:endIndex "3991"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "hot" ; + nif:beginIndex "10680"^^xsd:nonNegativeInteger ; + nif:endIndex "10683"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "night" ; + nif:beginIndex "3896"^^xsd:nonNegativeInteger ; + nif:endIndex "3901"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "His political career goes back to his election to city council in 1923." ; + nif:beginIndex "5137"^^xsd:nonNegativeInteger ; + nif:endIndex "5208"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "It says that ``in the event Congress does provide this increase in federal funds'', the State Board of Education should be directed to ``give priority'' to teacher pay raises." ; + nif:beginIndex "10478"^^xsd:nonNegativeInteger ; + nif:endIndex "10653"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bonds" ; + nif:beginIndex "7742"^^xsd:nonNegativeInteger ; + nif:endIndex "7747"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "(" ; + nif:beginIndex "10259"^^xsd:nonNegativeInteger ; + nif:endIndex "10260"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "10952"^^xsd:nonNegativeInteger ; + nif:endIndex "10955"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Pelham" ; + nif:beginIndex "9315"^^xsd:nonNegativeInteger ; + nif:endIndex "9321"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "received" ; + nif:beginIndex "11024"^^xsd:nonNegativeInteger ; + nif:endIndex "11032"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "336"^^xsd:nonNegativeInteger ; + nif:endIndex "338"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "7999"^^xsd:nonNegativeInteger ; + nif:endIndex "8002"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11585"^^xsd:nonNegativeInteger ; + nif:endIndex "11586"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "raises" ; + nif:beginIndex "9711"^^xsd:nonNegativeInteger ; + nif:endIndex "9717"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "campaign" ; + nif:beginIndex "6887"^^xsd:nonNegativeInteger ; + nif:endIndex "6895"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "726"^^xsd:nonNegativeInteger ; + nif:endIndex "729"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "12232"^^xsd:nonNegativeInteger ; + nif:endIndex "12234"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "3037"^^xsd:nonNegativeInteger ; + nif:endIndex "3039"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "near" ; + nif:beginIndex "7811"^^xsd:nonNegativeInteger ; + nif:endIndex "7815"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "11145"^^xsd:nonNegativeInteger ; + nif:endIndex "11148"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "11467"^^xsd:nonNegativeInteger ; + nif:endIndex "11469"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "$10" ; + nif:beginIndex "8976"^^xsd:nonNegativeInteger ; + nif:endIndex "8979"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "or" ; + nif:beginIndex "3870"^^xsd:nonNegativeInteger ; + nif:endIndex "3872"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Georgia's" ; + nif:beginIndex "829"^^xsd:nonNegativeInteger ; + nif:endIndex "838"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6196"^^xsd:nonNegativeInteger ; + nif:endIndex "6199"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "3845"^^xsd:nonNegativeInteger ; + nif:endIndex "3846"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "protected" ; + nif:beginIndex "2926"^^xsd:nonNegativeInteger ; + nif:endIndex "2935"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2275"^^xsd:nonNegativeInteger ; + nif:endIndex "2278"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "announced" ; + nif:beginIndex "5370"^^xsd:nonNegativeInteger ; + nif:endIndex "5379"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "see" ; + nif:beginIndex "10252"^^xsd:nonNegativeInteger ; + nif:endIndex "10255"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Atlanta" ; + nif:beginIndex "3074"^^xsd:nonNegativeInteger ; + nif:endIndex "3081"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "8416"^^xsd:nonNegativeInteger ; + nif:endIndex "8418"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "He" ; + nif:beginIndex "5260"^^xsd:nonNegativeInteger ; + nif:endIndex "5262"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "plan" ; + nif:beginIndex "8165"^^xsd:nonNegativeInteger ; + nif:endIndex "8169"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11857"^^xsd:nonNegativeInteger ; + nif:endIndex "11858"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "2620"^^xsd:nonNegativeInteger ; + nif:endIndex "2621"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "2936"^^xsd:nonNegativeInteger ; + nif:endIndex "2939"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "3880"^^xsd:nonNegativeInteger ; + nif:endIndex "3882"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "1258"^^xsd:nonNegativeInteger ; + nif:endIndex "1260"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "1597"^^xsd:nonNegativeInteger ; + nif:endIndex "1598"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Byrd's" ; + nif:beginIndex "6880"^^xsd:nonNegativeInteger ; + nif:endIndex "6886"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "10558"^^xsd:nonNegativeInteger ; + nif:endIndex "10560"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "7148"^^xsd:nonNegativeInteger ; + nif:endIndex "7151"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "wife" ; + nif:beginIndex "4905"^^xsd:nonNegativeInteger ; + nif:endIndex "4909"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Schley County Rep. B. D. Pelham will offer a resolution Monday in the House to rescind the body's action of Friday in voting itself a $10 per day increase in expense allowances." ; + nif:beginIndex "8842"^^xsd:nonNegativeInteger ; + nif:endIndex "9019"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "urged" ; + nif:beginIndex "1602"^^xsd:nonNegativeInteger ; + nif:endIndex "1607"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "11223"^^xsd:nonNegativeInteger ; + nif:endIndex "11225"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "11139"^^xsd:nonNegativeInteger ; + nif:endIndex "11141"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "5811"^^xsd:nonNegativeInteger ; + nif:endIndex "5813"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "1557"^^xsd:nonNegativeInteger ; + nif:endIndex "1559"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "none" ; + nif:beginIndex "10304"^^xsd:nonNegativeInteger ; + nif:endIndex "10308"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "11276"^^xsd:nonNegativeInteger ; + nif:endIndex "11279"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "3673"^^xsd:nonNegativeInteger ; + nif:endIndex "3675"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "9045"^^xsd:nonNegativeInteger ; + nif:endIndex "9050"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "reports" ; + nif:beginIndex "646"^^xsd:nonNegativeInteger ; + nif:endIndex "653"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Roads" ; + nif:beginIndex "8355"^^xsd:nonNegativeInteger ; + nif:endIndex "8360"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "been" ; + nif:beginIndex "427"^^xsd:nonNegativeInteger ; + nif:endIndex "431"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "handling" ; + nif:beginIndex "1969"^^xsd:nonNegativeInteger ; + nif:endIndex "1977"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "unanimous" ; + nif:beginIndex "6115"^^xsd:nonNegativeInteger ; + nif:endIndex "6124"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11715"^^xsd:nonNegativeInteger ; + nif:endIndex "11716"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Ask jail deputies" ; + nif:beginIndex "3709"^^xsd:nonNegativeInteger ; + nif:endIndex "3726"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "4156"^^xsd:nonNegativeInteger ; + nif:endIndex "4158"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "391"^^xsd:nonNegativeInteger ; + nif:endIndex "394"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2959"^^xsd:nonNegativeInteger ; + nif:endIndex "2962"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "surveillance" ; + nif:beginIndex "3596"^^xsd:nonNegativeInteger ; + nif:endIndex "3608"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "per" ; + nif:beginIndex "8980"^^xsd:nonNegativeInteger ; + nif:endIndex "8983"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "317"^^xsd:nonNegativeInteger ; + nif:endIndex "320"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "8885"^^xsd:nonNegativeInteger ; + nif:endIndex "8886"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "But he added that none of Georgia's congressmen specifically asked him to offer the resolution." ; + nif:beginIndex "10286"^^xsd:nonNegativeInteger ; + nif:endIndex "10381"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "9718"^^xsd:nonNegativeInteger ; + nif:endIndex "9721"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "3523"^^xsd:nonNegativeInteger ; + nif:endIndex "3526"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "5010"^^xsd:nonNegativeInteger ; + nif:endIndex "5013"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "set" ; + nif:beginIndex "9356"^^xsd:nonNegativeInteger ; + nif:endIndex "9359"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "-- After a long, hot controversy, Miller County has a new school superintendent, elected, as a policeman put it, in the ``coolest election I ever saw in this county''." ; + nif:beginIndex "10663"^^xsd:nonNegativeInteger ; + nif:endIndex "10830"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "no" ; + nif:beginIndex "6304"^^xsd:nonNegativeInteger ; + nif:endIndex "6306"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "its" ; + nif:beginIndex "6754"^^xsd:nonNegativeInteger ; + nif:endIndex "6757"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "281"^^xsd:nonNegativeInteger ; + nif:endIndex "283"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "we" ; + nif:beginIndex "11354"^^xsd:nonNegativeInteger ; + nif:endIndex "11356"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "$100" ; + nif:beginIndex "7637"^^xsd:nonNegativeInteger ; + nif:endIndex "7641"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "6766"^^xsd:nonNegativeInteger ; + nif:endIndex "6767"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "been" ; + nif:beginIndex "4673"^^xsd:nonNegativeInteger ; + nif:endIndex "4677"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Meanwhile, it was learned the State Highway Department is very near being ready to issue the first $30 million worth of highway reconstruction bonds." ; + nif:beginIndex "7748"^^xsd:nonNegativeInteger ; + nif:endIndex "7897"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "time" ; + nif:beginIndex "6947"^^xsd:nonNegativeInteger ; + nif:endIndex "6951"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "2542"^^xsd:nonNegativeInteger ; + nif:endIndex "2548"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "praised" ; + nif:beginIndex "4115"^^xsd:nonNegativeInteger ; + nif:endIndex "4122"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "got" ; + nif:beginIndex "12019"^^xsd:nonNegativeInteger ; + nif:endIndex "12022"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "telephone" ; + nif:beginIndex "11810"^^xsd:nonNegativeInteger ; + nif:endIndex "11819"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "offer" ; + nif:beginIndex "10360"^^xsd:nonNegativeInteger ; + nif:endIndex "10365"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "9509"^^xsd:nonNegativeInteger ; + nif:endIndex "9514"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "9507"^^xsd:nonNegativeInteger ; + nif:endIndex "9508"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "10631"^^xsd:nonNegativeInteger ; + nif:endIndex "10633"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "534"^^xsd:nonNegativeInteger ; + nif:endIndex "536"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "believes" ; + nif:beginIndex "1344"^^xsd:nonNegativeInteger ; + nif:endIndex "1352"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "5263"^^xsd:nonNegativeInteger ; + nif:endIndex "5267"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Rep." ; + nif:beginIndex "8856"^^xsd:nonNegativeInteger ; + nif:endIndex "8860"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "2320"^^xsd:nonNegativeInteger ; + nif:endIndex "2321"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Jackson" ; + nif:beginIndex "9761"^^xsd:nonNegativeInteger ; + nif:endIndex "9768"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "1659"^^xsd:nonNegativeInteger ; + nif:endIndex "1660"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "11873"^^xsd:nonNegativeInteger ; + nif:endIndex "11875"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "some" ; + nif:beginIndex "2564"^^xsd:nonNegativeInteger ; + nif:endIndex "2568"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "provide" ; + nif:beginIndex "1790"^^xsd:nonNegativeInteger ; + nif:endIndex "1797"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "voters" ; + nif:beginIndex "6074"^^xsd:nonNegativeInteger ; + nif:endIndex "6080"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "10370"^^xsd:nonNegativeInteger ; + nif:endIndex "10380"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "8685"^^xsd:nonNegativeInteger ; + nif:endIndex "8686"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "career" ; + nif:beginIndex "5151"^^xsd:nonNegativeInteger ; + nif:endIndex "5157"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Sept." ; + nif:beginIndex "5330"^^xsd:nonNegativeInteger ; + nif:endIndex "5335"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "funds" ; + nif:beginIndex "2596"^^xsd:nonNegativeInteger ; + nif:endIndex "2601"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "departments" ; + nif:beginIndex "1160"^^xsd:nonNegativeInteger ; + nif:endIndex "1171"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "``Being at the polls was just like being at church." ; + nif:beginIndex "11265"^^xsd:nonNegativeInteger ; + nif:endIndex "11316"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "additional" ; + nif:beginIndex "8620"^^xsd:nonNegativeInteger ; + nif:endIndex "8630"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "governor" ; + nif:beginIndex "8525"^^xsd:nonNegativeInteger ; + nif:endIndex "8533"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "2719"^^xsd:nonNegativeInteger ; + nif:endIndex "2723"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "5864"^^xsd:nonNegativeInteger ; + nif:endIndex "5867"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "back" ; + nif:beginIndex "5163"^^xsd:nonNegativeInteger ; + nif:endIndex "5167"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "campaign" ; + nif:beginIndex "7077"^^xsd:nonNegativeInteger ; + nif:endIndex "7085"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "12040"^^xsd:nonNegativeInteger ; + nif:endIndex "12042"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "against" ; + nif:beginIndex "8575"^^xsd:nonNegativeInteger ; + nif:endIndex "8582"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "786"^^xsd:nonNegativeInteger ; + nif:endIndex "788"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Aug." ; + nif:beginIndex "4505"^^xsd:nonNegativeInteger ; + nif:endIndex "4509"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Cheshire" ; + nif:beginIndex "4582"^^xsd:nonNegativeInteger ; + nif:endIndex "4590"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "test" ; + nif:beginIndex "7969"^^xsd:nonNegativeInteger ; + nif:endIndex "7973"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "liquor" ; + nif:beginIndex "11342"^^xsd:nonNegativeInteger ; + nif:endIndex "11348"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Chairman" ; + nif:beginIndex "5700"^^xsd:nonNegativeInteger ; + nif:endIndex "5708"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Georgia" ; + nif:beginIndex "9800"^^xsd:nonNegativeInteger ; + nif:endIndex "9807"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "As of Sunday night, there was no word of a resolution being offered there to rescind the action." ; + nif:beginIndex "9489"^^xsd:nonNegativeInteger ; + nif:endIndex "9585"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "6780"^^xsd:nonNegativeInteger ; + nif:endIndex "6781"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "earlier" ; + nif:beginIndex "7678"^^xsd:nonNegativeInteger ; + nif:endIndex "7685"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "before" ; + nif:beginIndex "6402"^^xsd:nonNegativeInteger ; + nif:endIndex "6408"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "7156"^^xsd:nonNegativeInteger ; + nif:endIndex "7157"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "expected" ; + nif:beginIndex "7106"^^xsd:nonNegativeInteger ; + nif:endIndex "7114"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "His" ; + nif:beginIndex "4445"^^xsd:nonNegativeInteger ; + nif:endIndex "4448"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "been" ; + nif:beginIndex "5063"^^xsd:nonNegativeInteger ; + nif:endIndex "5067"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "starts" ; + nif:beginIndex "7086"^^xsd:nonNegativeInteger ; + nif:endIndex "7092"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "870"^^xsd:nonNegativeInteger ; + nif:endIndex "872"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ask" ; + nif:beginIndex "9792"^^xsd:nonNegativeInteger ; + nif:endIndex "9795"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "1676"^^xsd:nonNegativeInteger ; + nif:endIndex "1678"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "next" ; + nif:beginIndex "1771"^^xsd:nonNegativeInteger ; + nif:endIndex "1775"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "become" ; + nif:beginIndex "7118"^^xsd:nonNegativeInteger ; + nif:endIndex "7124"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Constitution" ; + nif:beginIndex "8726"^^xsd:nonNegativeInteger ; + nif:endIndex "8738"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "3306"^^xsd:nonNegativeInteger ; + nif:endIndex "3310"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "guardians" ; + nif:beginIndex "2848"^^xsd:nonNegativeInteger ; + nif:endIndex "2857"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word , nif:Sentence ; + nif:anchorOf "Colquitt" ; + nif:beginIndex "10654"^^xsd:nonNegativeInteger ; + nif:endIndex "10662"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:sentence ; + nif:word . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "must" ; + nif:beginIndex "6521"^^xsd:nonNegativeInteger ; + nif:endIndex "6525"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "wife's" ; + nif:beginIndex "4783"^^xsd:nonNegativeInteger ; + nif:endIndex "4789"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "2616"^^xsd:nonNegativeInteger ; + nif:endIndex "2620"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "support" ; + nif:beginIndex "10011"^^xsd:nonNegativeInteger ; + nif:endIndex "10018"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "``This was the coolest, calmest election I ever saw'', Colquitt Policeman Tom Williams said." ; + nif:beginIndex "11172"^^xsd:nonNegativeInteger ; + nif:endIndex "11264"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "re-set" ; + nif:beginIndex "1817"^^xsd:nonNegativeInteger ; + nif:endIndex "1823"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "12189"^^xsd:nonNegativeInteger ; + nif:endIndex "12191"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "from" ; + nif:beginIndex "4370"^^xsd:nonNegativeInteger ; + nif:endIndex "4374"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "our" ; + nif:beginIndex "10191"^^xsd:nonNegativeInteger ; + nif:endIndex "10194"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "3098"^^xsd:nonNegativeInteger ; + nif:endIndex "3101"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "3230"^^xsd:nonNegativeInteger ; + nif:endIndex "3233"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "638"^^xsd:nonNegativeInteger ; + nif:endIndex "640"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "funds" ; + nif:beginIndex "2214"^^xsd:nonNegativeInteger ; + nif:endIndex "2219"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "fund" ; + nif:beginIndex "8298"^^xsd:nonNegativeInteger ; + nif:endIndex "8302"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "further" ; + nif:beginIndex "165"^^xsd:nonNegativeInteger ; + nif:endIndex "172"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "668"^^xsd:nonNegativeInteger ; + nif:endIndex "669"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "with" ; + nif:beginIndex "5086"^^xsd:nonNegativeInteger ; + nif:endIndex "5090"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "governor's" ; + nif:beginIndex "5504"^^xsd:nonNegativeInteger ; + nif:endIndex "5514"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Friday" ; + nif:beginIndex "8950"^^xsd:nonNegativeInteger ; + nif:endIndex "8956"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "9532"^^xsd:nonNegativeInteger ; + nif:endIndex "9542"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "2516"^^xsd:nonNegativeInteger ; + nif:endIndex "2520"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "House" ; + nif:beginIndex "8912"^^xsd:nonNegativeInteger ; + nif:endIndex "8917"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "918"^^xsd:nonNegativeInteger ; + nif:endIndex "920"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Friday" ; + nif:beginIndex "34"^^xsd:nonNegativeInteger ; + nif:endIndex "40"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Republicans" ; + nif:beginIndex "6654"^^xsd:nonNegativeInteger ; + nif:endIndex "6665"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "with" ; + nif:beginIndex "11687"^^xsd:nonNegativeInteger ; + nif:endIndex "11691"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "counties" ; + nif:beginIndex "2263"^^xsd:nonNegativeInteger ; + nif:endIndex "2271"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "up" ; + nif:beginIndex "5759"^^xsd:nonNegativeInteger ; + nif:endIndex "5761"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Highway" ; + nif:beginIndex "6788"^^xsd:nonNegativeInteger ; + nif:endIndex "6795"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "6895"^^xsd:nonNegativeInteger ; + nif:endIndex "6896"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "airport" ; + nif:beginIndex "3447"^^xsd:nonNegativeInteger ; + nif:endIndex "3454"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "so" ; + nif:beginIndex "11434"^^xsd:nonNegativeInteger ; + nif:endIndex "11436"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "night" ; + nif:beginIndex "9502"^^xsd:nonNegativeInteger ; + nif:endIndex "9507"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "primary" ; + nif:beginIndex "6678"^^xsd:nonNegativeInteger ; + nif:endIndex "6685"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "approved" ; + nif:beginIndex "7280"^^xsd:nonNegativeInteger ; + nif:endIndex "7288"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "12049"^^xsd:nonNegativeInteger ; + nif:endIndex "12050"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "does" ; + nif:beginIndex "10515"^^xsd:nonNegativeInteger ; + nif:endIndex "10519"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "fees" ; + nif:beginIndex "2897"^^xsd:nonNegativeInteger ; + nif:endIndex "2901"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "4871"^^xsd:nonNegativeInteger ; + nif:endIndex "4874"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "about" ; + nif:beginIndex "8790"^^xsd:nonNegativeInteger ; + nif:endIndex "8795"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "candidate" ; + nif:beginIndex "6141"^^xsd:nonNegativeInteger ; + nif:endIndex "6150"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Vandiver" ; + nif:beginIndex "8754"^^xsd:nonNegativeInteger ; + nif:endIndex "8762"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "number" ; + nif:beginIndex "7396"^^xsd:nonNegativeInteger ; + nif:endIndex "7402"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "12160"^^xsd:nonNegativeInteger ; + nif:endIndex "12162"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10931"^^xsd:nonNegativeInteger ; + nif:endIndex "10932"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ala." ; + nif:beginIndex "4831"^^xsd:nonNegativeInteger ; + nif:endIndex "4835"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "source" ; + nif:beginIndex "8137"^^xsd:nonNegativeInteger ; + nif:endIndex "8143"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "58"^^xsd:nonNegativeInteger ; + nif:endIndex "60"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1433"^^xsd:nonNegativeInteger ; + nif:endIndex "1436"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "however" ; + nif:beginIndex "11444"^^xsd:nonNegativeInteger ; + nif:endIndex "11451"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Vandiver" ; + nif:beginIndex "8496"^^xsd:nonNegativeInteger ; + nif:endIndex "8504"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2259"^^xsd:nonNegativeInteger ; + nif:endIndex "2262"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "6938"^^xsd:nonNegativeInteger ; + nif:endIndex "6941"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "389"^^xsd:nonNegativeInteger ; + nif:endIndex "390"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "wait" ; + nif:beginIndex "6237"^^xsd:nonNegativeInteger ; + nif:endIndex "6241"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "10813"^^xsd:nonNegativeInteger ; + nif:endIndex "10815"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "4142"^^xsd:nonNegativeInteger ; + nif:endIndex "4145"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "10019"^^xsd:nonNegativeInteger ; + nif:endIndex "10022"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "officials" ; + nif:beginIndex "3974"^^xsd:nonNegativeInteger ; + nif:endIndex "3983"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "voting" ; + nif:beginIndex "6018"^^xsd:nonNegativeInteger ; + nif:endIndex "6024"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "voters" ; + nif:beginIndex "6499"^^xsd:nonNegativeInteger ; + nif:endIndex "6505"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Caldwell's" ; + nif:beginIndex "6897"^^xsd:nonNegativeInteger ; + nif:endIndex "6907"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "who" ; + nif:beginIndex "6177"^^xsd:nonNegativeInteger ; + nif:endIndex "6180"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "midnight" ; + nif:beginIndex "11493"^^xsd:nonNegativeInteger ; + nif:endIndex "11501"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Republicans" ; + nif:beginIndex "6561"^^xsd:nonNegativeInteger ; + nif:endIndex "6572"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "9426"^^xsd:nonNegativeInteger ; + nif:endIndex "9427"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "administration" ; + nif:beginIndex "1445"^^xsd:nonNegativeInteger ; + nif:endIndex "1459"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "3392"^^xsd:nonNegativeInteger ; + nif:endIndex "3394"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "rally" ; + nif:beginIndex "5774"^^xsd:nonNegativeInteger ; + nif:endIndex "5779"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "4910"^^xsd:nonNegativeInteger ; + nif:endIndex "4913"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "6189"^^xsd:nonNegativeInteger ; + nif:endIndex "6190"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "1512"^^xsd:nonNegativeInteger ; + nif:endIndex "1514"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11085"^^xsd:nonNegativeInteger ; + nif:endIndex "11086"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "principal" ; + nif:beginIndex "10942"^^xsd:nonNegativeInteger ; + nif:endIndex "10951"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "enthusiasm" ; + nif:beginIndex "5736"^^xsd:nonNegativeInteger ; + nif:endIndex "5746"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "4800"^^xsd:nonNegativeInteger ; + nif:endIndex "4803"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resignation" ; + nif:beginIndex "6908"^^xsd:nonNegativeInteger ; + nif:endIndex "6919"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7452"^^xsd:nonNegativeInteger ; + nif:endIndex "7454"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "have" ; + nif:beginIndex "4524"^^xsd:nonNegativeInteger ; + nif:endIndex "4528"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11745"^^xsd:nonNegativeInteger ; + nif:endIndex "11746"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10115"^^xsd:nonNegativeInteger ; + nif:endIndex "10116"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "took" ; + nif:beginIndex "1922"^^xsd:nonNegativeInteger ; + nif:endIndex "1926"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "3185"^^xsd:nonNegativeInteger ; + nif:endIndex "3187"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ledford" ; + nif:beginIndex "6981"^^xsd:nonNegativeInteger ; + nif:endIndex "6988"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "4018"^^xsd:nonNegativeInteger ; + nif:endIndex "4022"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Jail" ; + nif:beginIndex "3834"^^xsd:nonNegativeInteger ; + nif:endIndex "3838"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "11339"^^xsd:nonNegativeInteger ; + nif:endIndex "11341"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department" ; + nif:beginIndex "4307"^^xsd:nonNegativeInteger ; + nif:endIndex "4317"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department" ; + nif:beginIndex "4178"^^xsd:nonNegativeInteger ; + nif:endIndex "4188"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "9530"^^xsd:nonNegativeInteger ; + nif:endIndex "9531"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "$50" ; + nif:beginIndex "8599"^^xsd:nonNegativeInteger ; + nif:endIndex "8602"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "It" ; + nif:beginIndex "4769"^^xsd:nonNegativeInteger ; + nif:endIndex "4771"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "1380"^^xsd:nonNegativeInteger ; + nif:endIndex "1382"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "5480"^^xsd:nonNegativeInteger ; + nif:endIndex "5481"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "6109"^^xsd:nonNegativeInteger ; + nif:endIndex "6112"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "8840"^^xsd:nonNegativeInteger ; + nif:endIndex "8841"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "this" ; + nif:beginIndex "2638"^^xsd:nonNegativeInteger ; + nif:endIndex "2642"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "increased" ; + nif:beginIndex "9993"^^xsd:nonNegativeInteger ; + nif:endIndex "10002"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "no" ; + nif:beginIndex "9519"^^xsd:nonNegativeInteger ; + nif:endIndex "9521"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "notice" ; + nif:beginIndex "9159"^^xsd:nonNegativeInteger ; + nif:endIndex "9165"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "mayor's" ; + nif:beginIndex "5030"^^xsd:nonNegativeInteger ; + nif:endIndex "5037"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2817"^^xsd:nonNegativeInteger ; + nif:endIndex "2820"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "6130"^^xsd:nonNegativeInteger ; + nif:endIndex "6132"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "must" ; + nif:beginIndex "6666"^^xsd:nonNegativeInteger ; + nif:endIndex "6670"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "4051"^^xsd:nonNegativeInteger ; + nif:endIndex "4052"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ready" ; + nif:beginIndex "7822"^^xsd:nonNegativeInteger ; + nif:endIndex "7827"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "1064"^^xsd:nonNegativeInteger ; + nif:endIndex "1068"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "4996"^^xsd:nonNegativeInteger ; + nif:endIndex "4999"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "per" ; + nif:beginIndex "6483"^^xsd:nonNegativeInteger ; + nif:endIndex "6486"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Miller" ; + nif:beginIndex "10972"^^xsd:nonNegativeInteger ; + nif:endIndex "10978"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "read" ; + nif:beginIndex "10465"^^xsd:nonNegativeInteger ; + nif:endIndex "10469"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "9893"^^xsd:nonNegativeInteger ; + nif:endIndex "9895"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "5988"^^xsd:nonNegativeInteger ; + nif:endIndex "5990"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "armed" ; + nif:beginIndex "11112"^^xsd:nonNegativeInteger ; + nif:endIndex "11117"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "out" ; + nif:beginIndex "6009"^^xsd:nonNegativeInteger ; + nif:endIndex "6012"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "told" ; + nif:beginIndex "8717"^^xsd:nonNegativeInteger ; + nif:endIndex "8721"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "serve" ; + nif:beginIndex "3156"^^xsd:nonNegativeInteger ; + nif:endIndex "3161"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11603"^^xsd:nonNegativeInteger ; + nif:endIndex "11604"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "7941"^^xsd:nonNegativeInteger ; + nif:endIndex "7944"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "mayor" ; + nif:beginIndex "5068"^^xsd:nonNegativeInteger ; + nif:endIndex "5073"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "keeping" ; + nif:beginIndex "3676"^^xsd:nonNegativeInteger ; + nif:endIndex "3683"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "session" ; + nif:beginIndex "7204"^^xsd:nonNegativeInteger ; + nif:endIndex "7211"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "8521"^^xsd:nonNegativeInteger ; + nif:endIndex "8524"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "cruelty" ; + nif:beginIndex "4473"^^xsd:nonNegativeInteger ; + nif:endIndex "4480"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "937"^^xsd:nonNegativeInteger ; + nif:endIndex "941"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "I" ; + nif:beginIndex "11317"^^xsd:nonNegativeInteger ; + nif:endIndex "11318"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10906"^^xsd:nonNegativeInteger ; + nif:endIndex "10907"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "9922"^^xsd:nonNegativeInteger ; + nif:endIndex "9924"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "7228"^^xsd:nonNegativeInteger ; + nif:endIndex "7231"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "354"^^xsd:nonNegativeInteger ; + nif:endIndex "356"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "2603"^^xsd:nonNegativeInteger ; + nif:endIndex "2604"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "criticisms" ; + nif:beginIndex "3288"^^xsd:nonNegativeInteger ; + nif:endIndex "3298"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "enter" ; + nif:beginIndex "5474"^^xsd:nonNegativeInteger ; + nif:endIndex "5479"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "grand" ; + nif:beginIndex "3056"^^xsd:nonNegativeInteger ; + nif:endIndex "3061"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "12211"^^xsd:nonNegativeInteger ; + nif:endIndex "12213"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "from" ; + nif:beginIndex "5669"^^xsd:nonNegativeInteger ; + nif:endIndex "5673"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department" ; + nif:beginIndex "1483"^^xsd:nonNegativeInteger ; + nif:endIndex "1493"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "all" ; + nif:beginIndex "2255"^^xsd:nonNegativeInteger ; + nif:endIndex "2258"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "7803"^^xsd:nonNegativeInteger ; + nif:endIndex "7805"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "271"^^xsd:nonNegativeInteger ; + nif:endIndex "279"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "3839"^^xsd:nonNegativeInteger ; + nif:endIndex "3842"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "238"^^xsd:nonNegativeInteger ; + nif:endIndex "243"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Pearl" ; + nif:beginIndex "4385"^^xsd:nonNegativeInteger ; + nif:endIndex "4390"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4383"^^xsd:nonNegativeInteger ; + nif:endIndex "4384"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "study" ; + nif:beginIndex "7383"^^xsd:nonNegativeInteger ; + nif:endIndex "7388"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "make" ; + nif:beginIndex "6259"^^xsd:nonNegativeInteger ; + nif:endIndex "6263"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "birth" ; + nif:beginIndex "4813"^^xsd:nonNegativeInteger ; + nif:endIndex "4818"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "2697"^^xsd:nonNegativeInteger ; + nif:endIndex "2703"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "10386"^^xsd:nonNegativeInteger ; + nif:endIndex "10396"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "details" ; + nif:beginIndex "9280"^^xsd:nonNegativeInteger ; + nif:endIndex "9287"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "face" ; + nif:beginIndex "6366"^^xsd:nonNegativeInteger ; + nif:endIndex "6370"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "when" ; + nif:beginIndex "3399"^^xsd:nonNegativeInteger ; + nif:endIndex "3403"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "But" ; + nif:beginIndex "10286"^^xsd:nonNegativeInteger ; + nif:endIndex "10289"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "man" ; + nif:beginIndex "4897"^^xsd:nonNegativeInteger ; + nif:endIndex "4900"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "11069"^^xsd:nonNegativeInteger ; + nif:endIndex "11072"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "1052"^^xsd:nonNegativeInteger ; + nif:endIndex "1053"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "10260"^^xsd:nonNegativeInteger ; + nif:endIndex "10263"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "540"^^xsd:nonNegativeInteger ; + nif:endIndex "543"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "4932"^^xsd:nonNegativeInteger ; + nif:endIndex "4935"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "9557"^^xsd:nonNegativeInteger ; + nif:endIndex "9562"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "236"^^xsd:nonNegativeInteger ; + nif:endIndex "237"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "7571"^^xsd:nonNegativeInteger ; + nif:endIndex "7573"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "names" ; + nif:beginIndex "6593"^^xsd:nonNegativeInteger ; + nif:endIndex "6598"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "effected" ; + nif:beginIndex "1895"^^xsd:nonNegativeInteger ; + nif:endIndex "1903"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "visit" ; + nif:beginIndex "7548"^^xsd:nonNegativeInteger ; + nif:endIndex "7553"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "71" ; + nif:beginIndex "4765"^^xsd:nonNegativeInteger ; + nif:endIndex "4767"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "crowd" ; + nif:beginIndex "6200"^^xsd:nonNegativeInteger ; + nif:endIndex "6205"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "enabling" ; + nif:beginIndex "1798"^^xsd:nonNegativeInteger ; + nif:endIndex "1806"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "reconstruction" ; + nif:beginIndex "7876"^^xsd:nonNegativeInteger ; + nif:endIndex "7890"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11451"^^xsd:nonNegativeInteger ; + nif:endIndex "11452"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "3569"^^xsd:nonNegativeInteger ; + nif:endIndex "3571"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "laws" ; + nif:beginIndex "865"^^xsd:nonNegativeInteger ; + nif:endIndex "869"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "637" ; + nif:beginIndex "4958"^^xsd:nonNegativeInteger ; + nif:endIndex "4961"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "12006"^^xsd:nonNegativeInteger ; + nif:endIndex "12009"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Tuesday" ; + nif:beginIndex "6843"^^xsd:nonNegativeInteger ; + nif:endIndex "6850"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "homes" ; + nif:beginIndex "2040"^^xsd:nonNegativeInteger ; + nif:endIndex "2045"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "5298"^^xsd:nonNegativeInteger ; + nif:endIndex "5299"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "reasonable" ; + nif:beginIndex "3695"^^xsd:nonNegativeInteger ; + nif:endIndex "3705"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "enthusiastic" ; + nif:beginIndex "5646"^^xsd:nonNegativeInteger ; + nif:endIndex "5658"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Commissioner's" ; + nif:beginIndex "4205"^^xsd:nonNegativeInteger ; + nif:endIndex "4219"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "chairman" ; + nif:beginIndex "10956"^^xsd:nonNegativeInteger ; + nif:endIndex "10964"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "something" ; + nif:beginIndex "9855"^^xsd:nonNegativeInteger ; + nif:endIndex "9864"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3371"^^xsd:nonNegativeInteger ; + nif:endIndex "3374"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "considering" ; + nif:beginIndex "687"^^xsd:nonNegativeInteger ; + nif:endIndex "698"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "system" ; + nif:beginIndex "6708"^^xsd:nonNegativeInteger ; + nif:endIndex "6714"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Legislature" ; + nif:beginIndex "7170"^^xsd:nonNegativeInteger ; + nif:endIndex "7181"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The jury said it did find that many of Georgia's registration and election laws ``are outmoded or inadequate and often ambiguous''." ; + nif:beginIndex "790"^^xsd:nonNegativeInteger ; + nif:endIndex "921"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "its" ; + nif:beginIndex "7195"^^xsd:nonNegativeInteger ; + nif:endIndex "7198"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "divorce" ; + nif:beginIndex "4362"^^xsd:nonNegativeInteger ; + nif:endIndex "4369"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The couple was married Aug. 2, 1913." ; + nif:beginIndex "4482"^^xsd:nonNegativeInteger ; + nif:endIndex "4518"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "anonymous" ; + nif:beginIndex "11876"^^xsd:nonNegativeInteger ; + nif:endIndex "11885"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11825"^^xsd:nonNegativeInteger ; + nif:endIndex "11826"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8908"^^xsd:nonNegativeInteger ; + nif:endIndex "8911"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "4087"^^xsd:nonNegativeInteger ; + nif:endIndex "4090"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "opposes" ; + nif:beginIndex "6743"^^xsd:nonNegativeInteger ; + nif:endIndex "6750"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "10610"^^xsd:nonNegativeInteger ; + nif:endIndex "10612"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "730"^^xsd:nonNegativeInteger ; + nif:endIndex "738"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "are" ; + nif:beginIndex "5438"^^xsd:nonNegativeInteger ; + nif:endIndex "5441"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "7913"^^xsd:nonNegativeInteger ; + nif:endIndex "7917"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Hartsfield" ; + nif:beginIndex "4336"^^xsd:nonNegativeInteger ; + nif:endIndex "4346"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "first" ; + nif:beginIndex "7841"^^xsd:nonNegativeInteger ; + nif:endIndex "7846"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10156"^^xsd:nonNegativeInteger ; + nif:endIndex "10157"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "attorney" ; + nif:beginIndex "5038"^^xsd:nonNegativeInteger ; + nif:endIndex "5046"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "such" ; + nif:beginIndex "10055"^^xsd:nonNegativeInteger ; + nif:endIndex "10059"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "4493"^^xsd:nonNegativeInteger ; + nif:endIndex "4496"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "grand" ; + nif:beginIndex "1911"^^xsd:nonNegativeInteger ; + nif:endIndex "1916"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "follow" ; + nif:beginIndex "1210"^^xsd:nonNegativeInteger ; + nif:endIndex "1216"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "7403"^^xsd:nonNegativeInteger ; + nif:endIndex "7405"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Byrd" ; + nif:beginIndex "7152"^^xsd:nonNegativeInteger ; + nif:endIndex "7156"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "5991"^^xsd:nonNegativeInteger ; + nif:endIndex "5993"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "age" ; + nif:beginIndex "4758"^^xsd:nonNegativeInteger ; + nif:endIndex "4761"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "3315"^^xsd:nonNegativeInteger ; + nif:endIndex "3316"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "topics" ; + nif:beginIndex "1100"^^xsd:nonNegativeInteger ; + nif:endIndex "1106"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Executive" ; + nif:beginIndex "217"^^xsd:nonNegativeInteger ; + nif:endIndex "226"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Bowden" ; + nif:beginIndex "4989"^^xsd:nonNegativeInteger ; + nif:endIndex "4995"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "personnel" ; + nif:beginIndex "1577"^^xsd:nonNegativeInteger ; + nif:endIndex "1586"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "8047"^^xsd:nonNegativeInteger ; + nif:endIndex "8050"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "6293"^^xsd:nonNegativeInteger ; + nif:endIndex "6298"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "154"^^xsd:nonNegativeInteger ; + nif:endIndex "155"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "6506"^^xsd:nonNegativeInteger ; + nif:endIndex "6508"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "petitions" ; + nif:beginIndex "6531"^^xsd:nonNegativeInteger ; + nif:endIndex "6540"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "courts" ; + nif:beginIndex "7934"^^xsd:nonNegativeInteger ; + nif:endIndex "7940"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "legislators" ; + nif:beginIndex "949"^^xsd:nonNegativeInteger ; + nif:endIndex "960"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "6613"^^xsd:nonNegativeInteger ; + nif:endIndex "6615"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4534"^^xsd:nonNegativeInteger ; + nif:endIndex "4535"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "House" ; + nif:beginIndex "9808"^^xsd:nonNegativeInteger ; + nif:endIndex "9813"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "suit" ; + nif:beginIndex "4353"^^xsd:nonNegativeInteger ; + nif:endIndex "4357"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "settlement" ; + nif:beginIndex "4658"^^xsd:nonNegativeInteger ; + nif:endIndex "4668"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "are" ; + nif:beginIndex "1188"^^xsd:nonNegativeInteger ; + nif:endIndex "1191"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "veteran" ; + nif:beginIndex "10879"^^xsd:nonNegativeInteger ; + nif:endIndex "10886"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "last" ; + nif:beginIndex "9625"^^xsd:nonNegativeInteger ; + nif:endIndex "9629"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "5937"^^xsd:nonNegativeInteger ; + nif:endIndex "5941"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Monday" ; + nif:beginIndex "8898"^^xsd:nonNegativeInteger ; + nif:endIndex "8904"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Authority" ; + nif:beginIndex "8236"^^xsd:nonNegativeInteger ; + nif:endIndex "8245"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "interim" ; + nif:beginIndex "3105"^^xsd:nonNegativeInteger ; + nif:endIndex "3112"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ridge" ; + nif:beginIndex "5632"^^xsd:nonNegativeInteger ; + nif:endIndex "5637"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "A similar resolution passed in the Senate by a vote of 29-5." ; + nif:beginIndex "9428"^^xsd:nonNegativeInteger ; + nif:endIndex "9488"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Dorsey" ; + nif:beginIndex "5718"^^xsd:nonNegativeInteger ; + nif:endIndex "5724"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "12209"^^xsd:nonNegativeInteger ; + nif:endIndex "12210"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "9841"^^xsd:nonNegativeInteger ; + nif:endIndex "9843"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "wife" ; + nif:beginIndex "4379"^^xsd:nonNegativeInteger ; + nif:endIndex "4383"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "8947"^^xsd:nonNegativeInteger ; + nif:endIndex "8949"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "tax" ; + nif:beginIndex "8479"^^xsd:nonNegativeInteger ; + nif:endIndex "8482"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "operated" ; + nif:beginIndex "3458"^^xsd:nonNegativeInteger ; + nif:endIndex "3466"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8450"^^xsd:nonNegativeInteger ; + nif:endIndex "8453"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "666"^^xsd:nonNegativeInteger ; + nif:endIndex "668"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "It" ; + nif:beginIndex "11453"^^xsd:nonNegativeInteger ; + nif:endIndex "11455"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "6274"^^xsd:nonNegativeInteger ; + nif:endIndex "6276"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "10366"^^xsd:nonNegativeInteger ; + nif:endIndex "10369"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "3801"^^xsd:nonNegativeInteger ; + nif:endIndex "3803"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1613"^^xsd:nonNegativeInteger ; + nif:endIndex "1616"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ordinary's" ; + nif:beginIndex "2753"^^xsd:nonNegativeInteger ; + nif:endIndex "2763"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11387"^^xsd:nonNegativeInteger ; + nif:endIndex "11388"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "irregularities" ; + nif:beginIndex "11977"^^xsd:nonNegativeInteger ; + nif:endIndex "11991"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "more" ; + nif:beginIndex "4914"^^xsd:nonNegativeInteger ; + nif:endIndex "4918"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "should" ; + nif:beginIndex "3577"^^xsd:nonNegativeInteger ; + nif:endIndex "3583"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "directed" ; + nif:beginIndex "10601"^^xsd:nonNegativeInteger ; + nif:endIndex "10609"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "4894"^^xsd:nonNegativeInteger ; + nif:endIndex "4896"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "8849"^^xsd:nonNegativeInteger ; + nif:endIndex "8855"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "leading" ; + nif:beginIndex "11402"^^xsd:nonNegativeInteger ; + nif:endIndex "11409"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "possible" ; + nif:beginIndex "509"^^xsd:nonNegativeInteger ; + nif:endIndex "517"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Lt." ; + nif:beginIndex "6863"^^xsd:nonNegativeInteger ; + nif:endIndex "6866"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "3040"^^xsd:nonNegativeInteger ; + nif:endIndex "3042"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "afternoon" ; + nif:beginIndex "7336"^^xsd:nonNegativeInteger ; + nif:endIndex "7345"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "we" ; + nif:beginIndex "2508"^^xsd:nonNegativeInteger ; + nif:endIndex "2510"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "word" ; + nif:beginIndex "9522"^^xsd:nonNegativeInteger ; + nif:endIndex "9526"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "expected" ; + nif:beginIndex "7515"^^xsd:nonNegativeInteger ; + nif:endIndex "7523"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "number" ; + nif:beginIndex "744"^^xsd:nonNegativeInteger ; + nif:endIndex "750"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "worked" ; + nif:beginIndex "9303"^^xsd:nonNegativeInteger ; + nif:endIndex "9309"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "3479"^^xsd:nonNegativeInteger ; + nif:endIndex "3483"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "lacking" ; + nif:beginIndex "1515"^^xsd:nonNegativeInteger ; + nif:endIndex "1522"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "2094"^^xsd:nonNegativeInteger ; + nif:endIndex "2100"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "over-all" ; + nif:beginIndex "248"^^xsd:nonNegativeInteger ; + nif:endIndex "256"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4190"^^xsd:nonNegativeInteger ; + nif:endIndex "4193"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "723"^^xsd:nonNegativeInteger ; + nif:endIndex "725"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "685"^^xsd:nonNegativeInteger ; + nif:endIndex "687"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "primary" ; + nif:beginIndex "5339"^^xsd:nonNegativeInteger ; + nif:endIndex "5346"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "9584"^^xsd:nonNegativeInteger ; + nif:endIndex "9585"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "7620"^^xsd:nonNegativeInteger ; + nif:endIndex "7624"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "2181"^^xsd:nonNegativeInteger ; + nif:endIndex "2184"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "4794"^^xsd:nonNegativeInteger ; + nif:endIndex "4796"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Blue" ; + nif:beginIndex "5627"^^xsd:nonNegativeInteger ; + nif:endIndex "5631"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "construction" ; + nif:beginIndex "8267"^^xsd:nonNegativeInteger ; + nif:endIndex "8279"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "extern" ; + nif:beginIndex "3873"^^xsd:nonNegativeInteger ; + nif:endIndex "3879"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "794"^^xsd:nonNegativeInteger ; + nif:endIndex "798"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "did" ; + nif:beginIndex "3532"^^xsd:nonNegativeInteger ; + nif:endIndex "3535"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "12147"^^xsd:nonNegativeInteger ; + nif:endIndex "12150"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Hartsfield" ; + nif:beginIndex "5048"^^xsd:nonNegativeInteger ; + nif:endIndex "5058"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "518"^^xsd:nonNegativeInteger ; + nif:endIndex "520"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "effective" ; + nif:beginIndex "1828"^^xsd:nonNegativeInteger ; + nif:endIndex "1837"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "4754"^^xsd:nonNegativeInteger ; + nif:endIndex "4757"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "contracts" ; + nif:beginIndex "8033"^^xsd:nonNegativeInteger ; + nif:endIndex "8042"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "852"^^xsd:nonNegativeInteger ; + nif:endIndex "855"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "out" ; + nif:beginIndex "9601"^^xsd:nonNegativeInteger ; + nif:endIndex "9604"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "City" ; + nif:beginIndex "1467"^^xsd:nonNegativeInteger ; + nif:endIndex "1471"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Robert Snodgrass, state GOP chairman, said a meeting held Tuesday night in Blue Ridge brought enthusiastic responses from the audience." ; + nif:beginIndex "5552"^^xsd:nonNegativeInteger ; + nif:endIndex "5687"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "6956"^^xsd:nonNegativeInteger ; + nif:endIndex "6960"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Jury" ; + nif:beginIndex "24"^^xsd:nonNegativeInteger ; + nif:endIndex "28"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "10861"^^xsd:nonNegativeInteger ; + nif:endIndex "10863"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "$3" ; + nif:beginIndex "8190"^^xsd:nonNegativeInteger ; + nif:endIndex "8192"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "let" ; + nif:beginIndex "8043"^^xsd:nonNegativeInteger ; + nif:endIndex "8046"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "7997"^^xsd:nonNegativeInteger ; + nif:endIndex "7998"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "friendly" ; + nif:beginIndex "7947"^^xsd:nonNegativeInteger ; + nif:endIndex "7955"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "However" ; + nif:beginIndex "1318"^^xsd:nonNegativeInteger ; + nif:endIndex "1325"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Despite the warning, there was a unanimous vote to enter a candidate, according to Republicans who attended." ; + nif:beginIndex "6082"^^xsd:nonNegativeInteger ; + nif:endIndex "6190"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "2814"^^xsd:nonNegativeInteger ; + nif:endIndex "2816"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "9322"^^xsd:nonNegativeInteger ; + nif:endIndex "9326"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "laws" ; + nif:beginIndex "981"^^xsd:nonNegativeInteger ; + nif:endIndex "985"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "1353"^^xsd:nonNegativeInteger ; + nif:endIndex "1355"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "then" ; + nif:beginIndex "8003"^^xsd:nonNegativeInteger ; + nif:endIndex "8007"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2742"^^xsd:nonNegativeInteger ; + nif:endIndex "2745"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "2179"^^xsd:nonNegativeInteger ; + nif:endIndex "2181"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "1206"^^xsd:nonNegativeInteger ; + nif:endIndex "1209"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "1172"^^xsd:nonNegativeInteger ; + nif:endIndex "1177"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "sheriff" ; + nif:beginIndex "12197"^^xsd:nonNegativeInteger ; + nif:endIndex "12204"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "duty" ; + nif:beginIndex "3914"^^xsd:nonNegativeInteger ; + nif:endIndex "3918"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "registration" ; + nif:beginIndex "839"^^xsd:nonNegativeInteger ; + nif:endIndex "851"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "2003"^^xsd:nonNegativeInteger ; + nif:endIndex "2006"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Congress" ; + nif:beginIndex "10506"^^xsd:nonNegativeInteger ; + nif:endIndex "10514"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jurors" ; + nif:beginIndex "2609"^^xsd:nonNegativeInteger ; + nif:endIndex "2615"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "allowances" ; + nif:beginIndex "9008"^^xsd:nonNegativeInteger ; + nif:endIndex "9018"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "funds" ; + nif:beginIndex "10060"^^xsd:nonNegativeInteger ; + nif:endIndex "10065"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "seek" ; + nif:beginIndex "9348"^^xsd:nonNegativeInteger ; + nif:endIndex "9352"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ballot" ; + nif:beginIndex "6637"^^xsd:nonNegativeInteger ; + nif:endIndex "6643"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "apparently" ; + nif:beginIndex "8318"^^xsd:nonNegativeInteger ; + nif:endIndex "8328"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "1325"^^xsd:nonNegativeInteger ; + nif:endIndex "1326"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "11183"^^xsd:nonNegativeInteger ; + nif:endIndex "11186"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1" ; + nif:beginIndex "3441"^^xsd:nonNegativeInteger ; + nif:endIndex "3442"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "couple" ; + nif:beginIndex "4486"^^xsd:nonNegativeInteger ; + nif:endIndex "4492"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "9127"^^xsd:nonNegativeInteger ; + nif:endIndex "9129"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "alternative" ; + nif:beginIndex "6441"^^xsd:nonNegativeInteger ; + nif:endIndex "6452"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "6150"^^xsd:nonNegativeInteger ; + nif:endIndex "6151"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "5281"^^xsd:nonNegativeInteger ; + nif:endIndex "5283"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "issue" ; + nif:beginIndex "7663"^^xsd:nonNegativeInteger ; + nif:endIndex "7668"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "4591"^^xsd:nonNegativeInteger ; + nif:endIndex "4593"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "state" ; + nif:beginIndex "7928"^^xsd:nonNegativeInteger ; + nif:endIndex "7933"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "5184"^^xsd:nonNegativeInteger ; + nif:endIndex "5186"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "1279"^^xsd:nonNegativeInteger ; + nif:endIndex "1281"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "11962"^^xsd:nonNegativeInteger ; + nif:endIndex "11967"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "million" ; + nif:beginIndex "8603"^^xsd:nonNegativeInteger ; + nif:endIndex "8610"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "November" ; + nif:beginIndex "9630"^^xsd:nonNegativeInteger ; + nif:endIndex "9638"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "off" ; + nif:beginIndex "8472"^^xsd:nonNegativeInteger ; + nif:endIndex "8475"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "1892"^^xsd:nonNegativeInteger ; + nif:endIndex "1894"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Highway" ; + nif:beginIndex "7784"^^xsd:nonNegativeInteger ; + nif:endIndex "7791"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "``Only a relative handful of such reports was received'', the jury said, ``considering the widespread interest in the election, the number of voters and the size of this city''." ; + nif:beginIndex "612"^^xsd:nonNegativeInteger ; + nif:endIndex "789"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4689"^^xsd:nonNegativeInteger ; + nif:endIndex "4690"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "3248"^^xsd:nonNegativeInteger ; + nif:endIndex "3251"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "5074"^^xsd:nonNegativeInteger ; + nif:endIndex "5076"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11481"^^xsd:nonNegativeInteger ; + nif:endIndex "11482"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7115"^^xsd:nonNegativeInteger ; + nif:endIndex "7117"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "meeting" ; + nif:beginIndex "5904"^^xsd:nonNegativeInteger ; + nif:endIndex "5911"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "these" ; + nif:beginIndex "2420"^^xsd:nonNegativeInteger ; + nif:endIndex "2425"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "periodic" ; + nif:beginIndex "3587"^^xsd:nonNegativeInteger ; + nif:endIndex "3595"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "disproportionate" ; + nif:beginIndex "2668"^^xsd:nonNegativeInteger ; + nif:endIndex "2684"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "time" ; + nif:beginIndex "8432"^^xsd:nonNegativeInteger ; + nif:endIndex "8436"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "8337"^^xsd:nonNegativeInteger ; + nif:endIndex "8339"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "added" ; + nif:beginIndex "10293"^^xsd:nonNegativeInteger ; + nif:endIndex "10298"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "three" ; + nif:beginIndex "7041"^^xsd:nonNegativeInteger ; + nif:endIndex "7046"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "9313"^^xsd:nonNegativeInteger ; + nif:endIndex "9314"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "3930"^^xsd:nonNegativeInteger ; + nif:endIndex "3932"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10039"^^xsd:nonNegativeInteger ; + nif:endIndex "10040"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "5046"^^xsd:nonNegativeInteger ; + nif:endIndex "5047"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Schley" ; + nif:beginIndex "8842"^^xsd:nonNegativeInteger ; + nif:endIndex "8848"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "11533"^^xsd:nonNegativeInteger ; + nif:endIndex "11535"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "school" ; + nif:beginIndex "10935"^^xsd:nonNegativeInteger ; + nif:endIndex "10941"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "9327"^^xsd:nonNegativeInteger ; + nif:endIndex "9330"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Mayor William B. Hartsfield filed suit for divorce from his wife, Pearl Williams Hartsfield, in Fulton Superior Court Friday." ; + nif:beginIndex "4319"^^xsd:nonNegativeInteger ; + nif:endIndex "4444"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ones" ; + nif:beginIndex "8458"^^xsd:nonNegativeInteger ; + nif:endIndex "8462"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "urged" ; + nif:beginIndex "1756"^^xsd:nonNegativeInteger ; + nif:endIndex "1761"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "calls" ; + nif:beginIndex "11820"^^xsd:nonNegativeInteger ; + nif:endIndex "11825"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "expended" ; + nif:beginIndex "10082"^^xsd:nonNegativeInteger ; + nif:endIndex "10090"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "5856"^^xsd:nonNegativeInteger ; + nif:endIndex "5860"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "4415"^^xsd:nonNegativeInteger ; + nif:endIndex "4421"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "James" ; + nif:beginIndex "5709"^^xsd:nonNegativeInteger ; + nif:endIndex "5714"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "praise" ; + nif:beginIndex "296"^^xsd:nonNegativeInteger ; + nif:endIndex "302"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "getting" ; + nif:beginIndex "5442"^^xsd:nonNegativeInteger ; + nif:endIndex "5449"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "reelection" ; + nif:beginIndex "5406"^^xsd:nonNegativeInteger ; + nif:endIndex "5416"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "general" ; + nif:beginIndex "2101"^^xsd:nonNegativeInteger ; + nif:endIndex "2108"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1" ; + nif:beginIndex "3773"^^xsd:nonNegativeInteger ; + nif:endIndex "3774"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "end" ; + nif:beginIndex "1013"^^xsd:nonNegativeInteger ; + nif:endIndex "1016"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "campaign" ; + nif:beginIndex "11737"^^xsd:nonNegativeInteger ; + nif:endIndex "11745"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "3375"^^xsd:nonNegativeInteger ; + nif:endIndex "3379"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7837"^^xsd:nonNegativeInteger ; + nif:endIndex "7840"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "1079"^^xsd:nonNegativeInteger ; + nif:endIndex "1081"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "issue" ; + nif:beginIndex "8179"^^xsd:nonNegativeInteger ; + nif:endIndex "8184"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Caldwell's resignation had been expected for some time." ; + nif:beginIndex "6897"^^xsd:nonNegativeInteger ; + nif:endIndex "6952"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "1907"^^xsd:nonNegativeInteger ; + nif:endIndex "1910"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "formally" ; + nif:beginIndex "10456"^^xsd:nonNegativeInteger ; + nif:endIndex "10464"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "purchasing" ; + nif:beginIndex "1149"^^xsd:nonNegativeInteger ; + nif:endIndex "1159"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "2272"^^xsd:nonNegativeInteger ; + nif:endIndex "2274"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "defeated" ; + nif:beginIndex "10912"^^xsd:nonNegativeInteger ; + nif:endIndex "10920"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3661"^^xsd:nonNegativeInteger ; + nif:endIndex "3664"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "wanted" ; + nif:beginIndex "6227"^^xsd:nonNegativeInteger ; + nif:endIndex "6233"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "10779"^^xsd:nonNegativeInteger ; + nif:endIndex "10782"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf ":" ; + nif:beginIndex "3770"^^xsd:nonNegativeInteger ; + nif:endIndex "3771"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "8284"^^xsd:nonNegativeInteger ; + nif:endIndex "8285"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "When" ; + nif:beginIndex "6191"^^xsd:nonNegativeInteger ; + nif:endIndex "6195"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "2304"^^xsd:nonNegativeInteger ; + nif:endIndex "2306"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "2971"^^xsd:nonNegativeInteger ; + nif:endIndex "2974"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "9456"^^xsd:nonNegativeInteger ; + nif:endIndex "9458"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ask" ; + nif:beginIndex "3709"^^xsd:nonNegativeInteger ; + nif:endIndex "3712"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "but" ; + nif:beginIndex "3551"^^xsd:nonNegativeInteger ; + nif:endIndex "3554"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "11546"^^xsd:nonNegativeInteger ; + nif:endIndex "11549"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "9563"^^xsd:nonNegativeInteger ; + nif:endIndex "9565"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "than" ; + nif:beginIndex "7036"^^xsd:nonNegativeInteger ; + nif:endIndex "7040"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "6289"^^xsd:nonNegativeInteger ; + nif:endIndex "6292"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "6961"^^xsd:nonNegativeInteger ; + nif:endIndex "6963"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "occupation" ; + nif:beginIndex "4723"^^xsd:nonNegativeInteger ; + nif:endIndex "4733"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7554"^^xsd:nonNegativeInteger ; + nif:endIndex "7556"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "4952"^^xsd:nonNegativeInteger ; + nif:endIndex "4954"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "10793"^^xsd:nonNegativeInteger ; + nif:endIndex "10801"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "A" ; + nif:beginIndex "9428"^^xsd:nonNegativeInteger ; + nif:endIndex "9429"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "8905"^^xsd:nonNegativeInteger ; + nif:endIndex "8907"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "6272"^^xsd:nonNegativeInteger ; + nif:endIndex "6273"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "8738"^^xsd:nonNegativeInteger ; + nif:endIndex "8739"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "also" ; + nif:beginIndex "2724"^^xsd:nonNegativeInteger ; + nif:endIndex "2728"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10875"^^xsd:nonNegativeInteger ; + nif:endIndex "10876"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "104"^^xsd:nonNegativeInteger ; + nif:endIndex "106"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "2833"^^xsd:nonNegativeInteger ; + nif:endIndex "2835"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "outright" ; + nif:beginIndex "9139"^^xsd:nonNegativeInteger ; + nif:endIndex "9147"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "charged" ; + nif:beginIndex "432"^^xsd:nonNegativeInteger ; + nif:endIndex "439"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "4924"^^xsd:nonNegativeInteger ; + nif:endIndex "4925"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Implementation" ; + nif:beginIndex "1661"^^xsd:nonNegativeInteger ; + nif:endIndex "1675"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "2144"^^xsd:nonNegativeInteger ; + nif:endIndex "2145"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "recommended" ; + nif:beginIndex "3754"^^xsd:nonNegativeInteger ; + nif:endIndex "3765"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "more" ; + nif:beginIndex "6246"^^xsd:nonNegativeInteger ; + nif:endIndex "6250"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "into" ; + nif:beginIndex "6013"^^xsd:nonNegativeInteger ; + nif:endIndex "6017"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "other" ; + nif:beginIndex "1094"^^xsd:nonNegativeInteger ; + nif:endIndex "1099"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "worth" ; + nif:beginIndex "8215"^^xsd:nonNegativeInteger ; + nif:endIndex "8220"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "363"^^xsd:nonNegativeInteger ; + nif:endIndex "366"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6692"^^xsd:nonNegativeInteger ; + nif:endIndex "6695"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "receive" ; + nif:beginIndex "2556"^^xsd:nonNegativeInteger ; + nif:endIndex "2563"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "9957"^^xsd:nonNegativeInteger ; + nif:endIndex "9958"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "As" ; + nif:beginIndex "9489"^^xsd:nonNegativeInteger ; + nif:endIndex "9491"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "worth" ; + nif:beginIndex "8611"^^xsd:nonNegativeInteger ; + nif:endIndex "8616"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "campaign" ; + nif:beginIndex "7127"^^xsd:nonNegativeInteger ; + nif:endIndex "7135"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "then" ; + nif:beginIndex "8661"^^xsd:nonNegativeInteger ; + nif:endIndex "8665"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "former" ; + nif:beginIndex "11550"^^xsd:nonNegativeInteger ; + nif:endIndex "11556"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "11179"^^xsd:nonNegativeInteger ; + nif:endIndex "11182"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "put" ; + nif:beginIndex "10768"^^xsd:nonNegativeInteger ; + nif:endIndex "10771"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "home" ; + nif:beginIndex "7232"^^xsd:nonNegativeInteger ; + nif:endIndex "7236"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "he" ; + nif:beginIndex "11653"^^xsd:nonNegativeInteger ; + nif:endIndex "11655"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "3311"^^xsd:nonNegativeInteger ; + nif:endIndex "3315"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "9470"^^xsd:nonNegativeInteger ; + nif:endIndex "9472"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department" ; + nif:beginIndex "2168"^^xsd:nonNegativeInteger ; + nif:endIndex "2178"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "city" ; + nif:beginIndex "3969"^^xsd:nonNegativeInteger ; + nif:endIndex "3973"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "future" ; + nif:beginIndex "2528"^^xsd:nonNegativeInteger ; + nif:endIndex "2534"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "just" ; + nif:beginIndex "11290"^^xsd:nonNegativeInteger ; + nif:endIndex "11294"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "178"^^xsd:nonNegativeInteger ; + nif:endIndex "180"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "3470"^^xsd:nonNegativeInteger ; + nif:endIndex "3471"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "its" ; + nif:beginIndex "2800"^^xsd:nonNegativeInteger ; + nif:endIndex "2803"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "office" ; + nif:beginIndex "5237"^^xsd:nonNegativeInteger ; + nif:endIndex "5243"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "1461"^^xsd:nonNegativeInteger ; + nif:endIndex "1462"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "an" ; + nif:beginIndex "41"^^xsd:nonNegativeInteger ; + nif:endIndex "43"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "reportedly" ; + nif:beginIndex "11780"^^xsd:nonNegativeInteger ; + nif:endIndex "11790"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "citizens" ; + nif:beginIndex "11941"^^xsd:nonNegativeInteger ; + nif:endIndex "11949"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "increase" ; + nif:beginIndex "8988"^^xsd:nonNegativeInteger ; + nif:endIndex "8996"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "elaborate" ; + nif:beginIndex "3540"^^xsd:nonNegativeInteger ; + nif:endIndex "3549"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "3298"^^xsd:nonNegativeInteger ; + nif:endIndex "3300"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Sunday" ; + nif:beginIndex "9032"^^xsd:nonNegativeInteger ; + nif:endIndex "9038"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "personnel" ; + nif:beginIndex "1547"^^xsd:nonNegativeInteger ; + nif:endIndex "1556"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "440"^^xsd:nonNegativeInteger ; + nif:endIndex "442"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "with" ; + nif:beginIndex "2285"^^xsd:nonNegativeInteger ; + nif:endIndex "2289"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "elected" ; + nif:beginIndex "5826"^^xsd:nonNegativeInteger ; + nif:endIndex "5833"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "6101"^^xsd:nonNegativeInteger ; + nif:endIndex "6102"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "This" ; + nif:beginIndex "11174"^^xsd:nonNegativeInteger ; + nif:endIndex "11178"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "5207"^^xsd:nonNegativeInteger ; + nif:endIndex "5208"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "10476"^^xsd:nonNegativeInteger ; + nif:endIndex "10477"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "765"^^xsd:nonNegativeInteger ; + nif:endIndex "768"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "10256"^^xsd:nonNegativeInteger ; + nif:endIndex "10258"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4517"^^xsd:nonNegativeInteger ; + nif:endIndex "4518"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "2535"^^xsd:nonNegativeInteger ; + nif:endIndex "2541"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "7251"^^xsd:nonNegativeInteger ; + nif:endIndex "7253"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "law" ; + nif:beginIndex "6382"^^xsd:nonNegativeInteger ; + nif:endIndex "6385"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "so" ; + nif:beginIndex "1843"^^xsd:nonNegativeInteger ; + nif:endIndex "1845"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "members" ; + nif:beginIndex "10180"^^xsd:nonNegativeInteger ; + nif:endIndex "10187"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "appraisers" ; + nif:beginIndex "2836"^^xsd:nonNegativeInteger ; + nif:endIndex "2846"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "seen" ; + nif:beginIndex "2185"^^xsd:nonNegativeInteger ; + nif:endIndex "2189"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "citizens" ; + nif:beginIndex "3113"^^xsd:nonNegativeInteger ; + nif:endIndex "3121"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8559"^^xsd:nonNegativeInteger ; + nif:endIndex "8562"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "take" ; + nif:beginIndex "1624"^^xsd:nonNegativeInteger ; + nif:endIndex "1628"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "should" ; + nif:beginIndex "1373"^^xsd:nonNegativeInteger ; + nif:endIndex "1379"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "4855"^^xsd:nonNegativeInteger ; + nif:endIndex "4859"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "1032"^^xsd:nonNegativeInteger ; + nif:endIndex "1035"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "its" ; + nif:beginIndex "2993"^^xsd:nonNegativeInteger ; + nif:endIndex "2996"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "number" ; + nif:beginIndex "1084"^^xsd:nonNegativeInteger ; + nif:endIndex "1090"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "pointed" ; + nif:beginIndex "9593"^^xsd:nonNegativeInteger ; + nif:endIndex "9600"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "remedy" ; + nif:beginIndex "1638"^^xsd:nonNegativeInteger ; + nif:endIndex "1644"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "heavily" ; + nif:beginIndex "8089"^^xsd:nonNegativeInteger ; + nif:endIndex "8096"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "5311"^^xsd:nonNegativeInteger ; + nif:endIndex "5312"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "technical" ; + nif:beginIndex "9270"^^xsd:nonNegativeInteger ; + nif:endIndex "9279"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "polls" ; + nif:beginIndex "11999"^^xsd:nonNegativeInteger ; + nif:endIndex "12004"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "9605"^^xsd:nonNegativeInteger ; + nif:endIndex "9609"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "priority" ; + nif:beginIndex "10620"^^xsd:nonNegativeInteger ; + nif:endIndex "10628"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "handful" ; + nif:beginIndex "630"^^xsd:nonNegativeInteger ; + nif:endIndex "637"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "3571"^^xsd:nonNegativeInteger ; + nif:endIndex "3576"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "3766"^^xsd:nonNegativeInteger ; + nif:endIndex "3770"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "9821"^^xsd:nonNegativeInteger ; + nif:endIndex "9823"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7633"^^xsd:nonNegativeInteger ; + nif:endIndex "7636"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "2194"^^xsd:nonNegativeInteger ; + nif:endIndex "2196"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Sheriff" ; + nif:beginIndex "12089"^^xsd:nonNegativeInteger ; + nif:endIndex "12096"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "with" ; + nif:beginIndex "11118"^^xsd:nonNegativeInteger ; + nif:endIndex "11122"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "race" ; + nif:beginIndex "8516"^^xsd:nonNegativeInteger ; + nif:endIndex "8520"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "2459"^^xsd:nonNegativeInteger ; + nif:endIndex "2461"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "two" ; + nif:beginIndex "6437"^^xsd:nonNegativeInteger ; + nif:endIndex "6440"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "1499"^^xsd:nonNegativeInteger ; + nif:endIndex "1503"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Highway" ; + nif:beginIndex "8118"^^xsd:nonNegativeInteger ; + nif:endIndex "8125"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "candidate" ; + nif:beginIndex "5482"^^xsd:nonNegativeInteger ; + nif:endIndex "5491"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "5747"^^xsd:nonNegativeInteger ; + nif:endIndex "5750"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Atlanta's" ; + nif:beginIndex "61"^^xsd:nonNegativeInteger ; + nif:endIndex "70"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "After" ; + nif:beginIndex "10666"^^xsd:nonNegativeInteger ; + nif:endIndex "10671"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "coolest" ; + nif:beginIndex "10785"^^xsd:nonNegativeInteger ; + nif:endIndex "10792"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "race" ; + nif:beginIndex "6268"^^xsd:nonNegativeInteger ; + nif:endIndex "6272"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "whether" ; + nif:beginIndex "6216"^^xsd:nonNegativeInteger ; + nif:endIndex "6223"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "8974"^^xsd:nonNegativeInteger ; + nif:endIndex "8975"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Bellwood" ; + nif:beginIndex "4232"^^xsd:nonNegativeInteger ; + nif:endIndex "4240"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "elected" ; + nif:beginIndex "3252"^^xsd:nonNegativeInteger ; + nif:endIndex "3259"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3612"^^xsd:nonNegativeInteger ; + nif:endIndex "3615"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Marvin" ; + nif:beginIndex "8671"^^xsd:nonNegativeInteger ; + nif:endIndex "8677"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Judge" ; + nif:beginIndex "465"^^xsd:nonNegativeInteger ; + nif:endIndex "470"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "new" ; + nif:beginIndex "10717"^^xsd:nonNegativeInteger ; + nif:endIndex "10720"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "1508"^^xsd:nonNegativeInteger ; + nif:endIndex "1509"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The jury further said in term-end presentments that the City Executive Committee, which had over-all charge of the election, ``deserves the praise and thanks of the City of Atlanta'' for the manner in which the election was conducted." ; + nif:beginIndex "156"^^xsd:nonNegativeInteger ; + nif:endIndex "390"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "efficiency" ; + nif:beginIndex "1411"^^xsd:nonNegativeInteger ; + nif:endIndex "1421"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "four" ; + nif:beginIndex "11637"^^xsd:nonNegativeInteger ; + nif:endIndex "11641"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "gubernatorial" ; + nif:beginIndex "7063"^^xsd:nonNegativeInteger ; + nif:endIndex "7076"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "major" ; + nif:beginIndex "2068"^^xsd:nonNegativeInteger ; + nif:endIndex "2073"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "2715"^^xsd:nonNegativeInteger ; + nif:endIndex "2718"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bonds" ; + nif:beginIndex "8643"^^xsd:nonNegativeInteger ; + nif:endIndex "8648"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "9492"^^xsd:nonNegativeInteger ; + nif:endIndex "9494"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "pay" ; + nif:beginIndex "9707"^^xsd:nonNegativeInteger ; + nif:endIndex "9710"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "2643"^^xsd:nonNegativeInteger ; + nif:endIndex "2647"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "2" ; + nif:beginIndex "6648"^^xsd:nonNegativeInteger ; + nif:endIndex "6649"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "8063"^^xsd:nonNegativeInteger ; + nif:endIndex "8065"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "292"^^xsd:nonNegativeInteger ; + nif:endIndex "295"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "hold" ; + nif:beginIndex "6671"^^xsd:nonNegativeInteger ; + nif:endIndex "6675"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "inure" ; + nif:beginIndex "1252"^^xsd:nonNegativeInteger ; + nif:endIndex "1257"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Georgia's" ; + nif:beginIndex "1679"^^xsd:nonNegativeInteger ; + nif:endIndex "1688"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "evidence" ; + nif:beginIndex "109"^^xsd:nonNegativeInteger ; + nif:endIndex "117"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "two" ; + nif:beginIndex "1361"^^xsd:nonNegativeInteger ; + nif:endIndex "1364"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "1569"^^xsd:nonNegativeInteger ; + nif:endIndex "1571"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "8617"^^xsd:nonNegativeInteger ; + nif:endIndex "8619"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4228"^^xsd:nonNegativeInteger ; + nif:endIndex "4231"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "In" ; + nif:beginIndex "5886"^^xsd:nonNegativeInteger ; + nif:endIndex "5888"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "insure" ; + nif:beginIndex "11158"^^xsd:nonNegativeInteger ; + nif:endIndex "11164"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "unit" ; + nif:beginIndex "6703"^^xsd:nonNegativeInteger ; + nif:endIndex "6707"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Texas" ; + nif:beginIndex "5834"^^xsd:nonNegativeInteger ; + nif:endIndex "5839"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Meanwhile" ; + nif:beginIndex "7748"^^xsd:nonNegativeInteger ; + nif:endIndex "7757"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "carry" ; + nif:beginIndex "12043"^^xsd:nonNegativeInteger ; + nif:endIndex "12048"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "1459"^^xsd:nonNegativeInteger ; + nif:endIndex "1461"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "740"^^xsd:nonNegativeInteger ; + nif:endIndex "743"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "well" ; + nif:beginIndex "1192"^^xsd:nonNegativeInteger ; + nif:endIndex "1196"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "(2)" ; + nif:beginIndex "3934"^^xsd:nonNegativeInteger ; + nif:endIndex "3937"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "--" ; + nif:beginIndex "6715"^^xsd:nonNegativeInteger ; + nif:endIndex "6717"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Five per cent of the voters in each county must sign petitions requesting that the Republicans be allowed to place names of candidates on the general election ballot, or 2" ; + nif:beginIndex "6478"^^xsd:nonNegativeInteger ; + nif:endIndex "6649"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "10672"^^xsd:nonNegativeInteger ; + nif:endIndex "10673"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "3984"^^xsd:nonNegativeInteger ; + nif:endIndex "3986"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "826"^^xsd:nonNegativeInteger ; + nif:endIndex "828"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "3521"^^xsd:nonNegativeInteger ; + nif:endIndex "3522"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "10284"^^xsd:nonNegativeInteger ; + nif:endIndex "10285"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ambiguous" ; + nif:beginIndex "909"^^xsd:nonNegativeInteger ; + nif:endIndex "918"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "realize" ; + nif:beginIndex "2378"^^xsd:nonNegativeInteger ; + nif:endIndex "2385"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "There" ; + nif:beginIndex "12213"^^xsd:nonNegativeInteger ; + nif:endIndex "12218"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "7345"^^xsd:nonNegativeInteger ; + nif:endIndex "7346"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1261"^^xsd:nonNegativeInteger ; + nif:endIndex "1264"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "9230"^^xsd:nonNegativeInteger ; + nif:endIndex "9235"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3302"^^xsd:nonNegativeInteger ; + nif:endIndex "3305"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "them" ; + nif:beginIndex "1114"^^xsd:nonNegativeInteger ; + nif:endIndex "1118"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "2388"^^xsd:nonNegativeInteger ; + nif:endIndex "2389"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Callan" ; + nif:beginIndex "11597"^^xsd:nonNegativeInteger ; + nif:endIndex "11603"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "When" ; + nif:beginIndex "7054"^^xsd:nonNegativeInteger ; + nif:endIndex "7058"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "674"^^xsd:nonNegativeInteger ; + nif:endIndex "678"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "12055"^^xsd:nonNegativeInteger ; + nif:endIndex "12058"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "12004"^^xsd:nonNegativeInteger ; + nif:endIndex "12005"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Education" ; + nif:beginIndex "10581"^^xsd:nonNegativeInteger ; + nif:endIndex "10590"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "mental" ; + nif:beginIndex "4466"^^xsd:nonNegativeInteger ; + nif:endIndex "4472"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "future" ; + nif:beginIndex "9722"^^xsd:nonNegativeInteger ; + nif:endIndex "9728"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "vote" ; + nif:beginIndex "9699"^^xsd:nonNegativeInteger ; + nif:endIndex "9703"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "9107"^^xsd:nonNegativeInteger ; + nif:endIndex "9109"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "recommended" ; + nif:beginIndex "1719"^^xsd:nonNegativeInteger ; + nif:endIndex "1730"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "traditional" ; + nif:beginIndex "7536"^^xsd:nonNegativeInteger ; + nif:endIndex "7547"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "2030"^^xsd:nonNegativeInteger ; + nif:endIndex "2032"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "issued" ; + nif:beginIndex "8419"^^xsd:nonNegativeInteger ; + nif:endIndex "8425"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "calls" ; + nif:beginIndex "11508"^^xsd:nonNegativeInteger ; + nif:endIndex "11513"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "fund" ; + nif:beginIndex "8383"^^xsd:nonNegativeInteger ; + nif:endIndex "8387"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The largest hurdle the Republicans would have to face is a state law which says that before making a first race, one of two alternative courses must be taken: 1" ; + nif:beginIndex "6317"^^xsd:nonNegativeInteger ; + nif:endIndex "6477"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "meeting" ; + nif:beginIndex "5597"^^xsd:nonNegativeInteger ; + nif:endIndex "5604"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "reports" ; + nif:beginIndex "498"^^xsd:nonNegativeInteger ; + nif:endIndex "505"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "City" ; + nif:beginIndex "321"^^xsd:nonNegativeInteger ; + nif:endIndex "325"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Davis" ; + nif:beginIndex "10870"^^xsd:nonNegativeInteger ; + nif:endIndex "10875"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8345"^^xsd:nonNegativeInteger ; + nif:endIndex "8348"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "wind" ; + nif:beginIndex "7187"^^xsd:nonNegativeInteger ; + nif:endIndex "7191"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6336"^^xsd:nonNegativeInteger ; + nif:endIndex "6339"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "matters" ; + nif:beginIndex "3736"^^xsd:nonNegativeInteger ; + nif:endIndex "3743"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "11916"^^xsd:nonNegativeInteger ; + nif:endIndex "11919"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "done" ; + nif:beginIndex "9070"^^xsd:nonNegativeInteger ; + nif:endIndex "9074"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "I" ; + nif:beginIndex "10802"^^xsd:nonNegativeInteger ; + nif:endIndex "10803"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "son" ; + nif:beginIndex "4531"^^xsd:nonNegativeInteger ; + nif:endIndex "4534"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "1135"^^xsd:nonNegativeInteger ; + nif:endIndex "1141"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "employes" ; + nif:beginIndex "4096"^^xsd:nonNegativeInteger ; + nif:endIndex "4104"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "7762"^^xsd:nonNegativeInteger ; + nif:endIndex "7765"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "vote" ; + nif:beginIndex "9102"^^xsd:nonNegativeInteger ; + nif:endIndex "9106"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "promise" ; + nif:beginIndex "12151"^^xsd:nonNegativeInteger ; + nif:endIndex "12158"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "day" ; + nif:beginIndex "8984"^^xsd:nonNegativeInteger ; + nif:endIndex "8987"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "less" ; + nif:beginIndex "2466"^^xsd:nonNegativeInteger ; + nif:endIndex "2470"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "practices" ; + nif:beginIndex "3624"^^xsd:nonNegativeInteger ; + nif:endIndex "3633"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "W." ; + nif:beginIndex "5715"^^xsd:nonNegativeInteger ; + nif:endIndex "5717"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "3853"^^xsd:nonNegativeInteger ; + nif:endIndex "3854"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "county" ; + nif:beginIndex "6514"^^xsd:nonNegativeInteger ; + nif:endIndex "6520"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "steps" ; + nif:beginIndex "1629"^^xsd:nonNegativeInteger ; + nif:endIndex "1634"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "calmest" ; + nif:beginIndex "11196"^^xsd:nonNegativeInteger ; + nif:endIndex "11203"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "funds" ; + nif:beginIndex "2426"^^xsd:nonNegativeInteger ; + nif:endIndex "2431"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "million" ; + nif:beginIndex "8193"^^xsd:nonNegativeInteger ; + nif:endIndex "8200"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "sign" ; + nif:beginIndex "6526"^^xsd:nonNegativeInteger ; + nif:endIndex "6530"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "1788"^^xsd:nonNegativeInteger ; + nif:endIndex "1790"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "listed" ; + nif:beginIndex "4772"^^xsd:nonNegativeInteger ; + nif:endIndex "4778"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "3813"^^xsd:nonNegativeInteger ; + nif:endIndex "3815"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4569"^^xsd:nonNegativeInteger ; + nif:endIndex "4570"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "11410"^^xsd:nonNegativeInteger ; + nif:endIndex "11412"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "3467"^^xsd:nonNegativeInteger ; + nif:endIndex "3469"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "work" ; + nif:beginIndex "8280"^^xsd:nonNegativeInteger ; + nif:endIndex "8284"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9574"^^xsd:nonNegativeInteger ; + nif:endIndex "9577"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1119"^^xsd:nonNegativeInteger ; + nif:endIndex "1122"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "years" ; + nif:beginIndex "7047"^^xsd:nonNegativeInteger ; + nif:endIndex "7052"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "school" ; + nif:beginIndex "11564"^^xsd:nonNegativeInteger ; + nif:endIndex "11570"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11928"^^xsd:nonNegativeInteger ; + nif:endIndex "11929"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "calls" ; + nif:beginIndex "11886"^^xsd:nonNegativeInteger ; + nif:endIndex "11891"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "2488"^^xsd:nonNegativeInteger ; + nif:endIndex "2490"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4443"^^xsd:nonNegativeInteger ; + nif:endIndex "4444"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "10933"^^xsd:nonNegativeInteger ; + nif:endIndex "10934"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7059"^^xsd:nonNegativeInteger ; + nif:endIndex "7062"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "10968"^^xsd:nonNegativeInteger ; + nif:endIndex "10971"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "no" ; + nif:beginIndex "6283"^^xsd:nonNegativeInteger ; + nif:endIndex "6285"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "While emphasizing that technical details were not fully worked out, Pelham said his resolution would seek to set aside the privilege resolution which the House voted through 87-31." ; + nif:beginIndex "9247"^^xsd:nonNegativeInteger ; + nif:endIndex "9427"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Nevertheless" ; + nif:beginIndex "2492"^^xsd:nonNegativeInteger ; + nif:endIndex "2504"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "483"^^xsd:nonNegativeInteger ; + nif:endIndex "485"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "conducted" ; + nif:beginIndex "380"^^xsd:nonNegativeInteger ; + nif:endIndex "389"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "recommended" ; + nif:beginIndex "3380"^^xsd:nonNegativeInteger ; + nif:endIndex "3391"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "8252"^^xsd:nonNegativeInteger ; + nif:endIndex "8255"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "manner" ; + nif:beginIndex "347"^^xsd:nonNegativeInteger ; + nif:endIndex "353"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "director" ; + nif:beginIndex "6824"^^xsd:nonNegativeInteger ; + nif:endIndex "6832"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Regarding" ; + nif:beginIndex "3317"^^xsd:nonNegativeInteger ; + nif:endIndex "3326"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "2252"^^xsd:nonNegativeInteger ; + nif:endIndex "2254"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "campaign" ; + nif:beginIndex "11393"^^xsd:nonNegativeInteger ; + nif:endIndex "11401"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "326"^^xsd:nonNegativeInteger ; + nif:endIndex "328"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7524"^^xsd:nonNegativeInteger ; + nif:endIndex "7526"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The jury said it found the court ``has incorporated into its operating procedures the recommendations'' of two previous grand juries, the Atlanta Bar Association and an interim citizens committee." ; + nif:beginIndex "2936"^^xsd:nonNegativeInteger ; + nif:endIndex "3132"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7427"^^xsd:nonNegativeInteger ; + nif:endIndex "7429"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2083"^^xsd:nonNegativeInteger ; + nif:endIndex "2086"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "8371"^^xsd:nonNegativeInteger ; + nif:endIndex "8372"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "commented" ; + nif:beginIndex "1069"^^xsd:nonNegativeInteger ; + nif:endIndex "1078"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "2140"^^xsd:nonNegativeInteger ; + nif:endIndex "2144"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "1608"^^xsd:nonNegativeInteger ; + nif:endIndex "1612"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Grand" ; + nif:beginIndex "18"^^xsd:nonNegativeInteger ; + nif:endIndex "23"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9896"^^xsd:nonNegativeInteger ; + nif:endIndex "9899"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "9195"^^xsd:nonNegativeInteger ; + nif:endIndex "9199"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "married" ; + nif:beginIndex "4497"^^xsd:nonNegativeInteger ; + nif:endIndex "4504"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "dispute" ; + nif:beginIndex "11679"^^xsd:nonNegativeInteger ; + nif:endIndex "11686"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "says" ; + nif:beginIndex "10481"^^xsd:nonNegativeInteger ; + nif:endIndex "10485"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4860"^^xsd:nonNegativeInteger ; + nif:endIndex "4863"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "2577"^^xsd:nonNegativeInteger ; + nif:endIndex "2579"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3195"^^xsd:nonNegativeInteger ; + nif:endIndex "3198"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "564"^^xsd:nonNegativeInteger ; + nif:endIndex "569"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "8221"^^xsd:nonNegativeInteger ; + nif:endIndex "8223"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "4779"^^xsd:nonNegativeInteger ; + nif:endIndex "4782"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Monday" ; + nif:beginIndex "7329"^^xsd:nonNegativeInteger ; + nif:endIndex "7335"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "incorporated" ; + nif:beginIndex "2975"^^xsd:nonNegativeInteger ; + nif:endIndex "2987"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "12226"^^xsd:nonNegativeInteger ; + nif:endIndex "12227"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "1422"^^xsd:nonNegativeInteger ; + nif:endIndex "1425"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "distribute" ; + nif:beginIndex "2197"^^xsd:nonNegativeInteger ; + nif:endIndex "2207"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "or" ; + nif:beginIndex "885"^^xsd:nonNegativeInteger ; + nif:endIndex "887"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "asked" ; + nif:beginIndex "10347"^^xsd:nonNegativeInteger ; + nif:endIndex "10352"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9796"^^xsd:nonNegativeInteger ; + nif:endIndex "9799"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "speaker" ; + nif:beginIndex "5877"^^xsd:nonNegativeInteger ; + nif:endIndex "5884"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "after" ; + nif:beginIndex "11897"^^xsd:nonNegativeInteger ; + nif:endIndex "11902"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "5536"^^xsd:nonNegativeInteger ; + nif:endIndex "5540"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "where" ; + nif:beginIndex "7240"^^xsd:nonNegativeInteger ; + nif:endIndex "7245"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "7896"^^xsd:nonNegativeInteger ; + nif:endIndex "7897"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "drop" ; + nif:beginIndex "11334"^^xsd:nonNegativeInteger ; + nif:endIndex "11338"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Wards protected" ; + nif:beginIndex "2920"^^xsd:nonNegativeInteger ; + nif:endIndex "2935"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "9236"^^xsd:nonNegativeInteger ; + nif:endIndex "9238"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "picking" ; + nif:beginIndex "5751"^^xsd:nonNegativeInteger ; + nif:endIndex "5758"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "are" ; + nif:beginIndex "10171"^^xsd:nonNegativeInteger ; + nif:endIndex "10174"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "9865"^^xsd:nonNegativeInteger ; + nif:endIndex "9867"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "under" ; + nif:beginIndex "6686"^^xsd:nonNegativeInteger ; + nif:endIndex "6691"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "11724"^^xsd:nonNegativeInteger ; + nif:endIndex "11727"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "additional" ; + nif:beginIndex "3781"^^xsd:nonNegativeInteger ; + nif:endIndex "3791"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Saturday's" ; + nif:beginIndex "11048"^^xsd:nonNegativeInteger ; + nif:endIndex "11058"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "such" ; + nif:beginIndex "641"^^xsd:nonNegativeInteger ; + nif:endIndex "645"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "10831"^^xsd:nonNegativeInteger ; + nif:endIndex "10834"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "deputies" ; + nif:beginIndex "3718"^^xsd:nonNegativeInteger ; + nif:endIndex "3726"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "--" ; + nif:beginIndex "10663"^^xsd:nonNegativeInteger ; + nif:endIndex "10665"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11852"^^xsd:nonNegativeInteger ; + nif:endIndex "11853"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1923" ; + nif:beginIndex "5203"^^xsd:nonNegativeInteger ; + nif:endIndex "5207"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "2739"^^xsd:nonNegativeInteger ; + nif:endIndex "2741"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "2506"^^xsd:nonNegativeInteger ; + nif:endIndex "2508"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "5884"^^xsd:nonNegativeInteger ; + nif:endIndex "5885"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "11142"^^xsd:nonNegativeInteger ; + nif:endIndex "11144"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "10398"^^xsd:nonNegativeInteger ; + nif:endIndex "10403"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1913" ; + nif:beginIndex "4513"^^xsd:nonNegativeInteger ; + nif:endIndex "4517"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Everything" ; + nif:beginIndex "12162"^^xsd:nonNegativeInteger ; + nif:endIndex "12172"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "raises" ; + nif:beginIndex "10646"^^xsd:nonNegativeInteger ; + nif:endIndex "10652"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "pistol" ; + nif:beginIndex "11125"^^xsd:nonNegativeInteger ; + nif:endIndex "11131"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "3369"^^xsd:nonNegativeInteger ; + nif:endIndex "3370"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Jr." ; + nif:beginIndex "4550"^^xsd:nonNegativeInteger ; + nif:endIndex "4553"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Hospital" ; + nif:beginIndex "4276"^^xsd:nonNegativeInteger ; + nif:endIndex "4284"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Democratic" ; + nif:beginIndex "10986"^^xsd:nonNegativeInteger ; + nif:endIndex "10996"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "term" ; + nif:beginIndex "5229"^^xsd:nonNegativeInteger ; + nif:endIndex "5233"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "11920"^^xsd:nonNegativeInteger ; + nif:endIndex "11928"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jail" ; + nif:beginIndex "3713"^^xsd:nonNegativeInteger ; + nif:endIndex "3717"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Legislature" ; + nif:beginIndex "9729"^^xsd:nonNegativeInteger ; + nif:endIndex "9740"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Republicans" ; + nif:beginIndex "6340"^^xsd:nonNegativeInteger ; + nif:endIndex "6351"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "under" ; + nif:beginIndex "8388"^^xsd:nonNegativeInteger ; + nif:endIndex "8393"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "2322"^^xsd:nonNegativeInteger ; + nif:endIndex "2327"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf ":" ; + nif:beginIndex "6474"^^xsd:nonNegativeInteger ; + nif:endIndex "6475"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "hurdle" ; + nif:beginIndex "6329"^^xsd:nonNegativeInteger ; + nif:endIndex "6335"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Authority" ; + nif:beginIndex "8361"^^xsd:nonNegativeInteger ; + nif:endIndex "8370"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9219"^^xsd:nonNegativeInteger ; + nif:endIndex "9222"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11348"^^xsd:nonNegativeInteger ; + nif:endIndex "11349"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "6397"^^xsd:nonNegativeInteger ; + nif:endIndex "6401"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "6434"^^xsd:nonNegativeInteger ; + nif:endIndex "6436"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "manner" ; + nif:beginIndex "3472"^^xsd:nonNegativeInteger ; + nif:endIndex "3478"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Hartsfield" ; + nif:beginIndex "4400"^^xsd:nonNegativeInteger ; + nif:endIndex "4410"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "12242"^^xsd:nonNegativeInteger ; + nif:endIndex "12244"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Miller" ; + nif:beginIndex "10697"^^xsd:nonNegativeInteger ; + nif:endIndex "10703"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "not" ; + nif:beginIndex "4875"^^xsd:nonNegativeInteger ; + nif:endIndex "4878"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "charge" ; + nif:beginIndex "257"^^xsd:nonNegativeInteger ; + nif:endIndex "263"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Davis" ; + nif:beginIndex "11764"^^xsd:nonNegativeInteger ; + nif:endIndex "11769"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "himself" ; + nif:beginIndex "11610"^^xsd:nonNegativeInteger ; + nif:endIndex "11617"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "city" ; + nif:beginIndex "1572"^^xsd:nonNegativeInteger ; + nif:endIndex "1576"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "11375"^^xsd:nonNegativeInteger ; + nif:endIndex "11377"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The new school superintendent is Harry Davis, a veteran agriculture teacher, who defeated Felix Bush, a school principal and chairman of the Miller County Democratic Executive Committee." ; + nif:beginIndex "10831"^^xsd:nonNegativeInteger ; + nif:endIndex "11017"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "3484"^^xsd:nonNegativeInteger ; + nif:endIndex "3488"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "official" ; + nif:beginIndex "5527"^^xsd:nonNegativeInteger ; + nif:endIndex "5535"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "no" ; + nif:beginIndex "106"^^xsd:nonNegativeInteger ; + nif:endIndex "108"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "state" ; + nif:beginIndex "6376"^^xsd:nonNegativeInteger ; + nif:endIndex "6381"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "purpose" ; + nif:beginIndex "3665"^^xsd:nonNegativeInteger ; + nif:endIndex "3672"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Georgia Republicans are getting strong encouragement to enter a candidate in the 1962 governor's race, a top official said Wednesday." ; + nif:beginIndex "5418"^^xsd:nonNegativeInteger ; + nif:endIndex "5551"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "encouragement" ; + nif:beginIndex "5457"^^xsd:nonNegativeInteger ; + nif:endIndex "5470"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "6374"^^xsd:nonNegativeInteger ; + nif:endIndex "6375"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "11204"^^xsd:nonNegativeInteger ; + nif:endIndex "11212"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "3749"^^xsd:nonNegativeInteger ; + nif:endIndex "3753"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "(" ; + nif:beginIndex "3772"^^xsd:nonNegativeInteger ; + nif:endIndex "3773"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "A" ; + nif:beginIndex "8286"^^xsd:nonNegativeInteger ; + nif:endIndex "8287"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "September-October" ; + nif:beginIndex "395"^^xsd:nonNegativeInteger ; + nif:endIndex "412"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "paid" ; + nif:beginIndex "8467"^^xsd:nonNegativeInteger ; + nif:endIndex "8471"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "petitions" ; + nif:beginIndex "5999"^^xsd:nonNegativeInteger ; + nif:endIndex "6008"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Context ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "12246"^^xsd:nonNegativeInteger ; + nif:isString "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced ``no evidence'' that any irregularities took place.\nThe jury further said in term-end presentments that the City Executive Committee, which had over-all charge of the election, ``deserves the praise and thanks of the City of Atlanta'' for the manner in which the election was conducted.\nThe September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible ``irregularities'' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr..\n``Only a relative handful of such reports was received'', the jury said, ``considering the widespread interest in the election, the number of voters and the size of this city''.\nThe jury said it did find that many of Georgia's registration and election laws ``are outmoded or inadequate and often ambiguous''.\nIt recommended that Fulton legislators act ``to have these laws studied and revised to the end of modernizing and improving them''.\nThe grand jury commented on a number of other topics, among them the Atlanta and Fulton County purchasing departments which it said ``are well operated and follow generally accepted practices which inure to the best interest of both governments''.\nMerger proposed\nHowever, the jury said it believes ``these two offices should be combined to achieve greater efficiency and reduce the cost of administration''.\nThe City Purchasing Department, the jury said, ``is lacking in experienced clerical personnel as a result of city personnel policies''.\nIt urged that the city ``take steps to remedy'' this problem.\nImplementation of Georgia's automobile title law was also recommended by the outgoing jury.\nIt urged that the next Legislature ``provide enabling funds and re-set the effective date so that an orderly implementation of the law may be effected''.\nThe grand jury took a swipe at the State Welfare Department's handling of federal funds granted for child welfare services in foster homes.\n``This is one of the major items in the Fulton County general assistance program'', the jury said, but the State Welfare Department ``has seen fit to distribute these funds through the welfare departments of all the counties in the state with the exception of Fulton County, which receives none of this money.\nThe jurors said they realize ``a proportionate distribution of these funds might disable this program in our less populous counties''.\nNevertheless, ``we feel that in the future Fulton County should receive some portion of these available funds'', the jurors said.\n``Failure to do this will continue to place a disproportionate burden'' on Fulton taxpayers.\nThe jury also commented on the Fulton ordinary's court which has been under fire for its practices in the appointment of appraisers, guardians and administrators and the awarding of fees and compensation.\nWards protected\nThe jury said it found the court ``has incorporated into its operating procedures the recommendations'' of two previous grand juries, the Atlanta Bar Association and an interim citizens committee.\n``These actions should serve to protect in fact and in effect the court's wards from undue costs and its appointed and elected servants from unmeritorious criticisms'', the jury said.\nRegarding Atlanta's new multi-million-dollar airport, the jury recommended ``that when the new management takes charge Jan. 1 the airport be operated in a manner that will eliminate political influences''.\nThe jury did not elaborate, but it added that ``there should be periodic surveillance of the pricing practices of the concessionaires for the purpose of keeping the prices reasonable''.\nAsk jail deputies\nOn other matters, the jury recommended that: (1)\nFour additional deputies be employed at the Fulton County Jail and ``a doctor, medical intern or extern be employed for night and weekend duty at the jail''.\n(2)\nFulton legislators ``work with city officials to pass enabling legislation that will permit the establishment of a fair and equitable'' pension plan for city employes.\nThe jury praised the administration and operation of the Atlanta Police Department, the Fulton Tax Commissioner's Office, the Bellwood and Alpharetta prison farms, Grady Hospital and the Fulton Health Department.\nMayor William B. Hartsfield filed suit for divorce from his wife, Pearl Williams Hartsfield, in Fulton Superior Court Friday.\nHis petition charged mental cruelty.\nThe couple was married Aug. 2, 1913.\nThey have a son, William Berry Jr., and a daughter, Mrs. J. M. Cheshire of Griffin.\nAttorneys for the mayor said that an amicable property settlement has been agreed upon.\nThe petition listed the mayor's occupation as ``attorney'' and his age as 71.\nIt listed his wife's age as 74 and place of birth as Opelika, Ala..\nThe petition said that the couple has not lived together as man and wife for more than a year.\nThe Hartsfield home is at 637 E. Pelham Rd. Aj.\nHenry L. Bowden was listed on the petition as the mayor's attorney.\nHartsfield has been mayor of Atlanta, with exception of one brief interlude, since 1937.\nHis political career goes back to his election to city council in 1923.\nThe mayor's present term of office expires Jan. 1.\nHe will be succeeded by Ivan Allen Jr., who became a candidate in the Sept. 13 primary after Mayor Hartsfield announced that he would not run for reelection.\nGeorgia Republicans are getting strong encouragement to enter a candidate in the 1962 governor's race, a top official said Wednesday.\nRobert Snodgrass, state GOP chairman, said a meeting held Tuesday night in Blue Ridge brought enthusiastic responses from the audience.\nState Party Chairman James W. Dorsey added that enthusiasm was picking up for a state rally to be held Sept. 8 in Savannah at which newly elected Texas Sen. John Tower will be the featured speaker.\nIn the Blue Ridge meeting, the audience was warned that entering a candidate for governor would force it to take petitions out into voting precincts to obtain the signatures of registered voters.\nDespite the warning, there was a unanimous vote to enter a candidate, according to Republicans who attended.\nWhen the crowd was asked whether it wanted to wait one more term to make the race, it voted no -- and there were no dissents.\nThe largest hurdle the Republicans would have to face is a state law which says that before making a first race, one of two alternative courses must be taken: 1\nFive per cent of the voters in each county must sign petitions requesting that the Republicans be allowed to place names of candidates on the general election ballot, or 2\nThe Republicans must hold a primary under the county unit system -- a system which the party opposes in its platform.\nSam Caldwell, State Highway Department public relations director, resigned Tuesday to work for Lt. Gov. Garland Byrd's campaign.\nCaldwell's resignation had been expected for some time.\nHe will be succeeded by Rob Ledford of Gainesville, who has been an assistant more than three years.\nWhen the gubernatorial campaign starts, Caldwell is expected to become a campaign coordinator for Byrd.\nThe Georgia Legislature will wind up its 1961 session Monday and head for home -- where some of the highway bond money it approved will follow shortly.\nBefore adjournment Monday afternoon, the Senate is expected to approve a study of the number of legislators allotted to rural and urban areas to determine what adjustments should be made.\nGov. Vandiver is expected to make the traditional visit to both chambers as they work toward adjournment.\nVandiver likely will mention the $100 million highway bond issue approved earlier in the session as his first priority item.\nConstruction bonds\nMeanwhile, it was learned the State Highway Department is very near being ready to issue the first $30 million worth of highway reconstruction bonds.\nThe bond issue will go to the state courts for a friendly test suit to test the validity of the act, and then the sales will begin and contracts let for repair work on some of Georgia's most heavily traveled highways.\nA Highway Department source said there also is a plan there to issue some $3 million to $4 million worth of Rural Roads Authority bonds for rural road construction work.\nA revolving fund\nThe department apparently intends to make the Rural Roads Authority a revolving fund under which new bonds would be issued every time a portion of the old ones are paid off by tax authorities.\nVandiver opened his race for governor in 1958 with a battle in the Legislature against the issuance of $50 million worth of additional rural roads bonds proposed by then Gov. Marvin Griffin.\nThe Highway Department source told The Constitution, however, that Vandiver has not been consulted yet about the plans to issue the new rural roads bonds.\nSchley County Rep. B. D. Pelham will offer a resolution Monday in the House to rescind the body's action of Friday in voting itself a $10 per day increase in expense allowances.\nPelham said Sunday night there was research being done on whether the ``quickie'' vote on the increase can be repealed outright or whether notice would have to first be given that reconsideration of the action would be sought.\nWhile emphasizing that technical details were not fully worked out, Pelham said his resolution would seek to set aside the privilege resolution which the House voted through 87-31.\nA similar resolution passed in the Senate by a vote of 29-5.\nAs of Sunday night, there was no word of a resolution being offered there to rescind the action.\nPelham pointed out that Georgia voters last November rejected a constitutional amendment to allow legislators to vote on pay raises for future Legislature sessions.\nA veteran Jackson County legislator will ask the Georgia House Monday to back federal aid to education, something it has consistently opposed in the past.\nRep. Mac Barber of Commerce is asking the House in a privilege resolution to ``endorse increased federal support for public education, provided that such funds be received and expended'' as state funds.\nBarber, who is in his 13th year as a legislator, said there ``are some members of our congressional delegation in Washington who would like to see it (the resolution) passed''.\nBut he added that none of Georgia's congressmen specifically asked him to offer the resolution.\nThe resolution, which Barber tossed into the House hopper Friday, will be formally read Monday.\nIt says that ``in the event Congress does provide this increase in federal funds'', the State Board of Education should be directed to ``give priority'' to teacher pay raises.\nColquitt\n-- After a long, hot controversy, Miller County has a new school superintendent, elected, as a policeman put it, in the ``coolest election I ever saw in this county''.\nThe new school superintendent is Harry Davis, a veteran agriculture teacher, who defeated Felix Bush, a school principal and chairman of the Miller County Democratic Executive Committee.\nDavis received 1,119 votes in Saturday's election, and Bush got 402.\nOrdinary Carey Williams, armed with a pistol, stood by at the polls to insure order.\n``This was the coolest, calmest election I ever saw'', Colquitt Policeman Tom Williams said.\n``Being at the polls was just like being at church.\nI didn't smell a drop of liquor, and we didn't have a bit of trouble''.\nThe campaign leading to the election was not so quiet, however.\nIt was marked by controversy, anonymous midnight phone calls and veiled threats of violence.\nThe former county school superintendent, George P. Callan, shot himself to death March 18, four days after he resigned his post in a dispute with the county school board.\nDuring the election campaign, both candidates, Davis and Bush, reportedly received anonymous telephone calls.\nOrdinary Williams said he, too, was subjected to anonymous calls soon after he scheduled the election.\nMany local citizens feared that there would be irregularities at the polls, and Williams got himself a permit to carry a gun and promised an orderly election.\nSheriff Felix Tabb said the ordinary apparently made good his promise.\n``Everything went real smooth'', the sheriff said.\n``There wasn't a bit of trouble''.\n" . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4601"^^xsd:nonNegativeInteger ; + nif:endIndex "4602"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "work" ; + nif:beginIndex "6854"^^xsd:nonNegativeInteger ; + nif:endIndex "6858"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "city" ; + nif:beginIndex "782"^^xsd:nonNegativeInteger ; + nif:endIndex "786"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "go" ; + nif:beginIndex "7918"^^xsd:nonNegativeInteger ; + nif:endIndex "7920"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "8303"^^xsd:nonNegativeInteger ; + nif:endIndex "8306"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "State" ; + nif:beginIndex "6782"^^xsd:nonNegativeInteger ; + nif:endIndex "6787"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "rural" ; + nif:beginIndex "8823"^^xsd:nonNegativeInteger ; + nif:endIndex "8828"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "1504"^^xsd:nonNegativeInteger ; + nif:endIndex "1508"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "11369"^^xsd:nonNegativeInteger ; + nif:endIndex "11370"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "county" ; + nif:beginIndex "10821"^^xsd:nonNegativeInteger ; + nif:endIndex "10827"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Mayor" ; + nif:beginIndex "5353"^^xsd:nonNegativeInteger ; + nif:endIndex "5358"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "achieve" ; + nif:beginIndex "1395"^^xsd:nonNegativeInteger ; + nif:endIndex "1402"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "servants" ; + nif:beginIndex "3260"^^xsd:nonNegativeInteger ; + nif:endIndex "3268"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10560"^^xsd:nonNegativeInteger ; + nif:endIndex "10561"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "hopper" ; + nif:beginIndex "10433"^^xsd:nonNegativeInteger ; + nif:endIndex "10439"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "according" ; + nif:beginIndex "6152"^^xsd:nonNegativeInteger ; + nif:endIndex "6161"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "6989"^^xsd:nonNegativeInteger ; + nif:endIndex "6991"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "7496"^^xsd:nonNegativeInteger ; + nif:endIndex "7497"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "force" ; + nif:beginIndex "5982"^^xsd:nonNegativeInteger ; + nif:endIndex "5987"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "very" ; + nif:beginIndex "7806"^^xsd:nonNegativeInteger ; + nif:endIndex "7810"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "night" ; + nif:beginIndex "5618"^^xsd:nonNegativeInteger ; + nif:endIndex "5623"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "11286"^^xsd:nonNegativeInteger ; + nif:endIndex "11289"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "best" ; + nif:beginIndex "1265"^^xsd:nonNegativeInteger ; + nif:endIndex "1269"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "4747"^^xsd:nonNegativeInteger ; + nif:endIndex "4749"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "6234"^^xsd:nonNegativeInteger ; + nif:endIndex "6236"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "10756"^^xsd:nonNegativeInteger ; + nif:endIndex "10757"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "86"^^xsd:nonNegativeInteger ; + nif:endIndex "94"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2131"^^xsd:nonNegativeInteger ; + nif:endIndex "2134"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "10783"^^xsd:nonNegativeInteger ; + nif:endIndex "10785"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "2902"^^xsd:nonNegativeInteger ; + nif:endIndex "2905"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "recent" ; + nif:beginIndex "71"^^xsd:nonNegativeInteger ; + nif:endIndex "77"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "6317"^^xsd:nonNegativeInteger ; + nif:endIndex "6320"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "not" ; + nif:beginIndex "11430"^^xsd:nonNegativeInteger ; + nif:endIndex "11433"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "12205"^^xsd:nonNegativeInteger ; + nif:endIndex "12209"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "presentments" ; + nif:beginIndex "190"^^xsd:nonNegativeInteger ; + nif:endIndex "202"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "House" ; + nif:beginIndex "10427"^^xsd:nonNegativeInteger ; + nif:endIndex "10432"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "8747"^^xsd:nonNegativeInteger ; + nif:endIndex "8748"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "3957"^^xsd:nonNegativeInteger ; + nif:endIndex "3959"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "He will be succeeded by Ivan Allen Jr., who became a candidate in the Sept. 13 primary after Mayor Hartsfield announced that he would not run for reelection." ; + nif:beginIndex "5260"^^xsd:nonNegativeInteger ; + nif:endIndex "5417"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6616"^^xsd:nonNegativeInteger ; + nif:endIndex "6619"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "not" ; + nif:beginIndex "9293"^^xsd:nonNegativeInteger ; + nif:endIndex "9296"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "12113"^^xsd:nonNegativeInteger ; + nif:endIndex "12116"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Tom" ; + nif:beginIndex "11246"^^xsd:nonNegativeInteger ; + nif:endIndex "11249"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "4632"^^xsd:nonNegativeInteger ; + nif:endIndex "4636"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11170"^^xsd:nonNegativeInteger ; + nif:endIndex "11171"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "juries" ; + nif:beginIndex "3062"^^xsd:nonNegativeInteger ; + nif:endIndex "3068"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "5926"^^xsd:nonNegativeInteger ; + nif:endIndex "5929"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9397"^^xsd:nonNegativeInteger ; + nif:endIndex "9400"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "1560"^^xsd:nonNegativeInteger ; + nif:endIndex "1561"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "4110"^^xsd:nonNegativeInteger ; + nif:endIndex "4114"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "new" ; + nif:beginIndex "10835"^^xsd:nonNegativeInteger ; + nif:endIndex "10838"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "6832"^^xsd:nonNegativeInteger ; + nif:endIndex "6833"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "160"^^xsd:nonNegativeInteger ; + nif:endIndex "164"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "give" ; + nif:beginIndex "10615"^^xsd:nonNegativeInteger ; + nif:endIndex "10619"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "party" ; + nif:beginIndex "6737"^^xsd:nonNegativeInteger ; + nif:endIndex "6742"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "do" ; + nif:beginIndex "2635"^^xsd:nonNegativeInteger ; + nif:endIndex "2637"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9086"^^xsd:nonNegativeInteger ; + nif:endIndex "9089"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "376"^^xsd:nonNegativeInteger ; + nif:endIndex "379"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "school" ; + nif:beginIndex "11703"^^xsd:nonNegativeInteger ; + nif:endIndex "11709"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "these" ; + nif:beginIndex "1355"^^xsd:nonNegativeInteger ; + nif:endIndex "1360"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "5814"^^xsd:nonNegativeInteger ; + nif:endIndex "5819"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "scheduled" ; + nif:beginIndex "11906"^^xsd:nonNegativeInteger ; + nif:endIndex "11915"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "4837"^^xsd:nonNegativeInteger ; + nif:endIndex "4840"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11442"^^xsd:nonNegativeInteger ; + nif:endIndex "11443"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "10169"^^xsd:nonNegativeInteger ; + nif:endIndex "10171"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4930"^^xsd:nonNegativeInteger ; + nif:endIndex "4931"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "one" ; + nif:beginIndex "2057"^^xsd:nonNegativeInteger ; + nif:endIndex "2060"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "education" ; + nif:beginIndex "9844"^^xsd:nonNegativeInteger ; + nif:endIndex "9853"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11544"^^xsd:nonNegativeInteger ; + nif:endIndex "11545"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "from" ; + nif:beginIndex "3213"^^xsd:nonNegativeInteger ; + nif:endIndex "3217"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "requesting" ; + nif:beginIndex "6541"^^xsd:nonNegativeInteger ; + nif:endIndex "6551"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "feared" ; + nif:beginIndex "11950"^^xsd:nonNegativeInteger ; + nif:endIndex "11956"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "both" ; + nif:beginIndex "11747"^^xsd:nonNegativeInteger ; + nif:endIndex "11751"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "new" ; + nif:beginIndex "3408"^^xsd:nonNegativeInteger ; + nif:endIndex "3411"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4711"^^xsd:nonNegativeInteger ; + nif:endIndex "4714"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "5234"^^xsd:nonNegativeInteger ; + nif:endIndex "5236"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "3892"^^xsd:nonNegativeInteger ; + nif:endIndex "3895"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "term" ; + nif:beginIndex "413"^^xsd:nonNegativeInteger ; + nif:endIndex "417"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "8029"^^xsd:nonNegativeInteger ; + nif:endIndex "8032"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "10382"^^xsd:nonNegativeInteger ; + nif:endIndex "10385"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "operating" ; + nif:beginIndex "2997"^^xsd:nonNegativeInteger ; + nif:endIndex "3006"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "welfare" ; + nif:beginIndex "2232"^^xsd:nonNegativeInteger ; + nif:endIndex "2239"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bond" ; + nif:beginIndex "7902"^^xsd:nonNegativeInteger ; + nif:endIndex "7906"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "8997"^^xsd:nonNegativeInteger ; + nif:endIndex "8999"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "4375"^^xsd:nonNegativeInteger ; + nif:endIndex "4378"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "increase" ; + nif:beginIndex "10533"^^xsd:nonNegativeInteger ; + nif:endIndex "10541"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "apparently" ; + nif:beginIndex "12126"^^xsd:nonNegativeInteger ; + nif:endIndex "12136"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "10772"^^xsd:nonNegativeInteger ; + nif:endIndex "10774"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "2945"^^xsd:nonNegativeInteger ; + nif:endIndex "2949"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "8437"^^xsd:nonNegativeInteger ; + nif:endIndex "8438"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "10264"^^xsd:nonNegativeInteger ; + nif:endIndex "10274"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "traveled" ; + nif:beginIndex "8097"^^xsd:nonNegativeInteger ; + nif:endIndex "8105"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "4048"^^xsd:nonNegativeInteger ; + nif:endIndex "4050"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Ordinary Carey Williams, armed with a pistol, stood by at the polls to insure order." ; + nif:beginIndex "11087"^^xsd:nonNegativeInteger ; + nif:endIndex "11171"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "6428"^^xsd:nonNegativeInteger ; + nif:endIndex "6429"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1880"^^xsd:nonNegativeInteger ; + nif:endIndex "1883"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "event" ; + nif:beginIndex "10500"^^xsd:nonNegativeInteger ; + nif:endIndex "10505"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "establishment" ; + nif:beginIndex "4034"^^xsd:nonNegativeInteger ; + nif:endIndex "4047"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "farms" ; + nif:beginIndex "4263"^^xsd:nonNegativeInteger ; + nif:endIndex "4268"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Pelham pointed out that Georgia voters last November rejected a constitutional amendment to allow legislators to vote on pay raises for future Legislature sessions." ; + nif:beginIndex "9586"^^xsd:nonNegativeInteger ; + nif:endIndex "9750"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "1978"^^xsd:nonNegativeInteger ; + nif:endIndex "1980"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Rep. Mac Barber of Commerce is asking the House in a privilege resolution to ``endorse increased federal support for public education, provided that such funds be received and expended'' as state funds." ; + nif:beginIndex "9906"^^xsd:nonNegativeInteger ; + nif:endIndex "10108"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "offered" ; + nif:beginIndex "9549"^^xsd:nonNegativeInteger ; + nif:endIndex "9556"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "legislators" ; + nif:beginIndex "7406"^^xsd:nonNegativeInteger ; + nif:endIndex "7417"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Aj" ; + nif:beginIndex "4976"^^xsd:nonNegativeInteger ; + nif:endIndex "4978"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "session" ; + nif:beginIndex "7693"^^xsd:nonNegativeInteger ; + nif:endIndex "7700"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Senate" ; + nif:beginIndex "9463"^^xsd:nonNegativeInteger ; + nif:endIndex "9469"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "1006"^^xsd:nonNegativeInteger ; + nif:endIndex "1008"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "brief" ; + nif:beginIndex "5108"^^xsd:nonNegativeInteger ; + nif:endIndex "5113"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Pelham said Sunday night there was research being done on whether the ``quickie'' vote on the increase can be repealed outright or whether notice would have to first be given that reconsideration of the action would be sought." ; + nif:beginIndex "9020"^^xsd:nonNegativeInteger ; + nif:endIndex "9246"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "veteran" ; + nif:beginIndex "9753"^^xsd:nonNegativeInteger ; + nif:endIndex "9760"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "2504"^^xsd:nonNegativeInteger ; + nif:endIndex "2505"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "vote" ; + nif:beginIndex "9475"^^xsd:nonNegativeInteger ; + nif:endIndex "9479"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "privilege" ; + nif:beginIndex "9959"^^xsd:nonNegativeInteger ; + nif:endIndex "9968"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "voters" ; + nif:beginIndex "9618"^^xsd:nonNegativeInteger ; + nif:endIndex "9624"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "7125"^^xsd:nonNegativeInteger ; + nif:endIndex "7126"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "took" ; + nif:beginIndex "144"^^xsd:nonNegativeInteger ; + nif:endIndex "148"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "6256"^^xsd:nonNegativeInteger ; + nif:endIndex "6258"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "this" ; + nif:beginIndex "2446"^^xsd:nonNegativeInteger ; + nif:endIndex "2450"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "candidates" ; + nif:beginIndex "11752"^^xsd:nonNegativeInteger ; + nif:endIndex "11762"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Office" ; + nif:beginIndex "4220"^^xsd:nonNegativeInteger ; + nif:endIndex "4226"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "studied" ; + nif:beginIndex "986"^^xsd:nonNegativeInteger ; + nif:endIndex "993"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "3919"^^xsd:nonNegativeInteger ; + nif:endIndex "3921"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "11974"^^xsd:nonNegativeInteger ; + nif:endIndex "11976"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "6363"^^xsd:nonNegativeInteger ; + nif:endIndex "6365"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "tossed" ; + nif:beginIndex "10411"^^xsd:nonNegativeInteger ; + nif:endIndex "10417"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "pricing" ; + nif:beginIndex "3616"^^xsd:nonNegativeInteger ; + nif:endIndex "3623"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "interest" ; + nif:beginIndex "1270"^^xsd:nonNegativeInteger ; + nif:endIndex "1278"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "29"^^xsd:nonNegativeInteger ; + nif:endIndex "33"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "practices" ; + nif:beginIndex "2804"^^xsd:nonNegativeInteger ; + nif:endIndex "2813"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Court" ; + nif:beginIndex "459"^^xsd:nonNegativeInteger ; + nif:endIndex "464"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The department apparently intends to make the Rural Roads Authority a revolving fund under which new bonds would be issued every time a portion of the old ones are paid off by tax authorities." ; + nif:beginIndex "8303"^^xsd:nonNegativeInteger ; + nif:endIndex "8495"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Sheriff Felix Tabb said the ordinary apparently made good his promise." ; + nif:beginIndex "12089"^^xsd:nonNegativeInteger ; + nif:endIndex "12159"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Georgia" ; + nif:beginIndex "5418"^^xsd:nonNegativeInteger ; + nif:endIndex "5425"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "11172"^^xsd:nonNegativeInteger ; + nif:endIndex "11174"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "recommendations" ; + nif:beginIndex "3022"^^xsd:nonNegativeInteger ; + nif:endIndex "3037"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "toward" ; + nif:beginIndex "7584"^^xsd:nonNegativeInteger ; + nif:endIndex "7590"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Senate" ; + nif:beginIndex "7351"^^xsd:nonNegativeInteger ; + nif:endIndex "7357"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "top" ; + nif:beginIndex "5523"^^xsd:nonNegativeInteger ; + nif:endIndex "5526"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "not" ; + nif:beginIndex "5394"^^xsd:nonNegativeInteger ; + nif:endIndex "5397"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10695"^^xsd:nonNegativeInteger ; + nif:endIndex "10696"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "with" ; + nif:beginIndex "3964"^^xsd:nonNegativeInteger ; + nif:endIndex "3968"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "10965"^^xsd:nonNegativeInteger ; + nif:endIndex "10967"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "than" ; + nif:beginIndex "4919"^^xsd:nonNegativeInteger ; + nif:endIndex "4923"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "taxpayers" ; + nif:beginIndex "2704"^^xsd:nonNegativeInteger ; + nif:endIndex "2713"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "equitable" ; + nif:beginIndex "4062"^^xsd:nonNegativeInteger ; + nif:endIndex "4071"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Ordinary Williams said he, too, was subjected to anonymous calls soon after he scheduled the election." ; + nif:beginIndex "11827"^^xsd:nonNegativeInteger ; + nif:endIndex "11929"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Harry" ; + nif:beginIndex "10864"^^xsd:nonNegativeInteger ; + nif:endIndex "10869"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "offices" ; + nif:beginIndex "1365"^^xsd:nonNegativeInteger ; + nif:endIndex "1372"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "issuance" ; + nif:beginIndex "8587"^^xsd:nonNegativeInteger ; + nif:endIndex "8595"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bonds" ; + nif:beginIndex "8246"^^xsd:nonNegativeInteger ; + nif:endIndex "8251"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "On other matters, the jury recommended that: (1)" ; + nif:beginIndex "3727"^^xsd:nonNegativeInteger ; + nif:endIndex "3775"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "2657"^^xsd:nonNegativeInteger ; + nif:endIndex "2659"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3637"^^xsd:nonNegativeInteger ; + nif:endIndex "3640"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3404"^^xsd:nonNegativeInteger ; + nif:endIndex "3407"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "after" ; + nif:beginIndex "11647"^^xsd:nonNegativeInteger ; + nif:endIndex "11652"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "156"^^xsd:nonNegativeInteger ; + nif:endIndex "159"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "pension" ; + nif:beginIndex "4074"^^xsd:nonNegativeInteger ; + nif:endIndex "4081"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Tax" ; + nif:beginIndex "4201"^^xsd:nonNegativeInteger ; + nif:endIndex "4204"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "761"^^xsd:nonNegativeInteger ; + nif:endIndex "764"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "agreed" ; + nif:beginIndex "4678"^^xsd:nonNegativeInteger ; + nif:endIndex "4684"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "areas" ; + nif:beginIndex "7446"^^xsd:nonNegativeInteger ; + nif:endIndex "7451"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "county" ; + nif:beginIndex "11696"^^xsd:nonNegativeInteger ; + nif:endIndex "11702"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "run" ; + nif:beginIndex "5398"^^xsd:nonNegativeInteger ; + nif:endIndex "5401"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible ``irregularities'' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr.." ; + nif:beginIndex "391"^^xsd:nonNegativeInteger ; + nif:endIndex "611"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "3609"^^xsd:nonNegativeInteger ; + nif:endIndex "3611"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Nevertheless, ``we feel that in the future Fulton County should receive some portion of these available funds'', the jurors said." ; + nif:beginIndex "2492"^^xsd:nonNegativeInteger ; + nif:endIndex "2621"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "10704"^^xsd:nonNegativeInteger ; + nif:endIndex "10710"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "10249"^^xsd:nonNegativeInteger ; + nif:endIndex "10251"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Sam" ; + nif:beginIndex "6768"^^xsd:nonNegativeInteger ; + nif:endIndex "6771"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "candidate" ; + nif:beginIndex "5313"^^xsd:nonNegativeInteger ; + nif:endIndex "5322"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "year" ; + nif:beginIndex "4926"^^xsd:nonNegativeInteger ; + nif:endIndex "4930"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "1442"^^xsd:nonNegativeInteger ; + nif:endIndex "1444"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "endorse" ; + nif:beginIndex "9985"^^xsd:nonNegativeInteger ; + nif:endIndex "9992"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "343"^^xsd:nonNegativeInteger ; + nif:endIndex "346"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "these" ; + nif:beginIndex "2580"^^xsd:nonNegativeInteger ; + nif:endIndex "2585"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "8201"^^xsd:nonNegativeInteger ; + nif:endIndex "8203"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "mayor" ; + nif:beginIndex "4621"^^xsd:nonNegativeInteger ; + nif:endIndex "4626"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "3549"^^xsd:nonNegativeInteger ; + nif:endIndex "3550"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "7289"^^xsd:nonNegativeInteger ; + nif:endIndex "7293"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "have" ; + nif:beginIndex "9172"^^xsd:nonNegativeInteger ; + nif:endIndex "9176"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "audience" ; + nif:beginIndex "5678"^^xsd:nonNegativeInteger ; + nif:endIndex "5686"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "should" ; + nif:beginIndex "3149"^^xsd:nonNegativeInteger ; + nif:endIndex "3155"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "new" ; + nif:beginIndex "8819"^^xsd:nonNegativeInteger ; + nif:endIndex "8822"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "10158"^^xsd:nonNegativeInteger ; + nif:endIndex "10162"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "opposed" ; + nif:beginIndex "9885"^^xsd:nonNegativeInteger ; + nif:endIndex "9892"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "listed" ; + nif:beginIndex "4704"^^xsd:nonNegativeInteger ; + nif:endIndex "4710"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "enabling" ; + nif:beginIndex "3992"^^xsd:nonNegativeInteger ; + nif:endIndex "4000"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The campaign leading to the election was not so quiet, however." ; + nif:beginIndex "11389"^^xsd:nonNegativeInteger ; + nif:endIndex "11452"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "9216"^^xsd:nonNegativeInteger ; + nif:endIndex "9218"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "6060"^^xsd:nonNegativeInteger ; + nif:endIndex "6062"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "undue" ; + nif:beginIndex "3218"^^xsd:nonNegativeInteger ; + nif:endIndex "3223"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1824"^^xsd:nonNegativeInteger ; + nif:endIndex "1827"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "8687"^^xsd:nonNegativeInteger ; + nif:endIndex "8690"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "old" ; + nif:beginIndex "8454"^^xsd:nonNegativeInteger ; + nif:endIndex "8457"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "agriculture" ; + nif:beginIndex "10887"^^xsd:nonNegativeInteger ; + nif:endIndex "10898"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "learned" ; + nif:beginIndex "7766"^^xsd:nonNegativeInteger ; + nif:endIndex "7773"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "outmoded" ; + nif:beginIndex "876"^^xsd:nonNegativeInteger ; + nif:endIndex "884"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "this" ; + nif:beginIndex "2345"^^xsd:nonNegativeInteger ; + nif:endIndex "2349"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "11045"^^xsd:nonNegativeInteger ; + nif:endIndex "11047"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "signatures" ; + nif:beginIndex "6049"^^xsd:nonNegativeInteger ; + nif:endIndex "6059"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Atlanta" ; + nif:beginIndex "329"^^xsd:nonNegativeInteger ; + nif:endIndex "336"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "making" ; + nif:beginIndex "6409"^^xsd:nonNegativeInteger ; + nif:endIndex "6415"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "issue" ; + nif:beginIndex "7907"^^xsd:nonNegativeInteger ; + nif:endIndex "7912"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "expected" ; + nif:beginIndex "6929"^^xsd:nonNegativeInteger ; + nif:endIndex "6937"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Hartsfield" ; + nif:beginIndex "5359"^^xsd:nonNegativeInteger ; + nif:endIndex "5369"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "new" ; + nif:beginIndex "3337"^^xsd:nonNegativeInteger ; + nif:endIndex "3340"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "State" ; + nif:beginIndex "7778"^^xsd:nonNegativeInteger ; + nif:endIndex "7783"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "I didn't smell a drop of liquor, and we didn't have a bit of trouble''." ; + nif:beginIndex "11317"^^xsd:nonNegativeInteger ; + nif:endIndex "11388"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "veiled" ; + nif:beginIndex "11518"^^xsd:nonNegativeInteger ; + nif:endIndex "11524"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1958" ; + nif:beginIndex "8537"^^xsd:nonNegativeInteger ; + nif:endIndex "8541"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Bar" ; + nif:beginIndex "3082"^^xsd:nonNegativeInteger ; + nif:endIndex "3085"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Despite" ; + nif:beginIndex "6082"^^xsd:nonNegativeInteger ; + nif:endIndex "6089"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "succeeded" ; + nif:beginIndex "6964"^^xsd:nonNegativeInteger ; + nif:endIndex "6973"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "1903"^^xsd:nonNegativeInteger ; + nif:endIndex "1905"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "together" ; + nif:beginIndex "4885"^^xsd:nonNegativeInteger ; + nif:endIndex "4893"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jail" ; + nif:beginIndex "3926"^^xsd:nonNegativeInteger ; + nif:endIndex "3930"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "been" ; + nif:beginIndex "6924"^^xsd:nonNegativeInteger ; + nif:endIndex "6928"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "3564"^^xsd:nonNegativeInteger ; + nif:endIndex "3568"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "5799"^^xsd:nonNegativeInteger ; + nif:endIndex "5801"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "They" ; + nif:beginIndex "4519"^^xsd:nonNegativeInteger ; + nif:endIndex "4523"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "held" ; + nif:beginIndex "5605"^^xsd:nonNegativeInteger ; + nif:endIndex "5609"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "11123"^^xsd:nonNegativeInteger ; + nif:endIndex "11124"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "1917"^^xsd:nonNegativeInteger ; + nif:endIndex "1921"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "5135"^^xsd:nonNegativeInteger ; + nif:endIndex "5136"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "superintendent" ; + nif:beginIndex "10846"^^xsd:nonNegativeInteger ; + nif:endIndex "10860"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "commented" ; + nif:beginIndex "2729"^^xsd:nonNegativeInteger ; + nif:endIndex "2738"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "expected" ; + nif:beginIndex "7361"^^xsd:nonNegativeInteger ; + nif:endIndex "7369"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "7158"^^xsd:nonNegativeInteger ; + nif:endIndex "7161"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "William" ; + nif:beginIndex "4325"^^xsd:nonNegativeInteger ; + nif:endIndex "4332"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11225"^^xsd:nonNegativeInteger ; + nif:endIndex "11226"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "3705"^^xsd:nonNegativeInteger ; + nif:endIndex "3707"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "2368"^^xsd:nonNegativeInteger ; + nif:endIndex "2372"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "2135"^^xsd:nonNegativeInteger ; + nif:endIndex "2139"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1495"^^xsd:nonNegativeInteger ; + nif:endIndex "1498"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "11"^^xsd:nonNegativeInteger ; + nif:endIndex "17"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "not" ; + nif:beginIndex "3536"^^xsd:nonNegativeInteger ; + nif:endIndex "3539"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The grand jury took a swipe at the State Welfare Department's handling of federal funds granted for child welfare services in foster homes." ; + nif:beginIndex "1907"^^xsd:nonNegativeInteger ; + nif:endIndex "2046"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "appointment" ; + nif:beginIndex "2821"^^xsd:nonNegativeInteger ; + nif:endIndex "2832"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "1082"^^xsd:nonNegativeInteger ; + nif:endIndex "1083"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "9051"^^xsd:nonNegativeInteger ; + nif:endIndex "9054"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "them" ; + nif:beginIndex "1046"^^xsd:nonNegativeInteger ; + nif:endIndex "1050"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The bond issue will go to the state courts for a friendly test suit to test the validity of the act, and then the sales will begin and contracts let for repair work on some of Georgia's most heavily traveled highways." ; + nif:beginIndex "7898"^^xsd:nonNegativeInteger ; + nif:endIndex "8115"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "7759"^^xsd:nonNegativeInteger ; + nif:endIndex "7761"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "filed" ; + nif:beginIndex "4347"^^xsd:nonNegativeInteger ; + nif:endIndex "4352"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "made" ; + nif:beginIndex "7492"^^xsd:nonNegativeInteger ; + nif:endIndex "7496"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Barber, who is in his 13th year as a legislator, said there ``are some members of our congressional delegation in Washington who would like to see it (the resolution) passed''." ; + nif:beginIndex "10109"^^xsd:nonNegativeInteger ; + nif:endIndex "10285"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "9027"^^xsd:nonNegativeInteger ; + nif:endIndex "9031"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9366"^^xsd:nonNegativeInteger ; + nif:endIndex "9369"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "10877"^^xsd:nonNegativeInteger ; + nif:endIndex "10878"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Griffin" ; + nif:beginIndex "8678"^^xsd:nonNegativeInteger ; + nif:endIndex "8685"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "highways" ; + nif:beginIndex "8106"^^xsd:nonNegativeInteger ; + nif:endIndex "8114"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "2047"^^xsd:nonNegativeInteger ; + nif:endIndex "2049"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Policeman" ; + nif:beginIndex "11236"^^xsd:nonNegativeInteger ; + nif:endIndex "11245"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "5624"^^xsd:nonNegativeInteger ; + nif:endIndex "5626"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "one" ; + nif:beginIndex "6242"^^xsd:nonNegativeInteger ; + nif:endIndex "6245"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "teacher" ; + nif:beginIndex "10899"^^xsd:nonNegativeInteger ; + nif:endIndex "10906"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "council" ; + nif:beginIndex "5192"^^xsd:nonNegativeInteger ; + nif:endIndex "5199"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "precincts" ; + nif:beginIndex "6025"^^xsd:nonNegativeInteger ; + nif:endIndex "6034"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "10141"^^xsd:nonNegativeInteger ; + nif:endIndex "10143"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4767"^^xsd:nonNegativeInteger ; + nif:endIndex "4768"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "he" ; + nif:beginIndex "5385"^^xsd:nonNegativeInteger ; + nif:endIndex "5387"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "anonymous" ; + nif:beginIndex "11483"^^xsd:nonNegativeInteger ; + nif:endIndex "11492"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "saw" ; + nif:beginIndex "10809"^^xsd:nonNegativeInteger ; + nif:endIndex "10812"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "10827"^^xsd:nonNegativeInteger ; + nif:endIndex "10829"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "``This is one of the major items in the Fulton County general assistance program'', the jury said, but the State Welfare Department ``has seen fit to distribute these funds through the welfare departments of all the counties in the state with the exception of Fulton County, which receives none of this money." ; + nif:beginIndex "2047"^^xsd:nonNegativeInteger ; + nif:endIndex "2356"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8815"^^xsd:nonNegativeInteger ; + nif:endIndex "8818"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4123"^^xsd:nonNegativeInteger ; + nif:endIndex "4126"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "11728"^^xsd:nonNegativeInteger ; + nif:endIndex "11736"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Welfare" ; + nif:beginIndex "1948"^^xsd:nonNegativeInteger ; + nif:endIndex "1955"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "5492"^^xsd:nonNegativeInteger ; + nif:endIndex "5494"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "legislation" ; + nif:beginIndex "4001"^^xsd:nonNegativeInteger ; + nif:endIndex "4012"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "roads" ; + nif:beginIndex "8829"^^xsd:nonNegativeInteger ; + nif:endIndex "8834"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Mayor-nominate" ; + nif:beginIndex "581"^^xsd:nonNegativeInteger ; + nif:endIndex "595"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "plans" ; + nif:beginIndex "8800"^^xsd:nonNegativeInteger ; + nif:endIndex "8805"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "often" ; + nif:beginIndex "903"^^xsd:nonNegativeInteger ; + nif:endIndex "908"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "present" ; + nif:beginIndex "5221"^^xsd:nonNegativeInteger ; + nif:endIndex "5228"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Durwood" ; + nif:beginIndex "471"^^xsd:nonNegativeInteger ; + nif:endIndex "478"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "8163"^^xsd:nonNegativeInteger ; + nif:endIndex "8164"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "6676"^^xsd:nonNegativeInteger ; + nif:endIndex "6677"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "10050"^^xsd:nonNegativeInteger ; + nif:endIndex "10054"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "2054"^^xsd:nonNegativeInteger ; + nif:endIndex "2056"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "7602"^^xsd:nonNegativeInteger ; + nif:endIndex "7603"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "burden" ; + nif:beginIndex "2685"^^xsd:nonNegativeInteger ; + nif:endIndex "2691"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "5380"^^xsd:nonNegativeInteger ; + nif:endIndex "5384"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7254"^^xsd:nonNegativeInteger ; + nif:endIndex "7257"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ever" ; + nif:beginIndex "11215"^^xsd:nonNegativeInteger ; + nif:endIndex "11219"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "10598"^^xsd:nonNegativeInteger ; + nif:endIndex "10600"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "10188"^^xsd:nonNegativeInteger ; + nif:endIndex "10190"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "7182"^^xsd:nonNegativeInteger ; + nif:endIndex "7186"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "year" ; + nif:beginIndex "10136"^^xsd:nonNegativeInteger ; + nif:endIndex "10140"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The former county school superintendent, George P. Callan, shot himself to death March 18, four days after he resigned his post in a dispute with the county school board." ; + nif:beginIndex "11546"^^xsd:nonNegativeInteger ; + nif:endIndex "11716"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "5175"^^xsd:nonNegativeInteger ; + nif:endIndex "5183"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "4555"^^xsd:nonNegativeInteger ; + nif:endIndex "4558"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "wasn't" ; + nif:beginIndex "12219"^^xsd:nonNegativeInteger ; + nif:endIndex "12225"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "administrators" ; + nif:beginIndex "2862"^^xsd:nonNegativeInteger ; + nif:endIndex "2876"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "federal" ; + nif:beginIndex "9829"^^xsd:nonNegativeInteger ; + nif:endIndex "9836"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "funds" ; + nif:beginIndex "10102"^^xsd:nonNegativeInteger ; + nif:endIndex "10107"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7990"^^xsd:nonNegativeInteger ; + nif:endIndex "7993"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1" ; + nif:beginIndex "5257"^^xsd:nonNegativeInteger ; + nif:endIndex "5258"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8583"^^xsd:nonNegativeInteger ; + nif:endIndex "8586"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "welfare" ; + nif:beginIndex "2013"^^xsd:nonNegativeInteger ; + nif:endIndex "2020"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "expires" ; + nif:beginIndex "5244"^^xsd:nonNegativeInteger ; + nif:endIndex "5251"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "disable" ; + nif:beginIndex "2438"^^xsd:nonNegativeInteger ; + nif:endIndex "2445"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "2061"^^xsd:nonNegativeInteger ; + nif:endIndex "2063"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "9769"^^xsd:nonNegativeInteger ; + nif:endIndex "9775"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "compensation" ; + nif:beginIndex "2906"^^xsd:nonNegativeInteger ; + nif:endIndex "2918"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "House" ; + nif:beginIndex "9948"^^xsd:nonNegativeInteger ; + nif:endIndex "9953"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4030"^^xsd:nonNegativeInteger ; + nif:endIndex "4033"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "place" ; + nif:beginIndex "149"^^xsd:nonNegativeInteger ; + nif:endIndex "154"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "says" ; + nif:beginIndex "6392"^^xsd:nonNegativeInteger ; + nif:endIndex "6396"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Gov." ; + nif:beginIndex "8666"^^xsd:nonNegativeInteger ; + nif:endIndex "8670"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "5519"^^xsd:nonNegativeInteger ; + nif:endIndex "5520"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1" ; + nif:beginIndex "6476"^^xsd:nonNegativeInteger ; + nif:endIndex "6477"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "7865"^^xsd:nonNegativeInteger ; + nif:endIndex "7867"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "3173"^^xsd:nonNegativeInteger ; + nif:endIndex "3175"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "race" ; + nif:beginIndex "6424"^^xsd:nonNegativeInteger ; + nif:endIndex "6428"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4829"^^xsd:nonNegativeInteger ; + nif:endIndex "4830"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "On" ; + nif:beginIndex "3727"^^xsd:nonNegativeInteger ; + nif:endIndex "3729"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "found" ; + nif:beginIndex "2953"^^xsd:nonNegativeInteger ; + nif:endIndex "2958"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Four additional deputies be employed at the Fulton County Jail and ``a doctor, medical intern or extern be employed for night and weekend duty at the jail''." ; + nif:beginIndex "3776"^^xsd:nonNegativeInteger ; + nif:endIndex "3933"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "6352"^^xsd:nonNegativeInteger ; + nif:endIndex "6357"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Garland" ; + nif:beginIndex "6872"^^xsd:nonNegativeInteger ; + nif:endIndex "6879"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "result" ; + nif:beginIndex "1562"^^xsd:nonNegativeInteger ; + nif:endIndex "1568"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "9438"^^xsd:nonNegativeInteger ; + nif:endIndex "9448"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "revolving" ; + nif:beginIndex "8373"^^xsd:nonNegativeInteger ; + nif:endIndex "8382"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Bush" ; + nif:beginIndex "10927"^^xsd:nonNegativeInteger ; + nif:endIndex "10931"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "5674"^^xsd:nonNegativeInteger ; + nif:endIndex "5677"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "court" ; + nif:beginIndex "2963"^^xsd:nonNegativeInteger ; + nif:endIndex "2968"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "5007"^^xsd:nonNegativeInteger ; + nif:endIndex "5009"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "5731"^^xsd:nonNegativeInteger ; + nif:endIndex "5735"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "8512"^^xsd:nonNegativeInteger ; + nif:endIndex "8515"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf ")" ; + nif:beginIndex "3774"^^xsd:nonNegativeInteger ; + nif:endIndex "3775"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "State" ; + nif:beginIndex "5688"^^xsd:nonNegativeInteger ; + nif:endIndex "5693"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Vandiver likely will mention the $100 million highway bond issue approved earlier in the session as his first priority item." ; + nif:beginIndex "7604"^^xsd:nonNegativeInteger ; + nif:endIndex "7728"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "5686"^^xsd:nonNegativeInteger ; + nif:endIndex "5687"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "7727"^^xsd:nonNegativeInteger ; + nif:endIndex "7728"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "controversy" ; + nif:beginIndex "10684"^^xsd:nonNegativeInteger ; + nif:endIndex "10695"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "voting" ; + nif:beginIndex "8960"^^xsd:nonNegativeInteger ; + nif:endIndex "8966"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Atlanta" ; + nif:beginIndex "1123"^^xsd:nonNegativeInteger ; + nif:endIndex "1130"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "were" ; + nif:beginIndex "6299"^^xsd:nonNegativeInteger ; + nif:endIndex "6303"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "537"^^xsd:nonNegativeInteger ; + nif:endIndex "539"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "implementation" ; + nif:beginIndex "1862"^^xsd:nonNegativeInteger ; + nif:endIndex "1876"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "799"^^xsd:nonNegativeInteger ; + nif:endIndex "803"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "State" ; + nif:beginIndex "10566"^^xsd:nonNegativeInteger ; + nif:endIndex "10571"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "12031"^^xsd:nonNegativeInteger ; + nif:endIndex "12032"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Construction bonds" ; + nif:beginIndex "7729"^^xsd:nonNegativeInteger ; + nif:endIndex "7747"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6264"^^xsd:nonNegativeInteger ; + nif:endIndex "6267"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Health" ; + nif:beginIndex "4300"^^xsd:nonNegativeInteger ; + nif:endIndex "4306"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Police" ; + nif:beginIndex "4171"^^xsd:nonNegativeInteger ; + nif:endIndex "4177"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4617"^^xsd:nonNegativeInteger ; + nif:endIndex "4620"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "2307"^^xsd:nonNegativeInteger ; + nif:endIndex "2313"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "314"^^xsd:nonNegativeInteger ; + nif:endIndex "316"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1962" ; + nif:beginIndex "5499"^^xsd:nonNegativeInteger ; + nif:endIndex "5503"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "attorney" ; + nif:beginIndex "4739"^^xsd:nonNegativeInteger ; + nif:endIndex "4747"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "an" ; + nif:beginIndex "3102"^^xsd:nonNegativeInteger ; + nif:endIndex "3104"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "2355"^^xsd:nonNegativeInteger ; + nif:endIndex "2356"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "violence" ; + nif:beginIndex "11536"^^xsd:nonNegativeInteger ; + nif:endIndex "11544"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "trouble" ; + nif:beginIndex "12235"^^xsd:nonNegativeInteger ; + nif:endIndex "12242"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "751"^^xsd:nonNegativeInteger ; + nif:endIndex "753"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "make" ; + nif:beginIndex "8340"^^xsd:nonNegativeInteger ; + nif:endIndex "8344"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "every" ; + nif:beginIndex "8426"^^xsd:nonNegativeInteger ; + nif:endIndex "8431"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "11859"^^xsd:nonNegativeInteger ; + nif:endIndex "11862"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "3634"^^xsd:nonNegativeInteger ; + nif:endIndex "3636"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "ordinary" ; + nif:beginIndex "12117"^^xsd:nonNegativeInteger ; + nif:endIndex "12125"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "court's" ; + nif:beginIndex "3199"^^xsd:nonNegativeInteger ; + nif:endIndex "3206"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "appointed" ; + nif:beginIndex "3238"^^xsd:nonNegativeInteger ; + nif:endIndex "3247"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "million" ; + nif:beginIndex "8207"^^xsd:nonNegativeInteger ; + nif:endIndex "8214"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "7009"^^xsd:nonNegativeInteger ; + nif:endIndex "7012"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "charge" ; + nif:beginIndex "3429"^^xsd:nonNegativeInteger ; + nif:endIndex "3435"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "4529"^^xsd:nonNegativeInteger ; + nif:endIndex "4530"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "2314"^^xsd:nonNegativeInteger ; + nif:endIndex "2320"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "942"^^xsd:nonNegativeInteger ; + nif:endIndex "948"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "irregularities" ; + nif:beginIndex "129"^^xsd:nonNegativeInteger ; + nif:endIndex "143"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "1622"^^xsd:nonNegativeInteger ; + nif:endIndex "1624"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "automobile" ; + nif:beginIndex "1689"^^xsd:nonNegativeInteger ; + nif:endIndex "1699"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "18" ; + nif:beginIndex "11633"^^xsd:nonNegativeInteger ; + nif:endIndex "11635"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "this" ; + nif:beginIndex "10528"^^xsd:nonNegativeInteger ; + nif:endIndex "10532"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "11413"^^xsd:nonNegativeInteger ; + nif:endIndex "11416"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "8806"^^xsd:nonNegativeInteger ; + nif:endIndex "8808"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "management" ; + nif:beginIndex "3412"^^xsd:nonNegativeInteger ; + nif:endIndex "3422"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "7987"^^xsd:nonNegativeInteger ; + nif:endIndex "7989"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Henry L. Bowden was listed on the petition as the mayor's attorney." ; + nif:beginIndex "4980"^^xsd:nonNegativeInteger ; + nif:endIndex "5047"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "13" ; + nif:beginIndex "5336"^^xsd:nonNegativeInteger ; + nif:endIndex "5338"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "funds" ; + nif:beginIndex "1989"^^xsd:nonNegativeInteger ; + nif:endIndex "1994"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "combined" ; + nif:beginIndex "1383"^^xsd:nonNegativeInteger ; + nif:endIndex "1391"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "5388"^^xsd:nonNegativeInteger ; + nif:endIndex "5393"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "practices" ; + nif:beginIndex "1236"^^xsd:nonNegativeInteger ; + nif:endIndex "1245"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "authorities" ; + nif:beginIndex "8483"^^xsd:nonNegativeInteger ; + nif:endIndex "8494"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Many" ; + nif:beginIndex "11930"^^xsd:nonNegativeInteger ; + nif:endIndex "11934"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "P." ; + nif:beginIndex "11594"^^xsd:nonNegativeInteger ; + nif:endIndex "11596"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "coolest" ; + nif:beginIndex "11187"^^xsd:nonNegativeInteger ; + nif:endIndex "11194"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "money" ; + nif:beginIndex "7271"^^xsd:nonNegativeInteger ; + nif:endIndex "7276"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "who" ; + nif:beginIndex "10234"^^xsd:nonNegativeInteger ; + nif:endIndex "10237"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "concessionaires" ; + nif:beginIndex "3641"^^xsd:nonNegativeInteger ; + nif:endIndex "3656"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "10066"^^xsd:nonNegativeInteger ; + nif:endIndex "10068"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "2045"^^xsd:nonNegativeInteger ; + nif:endIndex "2046"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "board" ; + nif:beginIndex "11710"^^xsd:nonNegativeInteger ; + nif:endIndex "11715"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "place" ; + nif:beginIndex "2660"^^xsd:nonNegativeInteger ; + nif:endIndex "2665"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "2746"^^xsd:nonNegativeInteger ; + nif:endIndex "2752"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "this" ; + nif:beginIndex "777"^^xsd:nonNegativeInteger ; + nif:endIndex "781"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "8114"^^xsd:nonNegativeInteger ; + nif:endIndex "8115"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "mayor's" ; + nif:beginIndex "4715"^^xsd:nonNegativeInteger ; + nif:endIndex "4722"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "2080"^^xsd:nonNegativeInteger ; + nif:endIndex "2082"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "first" ; + nif:beginIndex "9180"^^xsd:nonNegativeInteger ; + nif:endIndex "9185"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "1186"^^xsd:nonNegativeInteger ; + nif:endIndex "1188"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "revolving" ; + nif:beginIndex "8288"^^xsd:nonNegativeInteger ; + nif:endIndex "8297"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "exception" ; + nif:beginIndex "2294"^^xsd:nonNegativeInteger ; + nif:endIndex "2303"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "constitutional" ; + nif:beginIndex "9650"^^xsd:nonNegativeInteger ; + nif:endIndex "9664"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "However, the jury said it believes ``these two offices should be combined to achieve greater efficiency and reduce the cost of administration''." ; + nif:beginIndex "1318"^^xsd:nonNegativeInteger ; + nif:endIndex "1462"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "deputies" ; + nif:beginIndex "3792"^^xsd:nonNegativeInteger ; + nif:endIndex "3800"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "10144"^^xsd:nonNegativeInteger ; + nif:endIndex "10145"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "4293"^^xsd:nonNegativeInteger ; + nif:endIndex "4299"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Attorneys" ; + nif:beginIndex "4603"^^xsd:nonNegativeInteger ; + nif:endIndex "4612"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "173"^^xsd:nonNegativeInteger ; + nif:endIndex "177"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Sept." ; + nif:beginIndex "5791"^^xsd:nonNegativeInteger ; + nif:endIndex "5796"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "teacher" ; + nif:beginIndex "10634"^^xsd:nonNegativeInteger ; + nif:endIndex "10641"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Allen" ; + nif:beginIndex "601"^^xsd:nonNegativeInteger ; + nif:endIndex "606"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3816"^^xsd:nonNegativeInteger ; + nif:endIndex "3819"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "not" ; + nif:beginIndex "8767"^^xsd:nonNegativeInteger ; + nif:endIndex "8770"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "some" ; + nif:beginIndex "8066"^^xsd:nonNegativeInteger ; + nif:endIndex "8070"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "11995"^^xsd:nonNegativeInteger ; + nif:endIndex "11998"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Being" ; + nif:beginIndex "11267"^^xsd:nonNegativeInteger ; + nif:endIndex "11272"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "4613"^^xsd:nonNegativeInteger ; + nif:endIndex "4616"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "994"^^xsd:nonNegativeInteger ; + nif:endIndex "997"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "revised" ; + nif:beginIndex "998"^^xsd:nonNegativeInteger ; + nif:endIndex "1005"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "6080"^^xsd:nonNegativeInteger ; + nif:endIndex "6081"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "During the election campaign, both candidates, Davis and Bush, reportedly received anonymous telephone calls." ; + nif:beginIndex "11717"^^xsd:nonNegativeInteger ; + nif:endIndex "11826"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "2713"^^xsd:nonNegativeInteger ; + nif:endIndex "2714"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "size" ; + nif:beginIndex "769"^^xsd:nonNegativeInteger ; + nif:endIndex "773"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resigned" ; + nif:beginIndex "11656"^^xsd:nonNegativeInteger ; + nif:endIndex "11664"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "our" ; + nif:beginIndex "2462"^^xsd:nonNegativeInteger ; + nif:endIndex "2465"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "intends" ; + nif:beginIndex "8329"^^xsd:nonNegativeInteger ; + nif:endIndex "8336"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "It" ; + nif:beginIndex "922"^^xsd:nonNegativeInteger ; + nif:endIndex "924"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bond" ; + nif:beginIndex "7266"^^xsd:nonNegativeInteger ; + nif:endIndex "7270"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "2" ; + nif:beginIndex "3935"^^xsd:nonNegativeInteger ; + nif:endIndex "3936"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3684"^^xsd:nonNegativeInteger ; + nif:endIndex "3687"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "who" ; + nif:beginIndex "5300"^^xsd:nonNegativeInteger ; + nif:endIndex "5303"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "5171"^^xsd:nonNegativeInteger ; + nif:endIndex "5174"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "newly" ; + nif:beginIndex "5820"^^xsd:nonNegativeInteger ; + nif:endIndex "5825"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "1336"^^xsd:nonNegativeInteger ; + nif:endIndex "1340"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "9696"^^xsd:nonNegativeInteger ; + nif:endIndex "9698"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The petition listed the mayor's occupation as ``attorney'' and his age as 71." ; + nif:beginIndex "4691"^^xsd:nonNegativeInteger ; + nif:endIndex "4768"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "didn't" ; + nif:beginIndex "11319"^^xsd:nonNegativeInteger ; + nif:endIndex "11325"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "money" ; + nif:beginIndex "2350"^^xsd:nonNegativeInteger ; + nif:endIndex "2355"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1937" ; + nif:beginIndex "5131"^^xsd:nonNegativeInteger ; + nif:endIndex "5135"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "expense" ; + nif:beginIndex "9000"^^xsd:nonNegativeInteger ; + nif:endIndex "9007"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "203"^^xsd:nonNegativeInteger ; + nif:endIndex "207"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "M." ; + nif:beginIndex "4579"^^xsd:nonNegativeInteger ; + nif:endIndex "4581"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "When the crowd was asked whether it wanted to wait one more term to make the race, it voted no -- and there were no dissents." ; + nif:beginIndex "6191"^^xsd:nonNegativeInteger ; + nif:endIndex "6316"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "10496"^^xsd:nonNegativeInteger ; + nif:endIndex "10499"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "3932"^^xsd:nonNegativeInteger ; + nif:endIndex "3933"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "who" ; + nif:beginIndex "7005"^^xsd:nonNegativeInteger ; + nif:endIndex "7008"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "opened" ; + nif:beginIndex "8505"^^xsd:nonNegativeInteger ; + nif:endIndex "8511"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "10613"^^xsd:nonNegativeInteger ; + nif:endIndex "10615"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "departments" ; + nif:beginIndex "2240"^^xsd:nonNegativeInteger ; + nif:endIndex "2251"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "an" ; + nif:beginIndex "1851"^^xsd:nonNegativeInteger ; + nif:endIndex "1853"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "5951"^^xsd:nonNegativeInteger ; + nif:endIndex "5952"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "790"^^xsd:nonNegativeInteger ; + nif:endIndex "793"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "7092"^^xsd:nonNegativeInteger ; + nif:endIndex "7093"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "also" ; + nif:beginIndex "8155"^^xsd:nonNegativeInteger ; + nif:endIndex "8159"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "3300"^^xsd:nonNegativeInteger ; + nif:endIndex "3301"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "held" ; + nif:beginIndex "5786"^^xsd:nonNegativeInteger ; + nif:endIndex "5790"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "state" ; + nif:beginIndex "2279"^^xsd:nonNegativeInteger ; + nif:endIndex "2284"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "9904"^^xsd:nonNegativeInteger ; + nif:endIndex "9905"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "``There wasn't a bit of trouble''." ; + nif:beginIndex "12211"^^xsd:nonNegativeInteger ; + nif:endIndex "12245"^^xsd:nonNegativeInteger ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "what" ; + nif:beginIndex "7465"^^xsd:nonNegativeInteger ; + nif:endIndex "7469"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "road" ; + nif:beginIndex "8262"^^xsd:nonNegativeInteger ; + nif:endIndex "8266"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "1644"^^xsd:nonNegativeInteger ; + nif:endIndex "1646"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "4194"^^xsd:nonNegativeInteger ; + nif:endIndex "4200"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ordinary" ; + nif:beginIndex "11087"^^xsd:nonNegativeInteger ; + nif:endIndex "11095"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "10380"^^xsd:nonNegativeInteger ; + nif:endIndex "10381"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "11155"^^xsd:nonNegativeInteger ; + nif:endIndex "11157"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "2127"^^xsd:nonNegativeInteger ; + nif:endIndex "2129"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10678"^^xsd:nonNegativeInteger ; + nif:endIndex "10679"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "578"^^xsd:nonNegativeInteger ; + nif:endIndex "580"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "reconsideration" ; + nif:beginIndex "9200"^^xsd:nonNegativeInteger ; + nif:endIndex "9215"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "6103"^^xsd:nonNegativeInteger ; + nif:endIndex "6108"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "10652"^^xsd:nonNegativeInteger ; + nif:endIndex "10653"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "asked" ; + nif:beginIndex "6210"^^xsd:nonNegativeInteger ; + nif:endIndex "6215"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11131"^^xsd:nonNegativeInteger ; + nif:endIndex "11132"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Rob" ; + nif:beginIndex "6977"^^xsd:nonNegativeInteger ; + nif:endIndex "6980"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "4106"^^xsd:nonNegativeInteger ; + nif:endIndex "4109"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "these" ; + nif:beginIndex "2208"^^xsd:nonNegativeInteger ; + nif:endIndex "2213"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "11674"^^xsd:nonNegativeInteger ; + nif:endIndex "11676"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "sales" ; + nif:beginIndex "8012"^^xsd:nonNegativeInteger ; + nif:endIndex "8017"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "general" ; + nif:beginIndex "6620"^^xsd:nonNegativeInteger ; + nif:endIndex "6627"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "being" ; + nif:beginIndex "11300"^^xsd:nonNegativeInteger ; + nif:endIndex "11305"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "8763"^^xsd:nonNegativeInteger ; + nif:endIndex "8766"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Vandiver" ; + nif:beginIndex "7503"^^xsd:nonNegativeInteger ; + nif:endIndex "7511"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "produced" ; + nif:beginIndex "95"^^xsd:nonNegativeInteger ; + nif:endIndex "103"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "8596"^^xsd:nonNegativeInteger ; + nif:endIndex "8598"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "7277"^^xsd:nonNegativeInteger ; + nif:endIndex "7279"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "12087"^^xsd:nonNegativeInteger ; + nif:endIndex "12088"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "6162"^^xsd:nonNegativeInteger ; + nif:endIndex "6164"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "``Failure to do this will continue to place a disproportionate burden'' on Fulton taxpayers." ; + nif:beginIndex "2622"^^xsd:nonNegativeInteger ; + nif:endIndex "2714"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "accepted" ; + nif:beginIndex "1227"^^xsd:nonNegativeInteger ; + nif:endIndex "1235"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6090"^^xsd:nonNegativeInteger ; + nif:endIndex "6093"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "issue" ; + nif:beginIndex "8809"^^xsd:nonNegativeInteger ; + nif:endIndex "8814"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11194"^^xsd:nonNegativeInteger ; + nif:endIndex "11195"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11778"^^xsd:nonNegativeInteger ; + nif:endIndex "11779"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "highway" ; + nif:beginIndex "7868"^^xsd:nonNegativeInteger ; + nif:endIndex "7875"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The jury also commented on the Fulton ordinary's court which has been under fire for its practices in the appointment of appraisers, guardians and administrators and the awarding of fees and compensation." ; + nif:beginIndex "2715"^^xsd:nonNegativeInteger ; + nif:endIndex "2919"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "make" ; + nif:beginIndex "7527"^^xsd:nonNegativeInteger ; + nif:endIndex "7531"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "679"^^xsd:nonNegativeInteger ; + nif:endIndex "683"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "9186"^^xsd:nonNegativeInteger ; + nif:endIndex "9188"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "8749"^^xsd:nonNegativeInteger ; + nif:endIndex "8753"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "8071"^^xsd:nonNegativeInteger ; + nif:endIndex "8073"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "6584"^^xsd:nonNegativeInteger ; + nif:endIndex "6586"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "greater" ; + nif:beginIndex "1403"^^xsd:nonNegativeInteger ; + nif:endIndex "1410"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Jr." ; + nif:beginIndex "607"^^xsd:nonNegativeInteger ; + nif:endIndex "610"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "post" ; + nif:beginIndex "11669"^^xsd:nonNegativeInteger ; + nif:endIndex "11673"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "find" ; + nif:beginIndex "811"^^xsd:nonNegativeInteger ; + nif:endIndex "815"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "eliminate" ; + nif:beginIndex "3489"^^xsd:nonNegativeInteger ; + nif:endIndex "3498"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "11618"^^xsd:nonNegativeInteger ; + nif:endIndex "11620"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "9954"^^xsd:nonNegativeInteger ; + nif:endIndex "9956"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "act" ; + nif:beginIndex "961"^^xsd:nonNegativeInteger ; + nif:endIndex "964"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "1341"^^xsd:nonNegativeInteger ; + nif:endIndex "1343"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "should" ; + nif:beginIndex "10591"^^xsd:nonNegativeInteger ; + nif:endIndex "10597"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "million" ; + nif:beginIndex "7642"^^xsd:nonNegativeInteger ; + nif:endIndex "7649"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "2129"^^xsd:nonNegativeInteger ; + nif:endIndex "2130"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "3181"^^xsd:nonNegativeInteger ; + nif:endIndex "3184"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "amendment" ; + nif:beginIndex "9665"^^xsd:nonNegativeInteger ; + nif:endIndex "9674"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Georgia Legislature will wind up its 1961 session Monday and head for home -- where some of the highway bond money it approved will follow shortly." ; + nif:beginIndex "7158"^^xsd:nonNegativeInteger ; + nif:endIndex "7309"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "who" ; + nif:beginIndex "10117"^^xsd:nonNegativeInteger ; + nif:endIndex "10120"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "570"^^xsd:nonNegativeInteger ; + nif:endIndex "573"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Barber" ; + nif:beginIndex "10404"^^xsd:nonNegativeInteger ; + nif:endIndex "10410"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "In the Blue Ridge meeting, the audience was warned that entering a candidate for governor would force it to take petitions out into voting precincts to obtain the signatures of registered voters." ; + nif:beginIndex "5886"^^xsd:nonNegativeInteger ; + nif:endIndex "6081"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "deserves" ; + nif:beginIndex "283"^^xsd:nonNegativeInteger ; + nif:endIndex "291"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "11968"^^xsd:nonNegativeInteger ; + nif:endIndex "11973"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "10829"^^xsd:nonNegativeInteger ; + nif:endIndex "10830"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Mac" ; + nif:beginIndex "9911"^^xsd:nonNegativeInteger ; + nif:endIndex "9914"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "body's" ; + nif:beginIndex "8933"^^xsd:nonNegativeInteger ; + nif:endIndex "8939"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "are" ; + nif:beginIndex "8463"^^xsd:nonNegativeInteger ; + nif:endIndex "8466"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "test" ; + nif:beginIndex "7956"^^xsd:nonNegativeInteger ; + nif:endIndex "7960"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "item" ; + nif:beginIndex "7723"^^xsd:nonNegativeInteger ; + nif:endIndex "7727"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3745"^^xsd:nonNegativeInteger ; + nif:endIndex "3748"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "1327"^^xsd:nonNegativeInteger ; + nif:endIndex "1330"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Hartsfield has been mayor of Atlanta, with exception of one brief interlude, since 1937." ; + nif:beginIndex "5048"^^xsd:nonNegativeInteger ; + nif:endIndex "5136"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "employed" ; + nif:beginIndex "3804"^^xsd:nonNegativeInteger ; + nif:endIndex "3812"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "state" ; + nif:beginIndex "5768"^^xsd:nonNegativeInteger ; + nif:endIndex "5773"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "It urged that the city ``take steps to remedy'' this problem." ; + nif:beginIndex "1599"^^xsd:nonNegativeInteger ; + nif:endIndex "1660"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "5766"^^xsd:nonNegativeInteger ; + nif:endIndex "5767"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "12108"^^xsd:nonNegativeInteger ; + nif:endIndex "12112"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "8658"^^xsd:nonNegativeInteger ; + nif:endIndex "8660"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "10078"^^xsd:nonNegativeInteger ; + nif:endIndex "10081"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "other" ; + nif:beginIndex "3730"^^xsd:nonNegativeInteger ; + nif:endIndex "3735"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "among" ; + nif:beginIndex "1108"^^xsd:nonNegativeInteger ; + nif:endIndex "1113"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Welfare" ; + nif:beginIndex "2160"^^xsd:nonNegativeInteger ; + nif:endIndex "2167"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "1106"^^xsd:nonNegativeInteger ; + nif:endIndex "1107"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "208"^^xsd:nonNegativeInteger ; + nif:endIndex "211"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "superintendent" ; + nif:beginIndex "11571"^^xsd:nonNegativeInteger ; + nif:endIndex "11585"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "8957"^^xsd:nonNegativeInteger ; + nif:endIndex "8959"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "should" ; + nif:beginIndex "7482"^^xsd:nonNegativeInteger ; + nif:endIndex "7488"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "assistant" ; + nif:beginIndex "7021"^^xsd:nonNegativeInteger ; + nif:endIndex "7030"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "any" ; + nif:beginIndex "125"^^xsd:nonNegativeInteger ; + nif:endIndex "128"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "A veteran Jackson County legislator will ask the Georgia House Monday to back federal aid to education, something it has consistently opposed in the past." ; + nif:beginIndex "9751"^^xsd:nonNegativeInteger ; + nif:endIndex "9905"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "courses" ; + nif:beginIndex "6453"^^xsd:nonNegativeInteger ; + nif:endIndex "6460"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "legislator" ; + nif:beginIndex "10146"^^xsd:nonNegativeInteger ; + nif:endIndex "10156"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "7103"^^xsd:nonNegativeInteger ; + nif:endIndex "7105"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "school" ; + nif:beginIndex "10721"^^xsd:nonNegativeInteger ; + nif:endIndex "10727"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "funds" ; + nif:beginIndex "10553"^^xsd:nonNegativeInteger ; + nif:endIndex "10558"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "9090"^^xsd:nonNegativeInteger ; + nif:endIndex "9092"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "primary" ; + nif:beginIndex "556"^^xsd:nonNegativeInteger ; + nif:endIndex "563"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "74" ; + nif:beginIndex "4797"^^xsd:nonNegativeInteger ; + nif:endIndex "4799"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11067"^^xsd:nonNegativeInteger ; + nif:endIndex "11068"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7689"^^xsd:nonNegativeInteger ; + nif:endIndex "7692"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "It recommended that Fulton legislators act ``to have these laws studied and revised to the end of modernizing and improving them''." ; + nif:beginIndex "922"^^xsd:nonNegativeInteger ; + nif:endIndex "1053"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "awarding" ; + nif:beginIndex "2885"^^xsd:nonNegativeInteger ; + nif:endIndex "2893"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "action" ; + nif:beginIndex "8940"^^xsd:nonNegativeInteger ; + nif:endIndex "8946"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "made" ; + nif:beginIndex "12137"^^xsd:nonNegativeInteger ; + nif:endIndex "12141"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "being" ; + nif:beginIndex "7816"^^xsd:nonNegativeInteger ; + nif:endIndex "7821"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "who" ; + nif:beginIndex "10908"^^xsd:nonNegativeInteger ; + nif:endIndex "10911"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ordinary" ; + nif:beginIndex "11827"^^xsd:nonNegativeInteger ; + nif:endIndex "11835"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "allotted" ; + nif:beginIndex "7418"^^xsd:nonNegativeInteger ; + nif:endIndex "7426"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "aid" ; + nif:beginIndex "9837"^^xsd:nonNegativeInteger ; + nif:endIndex "9840"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "federal" ; + nif:beginIndex "10003"^^xsd:nonNegativeInteger ; + nif:endIndex "10010"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Sunday" ; + nif:beginIndex "9495"^^xsd:nonNegativeInteger ; + nif:endIndex "9501"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "upon" ; + nif:beginIndex "4685"^^xsd:nonNegativeInteger ; + nif:endIndex "4689"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "most" ; + nif:beginIndex "8084"^^xsd:nonNegativeInteger ; + nif:endIndex "8088"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "some" ; + nif:beginIndex "10175"^^xsd:nonNegativeInteger ; + nif:endIndex "10179"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "takes" ; + nif:beginIndex "3423"^^xsd:nonNegativeInteger ; + nif:endIndex "3428"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "legislators" ; + nif:beginIndex "3945"^^xsd:nonNegativeInteger ; + nif:endIndex "3956"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "or" ; + nif:beginIndex "9148"^^xsd:nonNegativeInteger ; + nif:endIndex "9150"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "9487"^^xsd:nonNegativeInteger ; + nif:endIndex "9488"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Wednesday" ; + nif:beginIndex "5541"^^xsd:nonNegativeInteger ; + nif:endIndex "5550"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "daughter" ; + nif:beginIndex "4561"^^xsd:nonNegativeInteger ; + nif:endIndex "4569"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "4358"^^xsd:nonNegativeInteger ; + nif:endIndex "4361"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The resolution, which Barber tossed into the House hopper Friday, will be formally read Monday." ; + nif:beginIndex "10382"^^xsd:nonNegativeInteger ; + nif:endIndex "10477"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "received" ; + nif:beginIndex "658"^^xsd:nonNegativeInteger ; + nif:endIndex "666"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "10124"^^xsd:nonNegativeInteger ; + nif:endIndex "10126"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10446"^^xsd:nonNegativeInteger ; + nif:endIndex "10447"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Gov." ; + nif:beginIndex "7498"^^xsd:nonNegativeInteger ; + nif:endIndex "7502"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "fair" ; + nif:beginIndex "4053"^^xsd:nonNegativeInteger ; + nif:endIndex "4057"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "965"^^xsd:nonNegativeInteger ; + nif:endIndex "967"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "passed" ; + nif:beginIndex "9449"^^xsd:nonNegativeInteger ; + nif:endIndex "9455"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "920"^^xsd:nonNegativeInteger ; + nif:endIndex "921"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "5101"^^xsd:nonNegativeInteger ; + nif:endIndex "5103"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "investigate" ; + nif:beginIndex "486"^^xsd:nonNegativeInteger ; + nif:endIndex "497"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "home" ; + nif:beginIndex "4947"^^xsd:nonNegativeInteger ; + nif:endIndex "4951"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "6628"^^xsd:nonNegativeInteger ; + nif:endIndex "6636"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "emphasizing" ; + nif:beginIndex "9253"^^xsd:nonNegativeInteger ; + nif:endIndex "9264"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "612"^^xsd:nonNegativeInteger ; + nif:endIndex "614"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "9749"^^xsd:nonNegativeInteger ; + nif:endIndex "9750"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "D." ; + nif:beginIndex "8864"^^xsd:nonNegativeInteger ; + nif:endIndex "8866"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "been" ; + nif:beginIndex "2780"^^xsd:nonNegativeInteger ; + nif:endIndex "2784"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "aside" ; + nif:beginIndex "9360"^^xsd:nonNegativeInteger ; + nif:endIndex "9365"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Legislature" ; + nif:beginIndex "8563"^^xsd:nonNegativeInteger ; + nif:endIndex "8574"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Pye" ; + nif:beginIndex "479"^^xsd:nonNegativeInteger ; + nif:endIndex "482"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "county" ; + nif:beginIndex "6696"^^xsd:nonNegativeInteger ; + nif:endIndex "6702"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "highway" ; + nif:beginIndex "7258"^^xsd:nonNegativeInteger ; + nif:endIndex "7265"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "good" ; + nif:beginIndex "12142"^^xsd:nonNegativeInteger ; + nif:endIndex "12146"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "10423"^^xsd:nonNegativeInteger ; + nif:endIndex "10426"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "sought" ; + nif:beginIndex "9239"^^xsd:nonNegativeInteger ; + nif:endIndex "9245"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "B." ; + nif:beginIndex "8861"^^xsd:nonNegativeInteger ; + nif:endIndex "8863"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "699"^^xsd:nonNegativeInteger ; + nif:endIndex "702"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "action" ; + nif:beginIndex "9223"^^xsd:nonNegativeInteger ; + nif:endIndex "9229"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "responses" ; + nif:beginIndex "5659"^^xsd:nonNegativeInteger ; + nif:endIndex "5668"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "10357"^^xsd:nonNegativeInteger ; + nif:endIndex "10359"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "11665"^^xsd:nonNegativeInteger ; + nif:endIndex "11668"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "battle" ; + nif:beginIndex "8549"^^xsd:nonNegativeInteger ; + nif:endIndex "8555"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "whether" ; + nif:beginIndex "9151"^^xsd:nonNegativeInteger ; + nif:endIndex "9158"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "11259"^^xsd:nonNegativeInteger ; + nif:endIndex "11263"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "``Everything went real smooth'', the sheriff said." ; + nif:beginIndex "12160"^^xsd:nonNegativeInteger ; + nif:endIndex "12210"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "votes" ; + nif:beginIndex "11039"^^xsd:nonNegativeInteger ; + nif:endIndex "11044"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "3938"^^xsd:nonNegativeInteger ; + nif:endIndex "3944"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "1142"^^xsd:nonNegativeInteger ; + nif:endIndex "1148"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "church" ; + nif:beginIndex "11309"^^xsd:nonNegativeInteger ; + nif:endIndex "11315"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bonds" ; + nif:beginIndex "8835"^^xsd:nonNegativeInteger ; + nif:endIndex "8840"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "or" ; + nif:beginIndex "6645"^^xsd:nonNegativeInteger ; + nif:endIndex "6647"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "political" ; + nif:beginIndex "3499"^^xsd:nonNegativeInteger ; + nif:endIndex "3508"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "9983"^^xsd:nonNegativeInteger ; + nif:endIndex "9985"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Snodgrass" ; + nif:beginIndex "5559"^^xsd:nonNegativeInteger ; + nif:endIndex "5568"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "through" ; + nif:beginIndex "9413"^^xsd:nonNegativeInteger ; + nif:endIndex "9420"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "given" ; + nif:beginIndex "9189"^^xsd:nonNegativeInteger ; + nif:endIndex "9194"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department" ; + nif:beginIndex "8699"^^xsd:nonNegativeInteger ; + nif:endIndex "8709"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "with" ; + nif:beginIndex "8542"^^xsd:nonNegativeInteger ; + nif:endIndex "8546"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "8447"^^xsd:nonNegativeInteger ; + nif:endIndex "8449"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7924"^^xsd:nonNegativeInteger ; + nif:endIndex "7927"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "head" ; + nif:beginIndex "7223"^^xsd:nonNegativeInteger ; + nif:endIndex "7227"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "place" ; + nif:beginIndex "6587"^^xsd:nonNegativeInteger ; + nif:endIndex "6592"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "entering" ; + nif:beginIndex "5942"^^xsd:nonNegativeInteger ; + nif:endIndex "5950"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "chairman" ; + nif:beginIndex "5580"^^xsd:nonNegativeInteger ; + nif:endIndex "5588"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "4058"^^xsd:nonNegativeInteger ; + nif:endIndex "4061"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "11677"^^xsd:nonNegativeInteger ; + nif:endIndex "11678"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11016"^^xsd:nonNegativeInteger ; + nif:endIndex "11017"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "5323"^^xsd:nonNegativeInteger ; + nif:endIndex "5325"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "orderly" ; + nif:beginIndex "1854"^^xsd:nonNegativeInteger ; + nif:endIndex "1861"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jurors" ; + nif:beginIndex "2361"^^xsd:nonNegativeInteger ; + nif:endIndex "2367"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "357"^^xsd:nonNegativeInteger ; + nif:endIndex "362"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "experienced" ; + nif:beginIndex "1526"^^xsd:nonNegativeInteger ; + nif:endIndex "1537"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Highway Department source told The Constitution, however, that Vandiver has not been consulted yet about the plans to issue the new rural roads bonds." ; + nif:beginIndex "8687"^^xsd:nonNegativeInteger ; + nif:endIndex "8841"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "11770"^^xsd:nonNegativeInteger ; + nif:endIndex "11773"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "committee" ; + nif:beginIndex "3122"^^xsd:nonNegativeInteger ; + nif:endIndex "3131"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "policeman" ; + nif:beginIndex "10758"^^xsd:nonNegativeInteger ; + nif:endIndex "10767"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10396"^^xsd:nonNegativeInteger ; + nif:endIndex "10397"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "they" ; + nif:beginIndex "2373"^^xsd:nonNegativeInteger ; + nif:endIndex "2377"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "9980"^^xsd:nonNegativeInteger ; + nif:endIndex "9982"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "7358"^^xsd:nonNegativeInteger ; + nif:endIndex "7360"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Hartsfield" ; + nif:beginIndex "4936"^^xsd:nonNegativeInteger ; + nif:endIndex "4946"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "multi-million-dollar" ; + nif:beginIndex "3341"^^xsd:nonNegativeInteger ; + nif:endIndex "3361"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "11263"^^xsd:nonNegativeInteger ; + nif:endIndex "11264"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "8494"^^xsd:nonNegativeInteger ; + nif:endIndex "8495"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "governments" ; + nif:beginIndex "1287"^^xsd:nonNegativeInteger ; + nif:endIndex "1298"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Gainesville" ; + nif:beginIndex "6992"^^xsd:nonNegativeInteger ; + nif:endIndex "7003"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "8008"^^xsd:nonNegativeInteger ; + nif:endIndex "8011"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "political" ; + nif:beginIndex "5141"^^xsd:nonNegativeInteger ; + nif:endIndex "5150"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "856"^^xsd:nonNegativeInteger ; + nif:endIndex "864"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Sen." ; + nif:beginIndex "5840"^^xsd:nonNegativeInteger ; + nif:endIndex "5844"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "2417"^^xsd:nonNegativeInteger ; + nif:endIndex "2419"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "9969"^^xsd:nonNegativeInteger ; + nif:endIndex "9979"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "267"^^xsd:nonNegativeInteger ; + nif:endIndex "270"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "some" ; + nif:beginIndex "7246"^^xsd:nonNegativeInteger ; + nif:endIndex "7250"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "been" ; + nif:beginIndex "7013"^^xsd:nonNegativeInteger ; + nif:endIndex "7017"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "date" ; + nif:beginIndex "1838"^^xsd:nonNegativeInteger ; + nif:endIndex "1842"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Pelham" ; + nif:beginIndex "9020"^^xsd:nonNegativeInteger ; + nif:endIndex "9026"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "past" ; + nif:beginIndex "9900"^^xsd:nonNegativeInteger ; + nif:endIndex "9904"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Regarding Atlanta's new multi-million-dollar airport, the jury recommended ``that when the new management takes charge Jan. 1 the airport be operated in a manner that will eliminate political influences''." ; + nif:beginIndex "3317"^^xsd:nonNegativeInteger ; + nif:endIndex "3522"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "GOP" ; + nif:beginIndex "5576"^^xsd:nonNegativeInteger ; + nif:endIndex "5579"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "may" ; + nif:beginIndex "1888"^^xsd:nonNegativeInteger ; + nif:endIndex "1891"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "3843"^^xsd:nonNegativeInteger ; + nif:endIndex "3845"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "9787"^^xsd:nonNegativeInteger ; + nif:endIndex "9791"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "10491"^^xsd:nonNegativeInteger ; + nif:endIndex "10493"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Merger proposed" ; + nif:beginIndex "1302"^^xsd:nonNegativeInteger ; + nif:endIndex "1317"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "5588"^^xsd:nonNegativeInteger ; + nif:endIndex "5589"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "11417"^^xsd:nonNegativeInteger ; + nif:endIndex "11425"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "1595"^^xsd:nonNegativeInteger ; + nif:endIndex "1597"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "elected" ; + nif:beginIndex "10744"^^xsd:nonNegativeInteger ; + nif:endIndex "10751"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "4955"^^xsd:nonNegativeInteger ; + nif:endIndex "4957"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "$30" ; + nif:beginIndex "7847"^^xsd:nonNegativeInteger ; + nif:endIndex "7850"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "2521"^^xsd:nonNegativeInteger ; + nif:endIndex "2523"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "5209"^^xsd:nonNegativeInteger ; + nif:endIndex "5212"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "voted" ; + nif:beginIndex "6277"^^xsd:nonNegativeInteger ; + nif:endIndex "6282"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "9648"^^xsd:nonNegativeInteger ; + nif:endIndex "9649"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "will" ; + nif:beginIndex "8874"^^xsd:nonNegativeInteger ; + nif:endIndex "8878"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "action" ; + nif:beginIndex "9578"^^xsd:nonNegativeInteger ; + nif:endIndex "9584"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "6974"^^xsd:nonNegativeInteger ; + nif:endIndex "6976"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "real" ; + nif:beginIndex "12178"^^xsd:nonNegativeInteger ; + nif:endIndex "12182"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "738"^^xsd:nonNegativeInteger ; + nif:endIndex "739"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Monday" ; + nif:beginIndex "9814"^^xsd:nonNegativeInteger ; + nif:endIndex "9820"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "local" ; + nif:beginIndex "11935"^^xsd:nonNegativeInteger ; + nif:endIndex "11940"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "adjustments" ; + nif:beginIndex "7470"^^xsd:nonNegativeInteger ; + nif:endIndex "7481"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "superintendent" ; + nif:beginIndex "10728"^^xsd:nonNegativeInteger ; + nif:endIndex "10742"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "relations" ; + nif:beginIndex "6814"^^xsd:nonNegativeInteger ; + nif:endIndex "6823"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "8170"^^xsd:nonNegativeInteger ; + nif:endIndex "8175"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "2918"^^xsd:nonNegativeInteger ; + nif:endIndex "2919"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "5595"^^xsd:nonNegativeInteger ; + nif:endIndex "5596"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "5023"^^xsd:nonNegativeInteger ; + nif:endIndex "5025"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "anonymous" ; + nif:beginIndex "11800"^^xsd:nonNegativeInteger ; + nif:endIndex "11809"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4480"^^xsd:nonNegativeInteger ; + nif:endIndex "4481"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "10486"^^xsd:nonNegativeInteger ; + nif:endIndex "10490"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "Gov. Vandiver is expected to make the traditional visit to both chambers as they work toward adjournment." ; + nif:beginIndex "7498"^^xsd:nonNegativeInteger ; + nif:endIndex "7603"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "It" ; + nif:beginIndex "1599"^^xsd:nonNegativeInteger ; + nif:endIndex "1601"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "B." ; + nif:beginIndex "4333"^^xsd:nonNegativeInteger ; + nif:endIndex "4335"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "plan" ; + nif:beginIndex "4082"^^xsd:nonNegativeInteger ; + nif:endIndex "4086"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "1731"^^xsd:nonNegativeInteger ; + nif:endIndex "1733"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bit" ; + nif:beginIndex "11371"^^xsd:nonNegativeInteger ; + nif:endIndex "11374"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "E." ; + nif:beginIndex "4962"^^xsd:nonNegativeInteger ; + nif:endIndex "4964"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "3"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "2969"^^xsd:nonNegativeInteger ; + nif:endIndex "2971"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "populous" ; + nif:beginIndex "2471"^^xsd:nonNegativeInteger ; + nif:endIndex "2479"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "11389"^^xsd:nonNegativeInteger ; + nif:endIndex "11392"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Fulton" ; + nif:beginIndex "2087"^^xsd:nonNegativeInteger ; + nif:endIndex "2093"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2881"^^xsd:nonNegativeInteger ; + nif:endIndex "2884"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "1,119" ; + nif:beginIndex "11033"^^xsd:nonNegativeInteger ; + nif:endIndex "11038"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "governor" ; + nif:beginIndex "5967"^^xsd:nonNegativeInteger ; + nif:endIndex "5975"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "They have a son, William Berry Jr., and a daughter, Mrs. J. M. Cheshire of Griffin." ; + nif:beginIndex "4519"^^xsd:nonNegativeInteger ; + nif:endIndex "4602"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "specifically" ; + nif:beginIndex "10334"^^xsd:nonNegativeInteger ; + nif:endIndex "10346"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "been" ; + nif:beginIndex "8771"^^xsd:nonNegativeInteger ; + nif:endIndex "8775"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "portion" ; + nif:beginIndex "2569"^^xsd:nonNegativeInteger ; + nif:endIndex "2576"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "4850"^^xsd:nonNegativeInteger ; + nif:endIndex "4854"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "A revolving fund" ; + nif:beginIndex "8286"^^xsd:nonNegativeInteger ; + nif:endIndex "8302"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "polls" ; + nif:beginIndex "11280"^^xsd:nonNegativeInteger ; + nif:endIndex "11285"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "County" ; + nif:beginIndex "10979"^^xsd:nonNegativeInteger ; + nif:endIndex "10985"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "87-31" ; + nif:beginIndex "9421"^^xsd:nonNegativeInteger ; + nif:endIndex "9426"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "A" ; + nif:beginIndex "8116"^^xsd:nonNegativeInteger ; + nif:endIndex "8117"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "2950"^^xsd:nonNegativeInteger ; + nif:endIndex "2952"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "couple" ; + nif:beginIndex "4864"^^xsd:nonNegativeInteger ; + nif:endIndex "4870"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Atlanta" ; + nif:beginIndex "4163"^^xsd:nonNegativeInteger ; + nif:endIndex "4170"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "670"^^xsd:nonNegativeInteger ; + nif:endIndex "673"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "under" ; + nif:beginIndex "2785"^^xsd:nonNegativeInteger ; + nif:endIndex "2790"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "legislator" ; + nif:beginIndex "9776"^^xsd:nonNegativeInteger ; + nif:endIndex "9786"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "11110"^^xsd:nonNegativeInteger ; + nif:endIndex "11111"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7392"^^xsd:nonNegativeInteger ; + nif:endIndex "7395"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "age" ; + nif:beginIndex "4790"^^xsd:nonNegativeInteger ; + nif:endIndex "4793"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "6035"^^xsd:nonNegativeInteger ; + nif:endIndex "6037"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "1392"^^xsd:nonNegativeInteger ; + nif:endIndex "1394"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "repealed" ; + nif:beginIndex "9130"^^xsd:nonNegativeInteger ; + nif:endIndex "9138"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "work" ; + nif:beginIndex "8058"^^xsd:nonNegativeInteger ; + nif:endIndex "8062"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The City Purchasing Department, the jury said, ``is lacking in experienced clerical personnel as a result of city personnel policies''." ; + nif:beginIndex "1463"^^xsd:nonNegativeInteger ; + nif:endIndex "1598"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "reduce" ; + nif:beginIndex "1426"^^xsd:nonNegativeInteger ; + nif:endIndex "1432"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "like" ; + nif:beginIndex "10244"^^xsd:nonNegativeInteger ; + nif:endIndex "10248"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "418"^^xsd:nonNegativeInteger ; + nif:endIndex "422"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "11265"^^xsd:nonNegativeInteger ; + nif:endIndex "11267"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "5402"^^xsd:nonNegativeInteger ; + nif:endIndex "5405"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "granted" ; + nif:beginIndex "1995"^^xsd:nonNegativeInteger ; + nif:endIndex "2002"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9459"^^xsd:nonNegativeInteger ; + nif:endIndex "9462"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6495"^^xsd:nonNegativeInteger ; + nif:endIndex "6498"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "approved" ; + nif:beginIndex "7669"^^xsd:nonNegativeInteger ; + nif:endIndex "7677"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "4762"^^xsd:nonNegativeInteger ; + nif:endIndex "4764"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "rural" ; + nif:beginIndex "8631"^^xsd:nonNegativeInteger ; + nif:endIndex "8636"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "9480"^^xsd:nonNegativeInteger ; + nif:endIndex "9482"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "683"^^xsd:nonNegativeInteger ; + nif:endIndex "684"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "506"^^xsd:nonNegativeInteger ; + nif:endIndex "508"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "6552"^^xsd:nonNegativeInteger ; + nif:endIndex "6556"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "law" ; + nif:beginIndex "1884"^^xsd:nonNegativeInteger ; + nif:endIndex "1887"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Superior" ; + nif:beginIndex "450"^^xsd:nonNegativeInteger ; + nif:endIndex "458"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "5521"^^xsd:nonNegativeInteger ; + nif:endIndex "5522"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "10542"^^xsd:nonNegativeInteger ; + nif:endIndex "10544"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "procedures" ; + nif:beginIndex "3007"^^xsd:nonNegativeInteger ; + nif:endIndex "3017"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "by" ; + nif:beginIndex "8476"^^xsd:nonNegativeInteger ; + nif:endIndex "8478"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "issue" ; + nif:beginIndex "7831"^^xsd:nonNegativeInteger ; + nif:endIndex "7836"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "A Highway Department source said there also is a plan there to issue some $3 million to $4 million worth of Rural Roads Authority bonds for rural road construction work." ; + nif:beginIndex "8116"^^xsd:nonNegativeInteger ; + nif:endIndex "8285"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "3922"^^xsd:nonNegativeInteger ; + nif:endIndex "3925"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "smell" ; + nif:beginIndex "11326"^^xsd:nonNegativeInteger ; + nif:endIndex "11331"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "10628"^^xsd:nonNegativeInteger ; + nif:endIndex "10630"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "9934"^^xsd:nonNegativeInteger ; + nif:endIndex "9936"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "610"^^xsd:nonNegativeInteger ; + nif:endIndex "611"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "(" ; + nif:beginIndex "3934"^^xsd:nonNegativeInteger ; + nif:endIndex "3935"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "modernizing" ; + nif:beginIndex "1020"^^xsd:nonNegativeInteger ; + nif:endIndex "1031"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "petition" ; + nif:beginIndex "4841"^^xsd:nonNegativeInteger ; + nif:endIndex "4849"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "had" ; + nif:beginIndex "244"^^xsd:nonNegativeInteger ; + nif:endIndex "247"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "While" ; + nif:beginIndex "9247"^^xsd:nonNegativeInteger ; + nif:endIndex "9252"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "9245"^^xsd:nonNegativeInteger ; + nif:endIndex "9246"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "can" ; + nif:beginIndex "9123"^^xsd:nonNegativeInteger ; + nif:endIndex "9126"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "6643"^^xsd:nonNegativeInteger ; + nif:endIndex "6644"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "7686"^^xsd:nonNegativeInteger ; + nif:endIndex "7688"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "7347"^^xsd:nonNegativeInteger ; + nif:endIndex "7350"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Five" ; + nif:beginIndex "6478"^^xsd:nonNegativeInteger ; + nif:endIndex "6482"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "12079"^^xsd:nonNegativeInteger ; + nif:endIndex "12087"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "5268"^^xsd:nonNegativeInteger ; + nif:endIndex "5270"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "law" ; + nif:beginIndex "1706"^^xsd:nonNegativeInteger ; + nif:endIndex "1709"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "264"^^xsd:nonNegativeInteger ; + nif:endIndex "266"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "13th" ; + nif:beginIndex "10131"^^xsd:nonNegativeInteger ; + nif:endIndex "10135"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "654"^^xsd:nonNegativeInteger ; + nif:endIndex "657"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "wards" ; + nif:beginIndex "3207"^^xsd:nonNegativeInteger ; + nif:endIndex "3212"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Board" ; + nif:beginIndex "10572"^^xsd:nonNegativeInteger ; + nif:endIndex "10577"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The mayor's present term of office expires Jan. 1." ; + nif:beginIndex "5209"^^xsd:nonNegativeInteger ; + nif:endIndex "5259"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "1813"^^xsd:nonNegativeInteger ; + nif:endIndex "1816"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "1017"^^xsd:nonNegativeInteger ; + nif:endIndex "1019"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "orderly" ; + nif:beginIndex "12071"^^xsd:nonNegativeInteger ; + nif:endIndex "12078"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "provide" ; + nif:beginIndex "10520"^^xsd:nonNegativeInteger ; + nif:endIndex "10527"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "term-end" ; + nif:beginIndex "181"^^xsd:nonNegativeInteger ; + nif:endIndex "189"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "367"^^xsd:nonNegativeInteger ; + nif:endIndex "375"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "they" ; + nif:beginIndex "7574"^^xsd:nonNegativeInteger ; + nif:endIndex "7578"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "``" ; + nif:beginIndex "3133"^^xsd:nonNegativeInteger ; + nif:endIndex "3135"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Tuesday" ; + nif:beginIndex "5610"^^xsd:nonNegativeInteger ; + nif:endIndex "5617"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "5495"^^xsd:nonNegativeInteger ; + nif:endIndex "5498"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "state" ; + nif:beginIndex "10096"^^xsd:nonNegativeInteger ; + nif:endIndex "10101"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "being" ; + nif:beginIndex "9543"^^xsd:nonNegativeInteger ; + nif:endIndex "9548"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Rural" ; + nif:beginIndex "8224"^^xsd:nonNegativeInteger ; + nif:endIndex "8229"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Four" ; + nif:beginIndex "3776"^^xsd:nonNegativeInteger ; + nif:endIndex "3780"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Mayor" ; + nif:beginIndex "4319"^^xsd:nonNegativeInteger ; + nif:endIndex "4324"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "It" ; + nif:beginIndex "1753"^^xsd:nonNegativeInteger ; + nif:endIndex "1755"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4188"^^xsd:nonNegativeInteger ; + nif:endIndex "4189"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "many" ; + nif:beginIndex "821"^^xsd:nonNegativeInteger ; + nif:endIndex "825"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4317"^^xsd:nonNegativeInteger ; + nif:endIndex "4318"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "5168"^^xsd:nonNegativeInteger ; + nif:endIndex "5170"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "2796"^^xsd:nonNegativeInteger ; + nif:endIndex "2799"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "6557"^^xsd:nonNegativeInteger ; + nif:endIndex "6560"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "airport" ; + nif:beginIndex "3362"^^xsd:nonNegativeInteger ; + nif:endIndex "3369"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "March" ; + nif:beginIndex "11627"^^xsd:nonNegativeInteger ; + nif:endIndex "11632"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "10751"^^xsd:nonNegativeInteger ; + nif:endIndex "10752"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "adjournment" ; + nif:beginIndex "7317"^^xsd:nonNegativeInteger ; + nif:endIndex "7328"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "3555"^^xsd:nonNegativeInteger ; + nif:endIndex "3557"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "29-5" ; + nif:beginIndex "9483"^^xsd:nonNegativeInteger ; + nif:endIndex "9487"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "11332"^^xsd:nonNegativeInteger ; + nif:endIndex "11333"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "1300"^^xsd:nonNegativeInteger ; + nif:endIndex "1301"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "2490"^^xsd:nonNegativeInteger ; + nif:endIndex "2491"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "offer" ; + nif:beginIndex "8879"^^xsd:nonNegativeInteger ; + nif:endIndex "8884"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "soon" ; + nif:beginIndex "11892"^^xsd:nonNegativeInteger ; + nif:endIndex "11896"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "8176"^^xsd:nonNegativeInteger ; + nif:endIndex "8178"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "7436"^^xsd:nonNegativeInteger ; + nif:endIndex "7439"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "continue" ; + nif:beginIndex "2648"^^xsd:nonNegativeInteger ; + nif:endIndex "2656"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "11385"^^xsd:nonNegativeInteger ; + nif:endIndex "11387"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "9527"^^xsd:nonNegativeInteger ; + nif:endIndex "9529"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "6371"^^xsd:nonNegativeInteger ; + nif:endIndex "6373"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Wards" ; + nif:beginIndex "2920"^^xsd:nonNegativeInteger ; + nif:endIndex "2925"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "1710"^^xsd:nonNegativeInteger ; + nif:endIndex "1713"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "have" ; + nif:beginIndex "970"^^xsd:nonNegativeInteger ; + nif:endIndex "974"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "strong" ; + nif:beginIndex "5450"^^xsd:nonNegativeInteger ; + nif:endIndex "5456"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "had" ; + nif:beginIndex "423"^^xsd:nonNegativeInteger ; + nif:endIndex "426"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "3455"^^xsd:nonNegativeInteger ; + nif:endIndex "3457"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "permit" ; + nif:beginIndex "12033"^^xsd:nonNegativeInteger ; + nif:endIndex "12039"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "relative" ; + nif:beginIndex "621"^^xsd:nonNegativeInteger ; + nif:endIndex "629"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2064"^^xsd:nonNegativeInteger ; + nif:endIndex "2067"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "available" ; + nif:beginIndex "2586"^^xsd:nonNegativeInteger ; + nif:endIndex "2595"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Jr." ; + nif:beginIndex "5295"^^xsd:nonNegativeInteger ; + nif:endIndex "5298"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "shot" ; + nif:beginIndex "11605"^^xsd:nonNegativeInteger ; + nif:endIndex "11609"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "10578"^^xsd:nonNegativeInteger ; + nif:endIndex "10580"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "thanks" ; + nif:beginIndex "307"^^xsd:nonNegativeInteger ; + nif:endIndex "313"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "congressmen" ; + nif:beginIndex "10322"^^xsd:nonNegativeInteger ; + nif:endIndex "10333"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "quickie" ; + nif:beginIndex "9092"^^xsd:nonNegativeInteger ; + nif:endIndex "9099"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "take" ; + nif:beginIndex "5994"^^xsd:nonNegativeInteger ; + nif:endIndex "5998"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "intern" ; + nif:beginIndex "3863"^^xsd:nonNegativeInteger ; + nif:endIndex "3869"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "source" ; + nif:beginIndex "8710"^^xsd:nonNegativeInteger ; + nif:endIndex "8716"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "9380"^^xsd:nonNegativeInteger ; + nif:endIndex "9390"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "1493"^^xsd:nonNegativeInteger ; + nif:endIndex "1494"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "5471"^^xsd:nonNegativeInteger ; + nif:endIndex "5473"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "7512"^^xsd:nonNegativeInteger ; + nif:endIndex "7514"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "he" ; + nif:beginIndex "11903"^^xsd:nonNegativeInteger ; + nif:endIndex "11905"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Williams" ; + nif:beginIndex "12010"^^xsd:nonNegativeInteger ; + nif:endIndex "12018"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "too" ; + nif:beginIndex "11854"^^xsd:nonNegativeInteger ; + nif:endIndex "11857"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "million" ; + nif:beginIndex "7851"^^xsd:nonNegativeInteger ; + nif:endIndex "7858"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Jan." ; + nif:beginIndex "5252"^^xsd:nonNegativeInteger ; + nif:endIndex "5256"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "election" ; + nif:beginIndex "11059"^^xsd:nonNegativeInteger ; + nif:endIndex "11067"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "1331"^^xsd:nonNegativeInteger ; + nif:endIndex "1335"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "2858"^^xsd:nonNegativeInteger ; + nif:endIndex "2861"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "candidate" ; + nif:beginIndex "5953"^^xsd:nonNegativeInteger ; + nif:endIndex "5962"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "education" ; + nif:beginIndex "10030"^^xsd:nonNegativeInteger ; + nif:endIndex "10039"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "for" ; + nif:beginIndex "5963"^^xsd:nonNegativeInteger ; + nif:endIndex "5966"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Failure" ; + nif:beginIndex "2624"^^xsd:nonNegativeInteger ; + nif:endIndex "2631"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "from" ; + nif:beginIndex "3269"^^xsd:nonNegativeInteger ; + nif:endIndex "3273"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "5550"^^xsd:nonNegativeInteger ; + nif:endIndex "5551"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "4810"^^xsd:nonNegativeInteger ; + nif:endIndex "4812"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "irregularities" ; + nif:beginIndex "520"^^xsd:nonNegativeInteger ; + nif:endIndex "534"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "William" ; + nif:beginIndex "4536"^^xsd:nonNegativeInteger ; + nif:endIndex "4543"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "threats" ; + nif:beginIndex "11525"^^xsd:nonNegativeInteger ; + nif:endIndex "11532"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "child" ; + nif:beginIndex "2007"^^xsd:nonNegativeInteger ; + nif:endIndex "2012"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "didn't" ; + nif:beginIndex "11357"^^xsd:nonNegativeInteger ; + nif:endIndex "11363"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Davis" ; + nif:beginIndex "11018"^^xsd:nonNegativeInteger ; + nif:endIndex "11023"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "10093"^^xsd:nonNegativeInteger ; + nif:endIndex "10095"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "6650"^^xsd:nonNegativeInteger ; + nif:endIndex "6653"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Construction" ; + nif:beginIndex "7729"^^xsd:nonNegativeInteger ; + nif:endIndex "7741"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "interlude" ; + nif:beginIndex "5114"^^xsd:nonNegativeInteger ; + nif:endIndex "5123"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Alpharetta" ; + nif:beginIndex "4245"^^xsd:nonNegativeInteger ; + nif:endIndex "4255"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "rejected" ; + nif:beginIndex "9639"^^xsd:nonNegativeInteger ; + nif:endIndex "9647"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "11426"^^xsd:nonNegativeInteger ; + nif:endIndex "11429"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "as" ; + nif:beginIndex "10753"^^xsd:nonNegativeInteger ; + nif:endIndex "10755"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The grand jury commented on a number of other topics, among them the Atlanta and Fulton County purchasing departments which it said ``are well operated and follow generally accepted practices which inure to the best interest of both governments''." ; + nif:beginIndex "1054"^^xsd:nonNegativeInteger ; + nif:endIndex "1301"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "8534"^^xsd:nonNegativeInteger ; + nif:endIndex "8536"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "death" ; + nif:beginIndex "11621"^^xsd:nonNegativeInteger ; + nif:endIndex "11626"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "likely" ; + nif:beginIndex "7613"^^xsd:nonNegativeInteger ; + nif:endIndex "7619"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "5889"^^xsd:nonNegativeInteger ; + nif:endIndex "5892"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department's" ; + nif:beginIndex "1956"^^xsd:nonNegativeInteger ; + nif:endIndex "1968"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "did" ; + nif:beginIndex "807"^^xsd:nonNegativeInteger ; + nif:endIndex "810"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "would" ; + nif:beginIndex "9166"^^xsd:nonNegativeInteger ; + nif:endIndex "9171"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "4627"^^xsd:nonNegativeInteger ; + nif:endIndex "4631"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "polls" ; + nif:beginIndex "11149"^^xsd:nonNegativeInteger ; + nif:endIndex "11154"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "increase" ; + nif:beginIndex "9114"^^xsd:nonNegativeInteger ; + nif:endIndex "9122"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "967"^^xsd:nonNegativeInteger ; + nif:endIndex "969"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "--" ; + nif:beginIndex "6286"^^xsd:nonNegativeInteger ; + nif:endIndex "6288"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4553"^^xsd:nonNegativeInteger ; + nif:endIndex "4554"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "unmeritorious" ; + nif:beginIndex "3274"^^xsd:nonNegativeInteger ; + nif:endIndex "3287"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "When the gubernatorial campaign starts, Caldwell is expected to become a campaign coordinator for Byrd." ; + nif:beginIndex "7054"^^xsd:nonNegativeInteger ; + nif:endIndex "7157"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "4978"^^xsd:nonNegativeInteger ; + nif:endIndex "4979"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "3162"^^xsd:nonNegativeInteger ; + nif:endIndex "3164"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "is" ; + nif:beginIndex "10121"^^xsd:nonNegativeInteger ; + nif:endIndex "10123"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "2601"^^xsd:nonNegativeInteger ; + nif:endIndex "2603"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "10562"^^xsd:nonNegativeInteger ; + nif:endIndex "10565"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "9099"^^xsd:nonNegativeInteger ; + nif:endIndex "9101"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "fact" ; + nif:beginIndex "3176"^^xsd:nonNegativeInteger ; + nif:endIndex "3180"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "items" ; + nif:beginIndex "2074"^^xsd:nonNegativeInteger ; + nif:endIndex "2079"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "first" ; + nif:beginIndex "7708"^^xsd:nonNegativeInteger ; + nif:endIndex "7713"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "congressional" ; + nif:beginIndex "10195"^^xsd:nonNegativeInteger ; + nif:endIndex "10208"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "federal" ; + nif:beginIndex "10545"^^xsd:nonNegativeInteger ; + nif:endIndex "10552"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "his" ; + nif:beginIndex "7704"^^xsd:nonNegativeInteger ; + nif:endIndex "7707"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "5568"^^xsd:nonNegativeInteger ; + nif:endIndex "5569"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "2290"^^xsd:nonNegativeInteger ; + nif:endIndex "2293"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "determine" ; + nif:beginIndex "7455"^^xsd:nonNegativeInteger ; + nif:endIndex "7464"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "5200"^^xsd:nonNegativeInteger ; + nif:endIndex "5202"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "3584"^^xsd:nonNegativeInteger ; + nif:endIndex "3586"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "this" ; + nif:beginIndex "10816"^^xsd:nonNegativeInteger ; + nif:endIndex "10820"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "10715"^^xsd:nonNegativeInteger ; + nif:endIndex "10716"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "10309"^^xsd:nonNegativeInteger ; + nif:endIndex "10311"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "2894"^^xsd:nonNegativeInteger ; + nif:endIndex "2896"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Rep." ; + nif:beginIndex "9906"^^xsd:nonNegativeInteger ; + nif:endIndex "9910"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "some" ; + nif:beginIndex "6942"^^xsd:nonNegativeInteger ; + nif:endIndex "6946"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "operated" ; + nif:beginIndex "1197"^^xsd:nonNegativeInteger ; + nif:endIndex "1205"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "5911"^^xsd:nonNegativeInteger ; + nif:endIndex "5912"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "7052"^^xsd:nonNegativeInteger ; + nif:endIndex "7053"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resigned" ; + nif:beginIndex "6834"^^xsd:nonNegativeInteger ; + nif:endIndex "6842"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "warning" ; + nif:beginIndex "6094"^^xsd:nonNegativeInteger ; + nif:endIndex "6101"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Atlanta's" ; + nif:beginIndex "3327"^^xsd:nonNegativeInteger ; + nif:endIndex "3336"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "3131"^^xsd:nonNegativeInteger ; + nif:endIndex "3132"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "added" ; + nif:beginIndex "3558"^^xsd:nonNegativeInteger ; + nif:endIndex "3563"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "``These actions should serve to protect in fact and in effect the court's wards from undue costs and its appointed and elected servants from unmeritorious criticisms'', the jury said." ; + nif:beginIndex "3133"^^xsd:nonNegativeInteger ; + nif:endIndex "3316"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bond" ; + nif:beginIndex "7658"^^xsd:nonNegativeInteger ; + nif:endIndex "7662"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "fit" ; + nif:beginIndex "2190"^^xsd:nonNegativeInteger ; + nif:endIndex "2193"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "proposed" ; + nif:beginIndex "8649"^^xsd:nonNegativeInteger ; + nif:endIndex "8657"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "1751"^^xsd:nonNegativeInteger ; + nif:endIndex "1752"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "1050"^^xsd:nonNegativeInteger ; + nif:endIndex "1052"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "coordinator" ; + nif:beginIndex "7136"^^xsd:nonNegativeInteger ; + nif:endIndex "7147"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Association" ; + nif:beginIndex "3086"^^xsd:nonNegativeInteger ; + nif:endIndex "3097"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "system" ; + nif:beginIndex "6720"^^xsd:nonNegativeInteger ; + nif:endIndex "6726"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "act" ; + nif:beginIndex "7994"^^xsd:nonNegativeInteger ; + nif:endIndex "7997"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "4241"^^xsd:nonNegativeInteger ; + nif:endIndex "4244"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "11350"^^xsd:nonNegativeInteger ; + nif:endIndex "11353"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Ivan" ; + nif:beginIndex "5284"^^xsd:nonNegativeInteger ; + nif:endIndex "5288"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Berry" ; + nif:beginIndex "4544"^^xsd:nonNegativeInteger ; + nif:endIndex "4549"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "State" ; + nif:beginIndex "1942"^^xsd:nonNegativeInteger ; + nif:endIndex "1947"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "2877"^^xsd:nonNegativeInteger ; + nif:endIndex "2880"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Georgia's" ; + nif:beginIndex "10312"^^xsd:nonNegativeInteger ; + nif:endIndex "10321"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "2694"^^xsd:nonNegativeInteger ; + nif:endIndex "2696"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Before" ; + nif:beginIndex "7310"^^xsd:nonNegativeInteger ; + nif:endIndex "7316"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "3527"^^xsd:nonNegativeInteger ; + nif:endIndex "3531"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "5258"^^xsd:nonNegativeInteger ; + nif:endIndex "5259"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7966"^^xsd:nonNegativeInteger ; + nif:endIndex "7968"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "6851"^^xsd:nonNegativeInteger ; + nif:endIndex "6853"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "House" ; + nif:beginIndex "9401"^^xsd:nonNegativeInteger ; + nif:endIndex "9406"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "warned" ; + nif:beginIndex "5930"^^xsd:nonNegativeInteger ; + nif:endIndex "5936"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "4071"^^xsd:nonNegativeInteger ; + nif:endIndex "4073"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "resolution" ; + nif:beginIndex "9331"^^xsd:nonNegativeInteger ; + nif:endIndex "9341"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Hartsfield home is at 637 E. Pelham Rd. Aj." ; + nif:beginIndex "4932"^^xsd:nonNegativeInteger ; + nif:endIndex "4979"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "its" ; + nif:beginIndex "3234"^^xsd:nonNegativeInteger ; + nif:endIndex "3237"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "legislators" ; + nif:beginIndex "9684"^^xsd:nonNegativeInteger ; + nif:endIndex "9695"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "2691"^^xsd:nonNegativeInteger ; + nif:endIndex "2693"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "2666"^^xsd:nonNegativeInteger ; + nif:endIndex "2667"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "charged" ; + nif:beginIndex "4458"^^xsd:nonNegativeInteger ; + nif:endIndex "4465"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "8918"^^xsd:nonNegativeInteger ; + nif:endIndex "8920"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "8556"^^xsd:nonNegativeInteger ; + nif:endIndex "8558"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "--" ; + nif:beginIndex "7237"^^xsd:nonNegativeInteger ; + nif:endIndex "7239"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "John" ; + nif:beginIndex "5845"^^xsd:nonNegativeInteger ; + nif:endIndex "5849"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "of" ; + nif:beginIndex "774"^^xsd:nonNegativeInteger ; + nif:endIndex "776"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "9353"^^xsd:nonNegativeInteger ; + nif:endIndex "9355"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "suit" ; + nif:beginIndex "7961"^^xsd:nonNegativeInteger ; + nif:endIndex "7965"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "11845"^^xsd:nonNegativeInteger ; + nif:endIndex "11849"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "6751"^^xsd:nonNegativeInteger ; + nif:endIndex "6753"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "120"^^xsd:nonNegativeInteger ; + nif:endIndex "124"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Washington" ; + nif:beginIndex "10223"^^xsd:nonNegativeInteger ; + nif:endIndex "10233"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "might" ; + nif:beginIndex "2432"^^xsd:nonNegativeInteger ; + nif:endIndex "2437"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "work" ; + nif:beginIndex "3959"^^xsd:nonNegativeInteger ; + nif:endIndex "3963"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "The" ; + nif:beginIndex "7898"^^xsd:nonNegativeInteger ; + nif:endIndex "7901"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "investigation" ; + nif:beginIndex "44"^^xsd:nonNegativeInteger ; + nif:endIndex "57"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "7381"^^xsd:nonNegativeInteger ; + nif:endIndex "7382"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "influences" ; + nif:beginIndex "3509"^^xsd:nonNegativeInteger ; + nif:endIndex "3519"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Department" ; + nif:beginIndex "6796"^^xsd:nonNegativeInteger ; + nif:endIndex "6806"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Republicans must hold a primary under the county unit system -- a system which the party opposes in its platform." ; + nif:beginIndex "6650"^^xsd:nonNegativeInteger ; + nif:endIndex "6767"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "899"^^xsd:nonNegativeInteger ; + nif:endIndex "902"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "8" ; + nif:beginIndex "5797"^^xsd:nonNegativeInteger ; + nif:endIndex "5798"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "It was marked by controversy, anonymous midnight phone calls and veiled threats of violence." ; + nif:beginIndex "11453"^^xsd:nonNegativeInteger ; + nif:endIndex "11545"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "roads" ; + nif:beginIndex "8637"^^xsd:nonNegativeInteger ; + nif:endIndex "8642"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "bonds" ; + nif:beginIndex "8404"^^xsd:nonNegativeInteger ; + nif:endIndex "8409"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "doctor" ; + nif:beginIndex "3847"^^xsd:nonNegativeInteger ; + nif:endIndex "3853"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "at" ; + nif:beginIndex "11273"^^xsd:nonNegativeInteger ; + nif:endIndex "11275"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "out" ; + nif:beginIndex "9310"^^xsd:nonNegativeInteger ; + nif:endIndex "9313"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "7945"^^xsd:nonNegativeInteger ; + nif:endIndex "7946"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Before adjournment Monday afternoon, the Senate is expected to approve a study of the number of legislators allotted to rural and urban areas to determine what adjustments should be made." ; + nif:beginIndex "7310"^^xsd:nonNegativeInteger ; + nif:endIndex "7497"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "1246"^^xsd:nonNegativeInteger ; + nif:endIndex "1251"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "policies" ; + nif:beginIndex "1587"^^xsd:nonNegativeInteger ; + nif:endIndex "1595"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "and" ; + nif:beginIndex "3902"^^xsd:nonNegativeInteger ; + nif:endIndex "3905"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "also" ; + nif:beginIndex "1714"^^xsd:nonNegativeInteger ; + nif:endIndex "1718"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "delegation" ; + nif:beginIndex "10209"^^xsd:nonNegativeInteger ; + nif:endIndex "10219"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "it" ; + nif:beginIndex "804"^^xsd:nonNegativeInteger ; + nif:endIndex "806"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "feel" ; + nif:beginIndex "2511"^^xsd:nonNegativeInteger ; + nif:endIndex "2515"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "these" ; + nif:beginIndex "975"^^xsd:nonNegativeInteger ; + nif:endIndex "980"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Williams" ; + nif:beginIndex "11102"^^xsd:nonNegativeInteger ; + nif:endIndex "11110"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "12193"^^xsd:nonNegativeInteger ; + nif:endIndex "12196"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "pay" ; + nif:beginIndex "10642"^^xsd:nonNegativeInteger ; + nif:endIndex "10645"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "rescind" ; + nif:beginIndex "8921"^^xsd:nonNegativeInteger ; + nif:endIndex "8928"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "1905"^^xsd:nonNegativeInteger ; + nif:endIndex "1906"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "9675"^^xsd:nonNegativeInteger ; + nif:endIndex "9677"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Friday" ; + nif:beginIndex "4437"^^xsd:nonNegativeInteger ; + nif:endIndex "4443"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Henry" ; + nif:beginIndex "4980"^^xsd:nonNegativeInteger ; + nif:endIndex "4985"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "through" ; + nif:beginIndex "2220"^^xsd:nonNegativeInteger ; + nif:endIndex "2227"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "12244"^^xsd:nonNegativeInteger ; + nif:endIndex "12245"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4268"^^xsd:nonNegativeInteger ; + nif:endIndex "4269"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "9868"^^xsd:nonNegativeInteger ; + nif:endIndex "9871"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "privilege" ; + nif:beginIndex "9370"^^xsd:nonNegativeInteger ; + nif:endIndex "9379"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Superior" ; + nif:beginIndex "4422"^^xsd:nonNegativeInteger ; + nif:endIndex "4430"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "6386"^^xsd:nonNegativeInteger ; + nif:endIndex "6391"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "be" ; + nif:beginIndex "5861"^^xsd:nonNegativeInteger ; + nif:endIndex "5863"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "3707"^^xsd:nonNegativeInteger ; + nif:endIndex "3708"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "operation" ; + nif:beginIndex "4146"^^xsd:nonNegativeInteger ; + nif:endIndex "4155"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "." ; + nif:beginIndex "9018"^^xsd:nonNegativeInteger ; + nif:endIndex "9019"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "619"^^xsd:nonNegativeInteger ; + nif:endIndex "620"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "first" ; + nif:beginIndex "6418"^^xsd:nonNegativeInteger ; + nif:endIndex "6423"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "a" ; + nif:beginIndex "6416"^^xsd:nonNegativeInteger ; + nif:endIndex "6417"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Purchasing" ; + nif:beginIndex "1472"^^xsd:nonNegativeInteger ; + nif:endIndex "1482"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "402" ; + nif:beginIndex "11082"^^xsd:nonNegativeInteger ; + nif:endIndex "11085"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "there" ; + nif:beginIndex "10163"^^xsd:nonNegativeInteger ; + nif:endIndex "10168"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "funds" ; + nif:beginIndex "1807"^^xsd:nonNegativeInteger ; + nif:endIndex "1812"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "court" ; + nif:beginIndex "2764"^^xsd:nonNegativeInteger ; + nif:endIndex "2769"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "state" ; + nif:beginIndex "5570"^^xsd:nonNegativeInteger ; + nif:endIndex "5575"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "on" ; + nif:beginIndex "9704"^^xsd:nonNegativeInteger ; + nif:endIndex "9706"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "gun" ; + nif:beginIndex "12051"^^xsd:nonNegativeInteger ; + nif:endIndex "12054"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "program" ; + nif:beginIndex "2120"^^xsd:nonNegativeInteger ; + nif:endIndex "2127"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "have" ; + nif:beginIndex "6358"^^xsd:nonNegativeInteger ; + nif:endIndex "6362"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "vote" ; + nif:beginIndex "6125"^^xsd:nonNegativeInteger ; + nif:endIndex "6129"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "mention" ; + nif:beginIndex "7625"^^xsd:nonNegativeInteger ; + nif:endIndex "7632"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "has" ; + nif:beginIndex "2776"^^xsd:nonNegativeInteger ; + nif:endIndex "2779"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "3394"^^xsd:nonNegativeInteger ; + nif:endIndex "3398"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "9110"^^xsd:nonNegativeInteger ; + nif:endIndex "9113"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "1747"^^xsd:nonNegativeInteger ; + nif:endIndex "1751"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "added" ; + nif:beginIndex "5725"^^xsd:nonNegativeInteger ; + nif:endIndex "5730"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "validity" ; + nif:beginIndex "7978"^^xsd:nonNegativeInteger ; + nif:endIndex "7986"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "said" ; + nif:beginIndex "8144"^^xsd:nonNegativeInteger ; + nif:endIndex "8148"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:Sentence , nif:OffsetBasedString ; + nif:anchorOf "The petition said that the couple has not lived together as man and wife for more than a year." ; + nif:beginIndex "4837"^^xsd:nonNegativeInteger ; + nif:endIndex "4931"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "county" ; + nif:beginIndex "11557"^^xsd:nonNegativeInteger ; + nif:endIndex "11563"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "was" ; + nif:beginIndex "11456"^^xsd:nonNegativeInteger ; + nif:endIndex "11459"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Georgia's" ; + nif:beginIndex "8074"^^xsd:nonNegativeInteger ; + nif:endIndex "8083"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Fulton legislators ``work with city officials to pass enabling legislation that will permit the establishment of a fair and equitable'' pension plan for city employes." ; + nif:beginIndex "3938"^^xsd:nonNegativeInteger ; + nif:endIndex "4105"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext ; + nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "jury" ; + nif:beginIndex "2940"^^xsd:nonNegativeInteger ; + nif:endIndex "2944"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "which" ; + nif:beginIndex "9391"^^xsd:nonNegativeInteger ; + nif:endIndex "9396"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "A" ; + nif:beginIndex "9751"^^xsd:nonNegativeInteger ; + nif:endIndex "9752"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "the" ; + nif:beginIndex "4289"^^xsd:nonNegativeInteger ; + nif:endIndex "4292"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Tabb" ; + nif:beginIndex "12103"^^xsd:nonNegativeInteger ; + nif:endIndex "12107"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "4511"^^xsd:nonNegativeInteger ; + nif:endIndex "4512"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "proposed" ; + nif:beginIndex "1309"^^xsd:nonNegativeInteger ; + nif:endIndex "1317"^^xsd:nonNegativeInteger ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "to" ; + nif:beginIndex "7828"^^xsd:nonNegativeInteger ; + nif:endIndex "7830"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "passed" ; + nif:beginIndex "10276"^^xsd:nonNegativeInteger ; + nif:endIndex "10282"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "," ; + nif:beginIndex "7003"^^xsd:nonNegativeInteger ; + nif:endIndex "7004"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "none" ; + nif:beginIndex "2337"^^xsd:nonNegativeInteger ; + nif:endIndex "2341"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Committee" ; + nif:beginIndex "227"^^xsd:nonNegativeInteger ; + nif:endIndex "236"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "this" ; + nif:beginIndex "1647"^^xsd:nonNegativeInteger ; + nif:endIndex "1651"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "city" ; + nif:beginIndex "4091"^^xsd:nonNegativeInteger ; + nif:endIndex "4095"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "in" ; + nif:beginIndex "1523"^^xsd:nonNegativeInteger ; + nif:endIndex "1525"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "''" ; + nif:beginIndex "117"^^xsd:nonNegativeInteger ; + nif:endIndex "119"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "counties" ; + nif:beginIndex "2480"^^xsd:nonNegativeInteger ; + nif:endIndex "2488"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "smooth" ; + nif:beginIndex "12183"^^xsd:nonNegativeInteger ; + nif:endIndex "12189"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "11957"^^xsd:nonNegativeInteger ; + nif:endIndex "11961"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "that" ; + nif:beginIndex "4013"^^xsd:nonNegativeInteger ; + nif:endIndex "4017"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:previousWord ; + nif:referenceContext ; + nif:sentence . + + + a nif:OffsetBasedString , nif:Word ; + nif:anchorOf "Merger" ; + nif:beginIndex "1302"^^xsd:nonNegativeInteger ; + nif:endIndex "1308"^^xsd:nonNegativeInteger ; + nif:nextWord ; + nif:referenceContext ; + nif:sentence . diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/brown/ref.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/brown/ref.ttl deleted file mode 100644 index 9e6d65c7a9..0000000000 --- a/dkpro-core-io-nif-asl/src/test/resources/nif/brown/ref.ttl +++ /dev/null @@ -1,23306 +0,0 @@ -@prefix rdf: . -@prefix owl: . -@prefix xsd: . -@prefix itsrdf: . -@prefix nif: . -@prefix rdfs: . - -rdf:rest a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:List ; - rdfs:range rdf:List ; - rdfs:subPropertyOf rdf:rest . - -rdf:List a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdf:List . - -rdf:predicate a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Statement ; - rdfs:subPropertyOf rdf:predicate . - -rdf:Property a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdf:Property . - -rdfs:label a rdf:Property , rdfs:Resource ; - rdfs:range rdfs:Literal . - -rdf:Statement a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdf:Statement . - -rdfs:Class a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdfs:Class . - -rdf:type a rdf:Property , rdfs:Resource ; - rdfs:range rdfs:Class . - -rdfs:Resource a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource . - -rdf:subject a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Statement ; - rdfs:subPropertyOf rdf:subject . - -rdf:XMLLiteral a rdfs:Datatype , rdfs:Class , rdfs:Resource . - -rdfs:comment a rdf:Property , rdfs:Resource ; - rdfs:range rdfs:Literal . - -rdfs:range a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Property ; - rdfs:range rdfs:Class . - -rdfs:subPropertyOf a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Property ; - rdfs:range rdf:Property . - -rdf:object a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Statement ; - rdfs:subPropertyOf rdf:object . - -rdf:nil a rdf:List , rdfs:Resource . - -rdfs:domain a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Property ; - rdfs:range rdfs:Class . - -rdfs:Literal a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdfs:Literal . - -rdf:first a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:List ; - rdfs:subPropertyOf rdf:first . - -rdfs:subClassOf a rdf:Property , rdfs:Resource ; - rdfs:domain rdfs:Class ; - rdfs:range rdfs:Class . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "6859"^^xsd:nonNegativeInteger ; - nif:endIndex "6862"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "both" ; - nif:beginIndex "1282"^^xsd:nonNegativeInteger ; - nif:endIndex "1286"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Allen" ; - nif:beginIndex "5289"^^xsd:nonNegativeInteger ; - nif:endIndex "5294"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "public" ; - nif:beginIndex "6807"^^xsd:nonNegativeInteger ; - nif:endIndex "6813"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "5416"^^xsd:nonNegativeInteger ; - nif:endIndex "5417"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "distribution" ; - nif:beginIndex "2404"^^xsd:nonNegativeInteger ; - nif:endIndex "2416"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "It listed his wife's age as 74 and place of birth as Opelika, Ala.." ; - nif:beginIndex "4769"^^xsd:nonNegativeInteger ; - nif:endIndex "4836"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "primary" ; - nif:beginIndex "78"^^xsd:nonNegativeInteger ; - nif:endIndex "85"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "7489"^^xsd:nonNegativeInteger ; - nif:endIndex "7491"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6733"^^xsd:nonNegativeInteger ; - nif:endIndex "6736"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "8394"^^xsd:nonNegativeInteger ; - nif:endIndex "8399"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "marked" ; - nif:beginIndex "11460"^^xsd:nonNegativeInteger ; - nif:endIndex "11466"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "cost" ; - nif:beginIndex "1437"^^xsd:nonNegativeInteger ; - nif:endIndex "1441"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Felix" ; - nif:beginIndex "12097"^^xsd:nonNegativeInteger ; - nif:endIndex "12102"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The jurors said they realize ``a proportionate distribution of these funds might disable this program in our less populous counties''." ; - nif:beginIndex "2357"^^xsd:nonNegativeInteger ; - nif:endIndex "2491"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "himself" ; - nif:beginIndex "12023"^^xsd:nonNegativeInteger ; - nif:endIndex "12030"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "8160"^^xsd:nonNegativeInteger ; - nif:endIndex "8162"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "received" ; - nif:beginIndex "10069"^^xsd:nonNegativeInteger ; - nif:endIndex "10077"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "problem" ; - nif:beginIndex "1652"^^xsd:nonNegativeInteger ; - nif:endIndex "1659"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "outgoing" ; - nif:beginIndex "1738"^^xsd:nonNegativeInteger ; - nif:endIndex "1746"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2605"^^xsd:nonNegativeInteger ; - nif:endIndex "2608"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "10238"^^xsd:nonNegativeInteger ; - nif:endIndex "10243"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bit" ; - nif:beginIndex "12228"^^xsd:nonNegativeInteger ; - nif:endIndex "12231"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "8547"^^xsd:nonNegativeInteger ; - nif:endIndex "8548"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "controversy" ; - nif:beginIndex "11470"^^xsd:nonNegativeInteger ; - nif:endIndex "11481"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "4"^^xsd:nonNegativeInteger ; - nif:endIndex "10"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "6113"^^xsd:nonNegativeInteger ; - nif:endIndex "6114"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "worth" ; - nif:beginIndex "7859"^^xsd:nonNegativeInteger ; - nif:endIndex "7864"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "303"^^xsd:nonNegativeInteger ; - nif:endIndex "306"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "interest" ; - nif:beginIndex "714"^^xsd:nonNegativeInteger ; - nif:endIndex "722"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "7389"^^xsd:nonNegativeInteger ; - nif:endIndex "7391"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "10282"^^xsd:nonNegativeInteger ; - nif:endIndex "10284"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf ")" ; - nif:beginIndex "3936"^^xsd:nonNegativeInteger ; - nif:endIndex "3937"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4104"^^xsd:nonNegativeInteger ; - nif:endIndex "4105"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "6466"^^xsd:nonNegativeInteger ; - nif:endIndex "6468"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "4482"^^xsd:nonNegativeInteger ; - nif:endIndex "4485"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "4819"^^xsd:nonNegativeInteger ; - nif:endIndex "4821"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "swipe" ; - nif:beginIndex "1929"^^xsd:nonNegativeInteger ; - nif:endIndex "1934"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "protect" ; - nif:beginIndex "3165"^^xsd:nonNegativeInteger ; - nif:endIndex "3172"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "1298"^^xsd:nonNegativeInteger ; - nif:endIndex "1300"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "1927"^^xsd:nonNegativeInteger ; - nif:endIndex "1928"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Republicans" ; - nif:beginIndex "6165"^^xsd:nonNegativeInteger ; - nif:endIndex "6176"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "3068"^^xsd:nonNegativeInteger ; - nif:endIndex "3069"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Atlanta" ; - nif:beginIndex "5077"^^xsd:nonNegativeInteger ; - nif:endIndex "5084"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "11692"^^xsd:nonNegativeInteger ; - nif:endIndex "11695"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "research" ; - nif:beginIndex "9055"^^xsd:nonNegativeInteger ; - nif:endIndex "9063"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "back" ; - nif:beginIndex "9824"^^xsd:nonNegativeInteger ; - nif:endIndex "9828"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "receives" ; - nif:beginIndex "2328"^^xsd:nonNegativeInteger ; - nif:endIndex "2336"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "6139"^^xsd:nonNegativeInteger ; - nif:endIndex "6140"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "6573"^^xsd:nonNegativeInteger ; - nif:endIndex "6575"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "inadequate" ; - nif:beginIndex "888"^^xsd:nonNegativeInteger ; - nif:endIndex "898"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10774"^^xsd:nonNegativeInteger ; - nif:endIndex "10775"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7370"^^xsd:nonNegativeInteger ; - nif:endIndex "7372"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "voters" ; - nif:beginIndex "754"^^xsd:nonNegativeInteger ; - nif:endIndex "760"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "assistance" ; - nif:beginIndex "2109"^^xsd:nonNegativeInteger ; - nif:endIndex "2119"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "7757"^^xsd:nonNegativeInteger ; - nif:endIndex "7758"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "into" ; - nif:beginIndex "10418"^^xsd:nonNegativeInteger ; - nif:endIndex "10422"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "days" ; - nif:beginIndex "11642"^^xsd:nonNegativeInteger ; - nif:endIndex "11646"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "grand" ; - nif:beginIndex "1058"^^xsd:nonNegativeInteger ; - nif:endIndex "1063"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "goes" ; - nif:beginIndex "5158"^^xsd:nonNegativeInteger ; - nif:endIndex "5162"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "must" ; - nif:beginIndex "6461"^^xsd:nonNegativeInteger ; - nif:endIndex "6465"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Davis received 1,119 votes in Saturday's election, and Bush got 402." ; - nif:beginIndex "11018"^^xsd:nonNegativeInteger ; - nif:endIndex "11086"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1938"^^xsd:nonNegativeInteger ; - nif:endIndex "1941"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "9075"^^xsd:nonNegativeInteger ; - nif:endIndex "9077"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "one" ; - nif:beginIndex "5104"^^xsd:nonNegativeInteger ; - nif:endIndex "5107"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "recommended" ; - nif:beginIndex "925"^^xsd:nonNegativeInteger ; - nif:endIndex "936"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "each" ; - nif:beginIndex "6509"^^xsd:nonNegativeInteger ; - nif:endIndex "6513"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "1054"^^xsd:nonNegativeInteger ; - nif:endIndex "1057"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "5084"^^xsd:nonNegativeInteger ; - nif:endIndex "5085"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "night" ; - nif:beginIndex "9039"^^xsd:nonNegativeInteger ; - nif:endIndex "9044"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7921"^^xsd:nonNegativeInteger ; - nif:endIndex "7923"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "L." ; - nif:beginIndex "4986"^^xsd:nonNegativeInteger ; - nif:endIndex "4988"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "4285"^^xsd:nonNegativeInteger ; - nif:endIndex "4288"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Rural" ; - nif:beginIndex "8349"^^xsd:nonNegativeInteger ; - nif:endIndex "8354"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "clerical" ; - nif:beginIndex "1538"^^xsd:nonNegativeInteger ; - nif:endIndex "1546"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "mayor's" ; - nif:beginIndex "5213"^^xsd:nonNegativeInteger ; - nif:endIndex "5220"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Robert" ; - nif:beginIndex "5552"^^xsd:nonNegativeInteger ; - nif:endIndex "5558"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "brought" ; - nif:beginIndex "5638"^^xsd:nonNegativeInteger ; - nif:endIndex "5645"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "5783"^^xsd:nonNegativeInteger ; - nif:endIndex "5785"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "1091"^^xsd:nonNegativeInteger ; - nif:endIndex "1093"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "State" ; - nif:beginIndex "2154"^^xsd:nonNegativeInteger ; - nif:endIndex "2159"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "10299"^^xsd:nonNegativeInteger ; - nif:endIndex "10303"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "cent" ; - nif:beginIndex "6487"^^xsd:nonNegativeInteger ; - nif:endIndex "6491"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "administration" ; - nif:beginIndex "4127"^^xsd:nonNegativeInteger ; - nif:endIndex "4141"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "got" ; - nif:beginIndex "11078"^^xsd:nonNegativeInteger ; - nif:endIndex "11081"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Grady" ; - nif:beginIndex "4270"^^xsd:nonNegativeInteger ; - nif:endIndex "4275"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "public" ; - nif:beginIndex "10023"^^xsd:nonNegativeInteger ; - nif:endIndex "10029"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "title" ; - nif:beginIndex "1700"^^xsd:nonNegativeInteger ; - nif:endIndex "1705"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Savannah" ; - nif:beginIndex "5802"^^xsd:nonNegativeInteger ; - nif:endIndex "5810"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Opelika" ; - nif:beginIndex "4822"^^xsd:nonNegativeInteger ; - nif:endIndex "4829"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "I" ; - nif:beginIndex "11213"^^xsd:nonNegativeInteger ; - nif:endIndex "11214"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "allow" ; - nif:beginIndex "9678"^^xsd:nonNegativeInteger ; - nif:endIndex "9683"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "priority" ; - nif:beginIndex "7714"^^xsd:nonNegativeInteger ; - nif:endIndex "7722"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "5780"^^xsd:nonNegativeInteger ; - nif:endIndex "5782"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Implementation of Georgia's automobile title law was also recommended by the outgoing jury." ; - nif:beginIndex "1661"^^xsd:nonNegativeInteger ; - nif:endIndex "1752"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11315"^^xsd:nonNegativeInteger ; - nif:endIndex "11316"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "term" ; - nif:beginIndex "6251"^^xsd:nonNegativeInteger ; - nif:endIndex "6255"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "won" ; - nif:beginIndex "574"^^xsd:nonNegativeInteger ; - nif:endIndex "577"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Barber" ; - nif:beginIndex "10109"^^xsd:nonNegativeInteger ; - nif:endIndex "10115"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3070"^^xsd:nonNegativeInteger ; - nif:endIndex "3073"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "similar" ; - nif:beginIndex "9430"^^xsd:nonNegativeInteger ; - nif:endIndex "9437"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "one" ; - nif:beginIndex "6430"^^xsd:nonNegativeInteger ; - nif:endIndex "6433"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "into" ; - nif:beginIndex "2988"^^xsd:nonNegativeInteger ; - nif:endIndex "2992"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Vandiver" ; - nif:beginIndex "7604"^^xsd:nonNegativeInteger ; - nif:endIndex "7612"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "federal" ; - nif:beginIndex "1981"^^xsd:nonNegativeInteger ; - nif:endIndex "1988"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "amicable" ; - nif:beginIndex "4640"^^xsd:nonNegativeInteger ; - nif:endIndex "4648"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "4559"^^xsd:nonNegativeInteger ; - nif:endIndex "4560"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "rescind" ; - nif:beginIndex "9566"^^xsd:nonNegativeInteger ; - nif:endIndex "9573"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "however" ; - nif:beginIndex "8740"^^xsd:nonNegativeInteger ; - nif:endIndex "8747"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "2342"^^xsd:nonNegativeInteger ; - nif:endIndex "2344"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "since" ; - nif:beginIndex "5125"^^xsd:nonNegativeInteger ; - nif:endIndex "5130"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "prison" ; - nif:beginIndex "4256"^^xsd:nonNegativeInteger ; - nif:endIndex "4262"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "permit" ; - nif:beginIndex "4023"^^xsd:nonNegativeInteger ; - nif:endIndex "4029"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "5123"^^xsd:nonNegativeInteger ; - nif:endIndex "5124"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Friday" ; - nif:beginIndex "10440"^^xsd:nonNegativeInteger ; - nif:endIndex "10446"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ivan" ; - nif:beginIndex "596"^^xsd:nonNegativeInteger ; - nif:endIndex "600"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "repair" ; - nif:beginIndex "8051"^^xsd:nonNegativeInteger ; - nif:endIndex "8057"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Legislature" ; - nif:beginIndex "1776"^^xsd:nonNegativeInteger ; - nif:endIndex "1787"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "His" ; - nif:beginIndex "5137"^^xsd:nonNegativeInteger ; - nif:endIndex "5140"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Bush" ; - nif:beginIndex "11073"^^xsd:nonNegativeInteger ; - nif:endIndex "11077"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "3743"^^xsd:nonNegativeInteger ; - nif:endIndex "3744"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Roads" ; - nif:beginIndex "8230"^^xsd:nonNegativeInteger ; - nif:endIndex "8235"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "10711"^^xsd:nonNegativeInteger ; - nif:endIndex "10714"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "trouble" ; - nif:beginIndex "11378"^^xsd:nonNegativeInteger ; - nif:endIndex "11385"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "8149"^^xsd:nonNegativeInteger ; - nif:endIndex "8154"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4226"^^xsd:nonNegativeInteger ; - nif:endIndex "4227"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3018"^^xsd:nonNegativeInteger ; - nif:endIndex "3021"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "9342"^^xsd:nonNegativeInteger ; - nif:endIndex "9347"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "$4" ; - nif:beginIndex "8204"^^xsd:nonNegativeInteger ; - nif:endIndex "8206"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "rural" ; - nif:beginIndex "7430"^^xsd:nonNegativeInteger ; - nif:endIndex "7435"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "rural" ; - nif:beginIndex "8256"^^xsd:nonNegativeInteger ; - nif:endIndex "8261"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1009"^^xsd:nonNegativeInteger ; - nif:endIndex "1012"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "279"^^xsd:nonNegativeInteger ; - nif:endIndex "280"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "begin" ; - nif:beginIndex "8023"^^xsd:nonNegativeInteger ; - nif:endIndex "8028"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "5326"^^xsd:nonNegativeInteger ; - nif:endIndex "5329"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "2846"^^xsd:nonNegativeInteger ; - nif:endIndex "2847"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The jury did not elaborate, but it added that ``there should be periodic surveillance of the pricing practices of the concessionaires for the purpose of keeping the prices reasonable''." ; - nif:beginIndex "3523"^^xsd:nonNegativeInteger ; - nif:endIndex "3708"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "816"^^xsd:nonNegativeInteger ; - nif:endIndex "820"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "urban" ; - nif:beginIndex "7440"^^xsd:nonNegativeInteger ; - nif:endIndex "7445"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "3657"^^xsd:nonNegativeInteger ; - nif:endIndex "3660"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "audience" ; - nif:beginIndex "5917"^^xsd:nonNegativeInteger ; - nif:endIndex "5925"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "10090"^^xsd:nonNegativeInteger ; - nif:endIndex "10092"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7974"^^xsd:nonNegativeInteger ; - nif:endIndex "7977"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8929"^^xsd:nonNegativeInteger ; - nif:endIndex "8932"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "This" ; - nif:beginIndex "2049"^^xsd:nonNegativeInteger ; - nif:endIndex "2053"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "J." ; - nif:beginIndex "4576"^^xsd:nonNegativeInteger ; - nif:endIndex "4578"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "an" ; - nif:beginIndex "7018"^^xsd:nonNegativeInteger ; - nif:endIndex "7020"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "went" ; - nif:beginIndex "12173"^^xsd:nonNegativeInteger ; - nif:endIndex "12177"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2228"^^xsd:nonNegativeInteger ; - nif:endIndex "2231"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "10776"^^xsd:nonNegativeInteger ; - nif:endIndex "10778"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "9473"^^xsd:nonNegativeInteger ; - nif:endIndex "9474"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Blue" ; - nif:beginIndex "5893"^^xsd:nonNegativeInteger ; - nif:endIndex "5897"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "1877"^^xsd:nonNegativeInteger ; - nif:endIndex "1879"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Griffin" ; - nif:beginIndex "4594"^^xsd:nonNegativeInteger ; - nif:endIndex "4601"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "City" ; - nif:beginIndex "212"^^xsd:nonNegativeInteger ; - nif:endIndex "216"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "candidates" ; - nif:beginIndex "6602"^^xsd:nonNegativeInteger ; - nif:endIndex "6612"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "12158"^^xsd:nonNegativeInteger ; - nif:endIndex "12159"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "11992"^^xsd:nonNegativeInteger ; - nif:endIndex "11994"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "after" ; - nif:beginIndex "5347"^^xsd:nonNegativeInteger ; - nif:endIndex "5352"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Vandiver opened his race for governor in 1958 with a battle in the Legislature against the issuance of $50 million worth of additional rural roads bonds proposed by then Gov. Marvin Griffin." ; - nif:beginIndex "8496"^^xsd:nonNegativeInteger ; - nif:endIndex "8686"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "3519"^^xsd:nonNegativeInteger ; - nif:endIndex "3521"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Mrs." ; - nif:beginIndex "4571"^^xsd:nonNegativeInteger ; - nif:endIndex "4575"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "5976"^^xsd:nonNegativeInteger ; - nif:endIndex "5981"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "employed" ; - nif:beginIndex "3883"^^xsd:nonNegativeInteger ; - nif:endIndex "3891"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "adjournment" ; - nif:beginIndex "7591"^^xsd:nonNegativeInteger ; - nif:endIndex "7602"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "4750"^^xsd:nonNegativeInteger ; - nif:endIndex "4753"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "George" ; - nif:beginIndex "11587"^^xsd:nonNegativeInteger ; - nif:endIndex "11593"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "8018"^^xsd:nonNegativeInteger ; - nif:endIndex "8022"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2524"^^xsd:nonNegativeInteger ; - nif:endIndex "2527"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "voted" ; - nif:beginIndex "9407"^^xsd:nonNegativeInteger ; - nif:endIndex "9412"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "department" ; - nif:beginIndex "8307"^^xsd:nonNegativeInteger ; - nif:endIndex "8317"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "4669"^^xsd:nonNegativeInteger ; - nif:endIndex "4672"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Tower" ; - nif:beginIndex "5850"^^xsd:nonNegativeInteger ; - nif:endIndex "5855"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Felix" ; - nif:beginIndex "10921"^^xsd:nonNegativeInteger ; - nif:endIndex "10926"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "largest" ; - nif:beginIndex "6321"^^xsd:nonNegativeInteger ; - nif:endIndex "6328"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "previous" ; - nif:beginIndex "3047"^^xsd:nonNegativeInteger ; - nif:endIndex "3055"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "6599"^^xsd:nonNegativeInteger ; - nif:endIndex "6601"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "1762"^^xsd:nonNegativeInteger ; - nif:endIndex "1766"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "phone" ; - nif:beginIndex "11502"^^xsd:nonNegativeInteger ; - nif:endIndex "11507"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7774"^^xsd:nonNegativeInteger ; - nif:endIndex "7777"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "medical" ; - nif:beginIndex "3855"^^xsd:nonNegativeInteger ; - nif:endIndex "3862"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "10220"^^xsd:nonNegativeInteger ; - nif:endIndex "10222"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "10493"^^xsd:nonNegativeInteger ; - nif:endIndex "10495"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "became" ; - nif:beginIndex "5304"^^xsd:nonNegativeInteger ; - nif:endIndex "5310"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "prices" ; - nif:beginIndex "3688"^^xsd:nonNegativeInteger ; - nif:endIndex "3694"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "he" ; - nif:beginIndex "11850"^^xsd:nonNegativeInteger ; - nif:endIndex "11852"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "6492"^^xsd:nonNegativeInteger ; - nif:endIndex "6494"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "taken" ; - nif:beginIndex "6469"^^xsd:nonNegativeInteger ; - nif:endIndex "6474"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "highway" ; - nif:beginIndex "7650"^^xsd:nonNegativeInteger ; - nif:endIndex "7657"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "5026"^^xsd:nonNegativeInteger ; - nif:endIndex "5029"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Republicans" ; - nif:beginIndex "5426"^^xsd:nonNegativeInteger ; - nif:endIndex "5437"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Court" ; - nif:beginIndex "4431"^^xsd:nonNegativeInteger ; - nif:endIndex "4436"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "443"^^xsd:nonNegativeInteger ; - nif:endIndex "449"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "city" ; - nif:beginIndex "5187"^^xsd:nonNegativeInteger ; - nif:endIndex "5191"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1734"^^xsd:nonNegativeInteger ; - nif:endIndex "1737"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "some" ; - nif:beginIndex "8185"^^xsd:nonNegativeInteger ; - nif:endIndex "8189"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "up" ; - nif:beginIndex "7192"^^xsd:nonNegativeInteger ; - nif:endIndex "7194"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "dissents" ; - nif:beginIndex "6307"^^xsd:nonNegativeInteger ; - nif:endIndex "6315"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "3827"^^xsd:nonNegativeInteger ; - nif:endIndex "3833"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "like" ; - nif:beginIndex "11295"^^xsd:nonNegativeInteger ; - nif:endIndex "11299"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Only" ; - nif:beginIndex "614"^^xsd:nonNegativeInteger ; - nif:endIndex "618"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "5762"^^xsd:nonNegativeInteger ; - nif:endIndex "5765"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Sam Caldwell, State Highway Department public relations director, resigned Tuesday to work for Lt. Gov. Garland Byrd's campaign." ; - nif:beginIndex "6768"^^xsd:nonNegativeInteger ; - nif:endIndex "6896"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "consulted" ; - nif:beginIndex "8776"^^xsd:nonNegativeInteger ; - nif:endIndex "8785"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "11514"^^xsd:nonNegativeInteger ; - nif:endIndex "11517"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "During" ; - nif:beginIndex "11717"^^xsd:nonNegativeInteger ; - nif:endIndex "11723"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "It urged that the next Legislature ``provide enabling funds and re-set the effective date so that an orderly implementation of the law may be effected''." ; - nif:beginIndex "1753"^^xsd:nonNegativeInteger ; - nif:endIndex "1906"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "Williams" ; - nif:beginIndex "11836"^^xsd:nonNegativeInteger ; - nif:endIndex "11844"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "services" ; - nif:beginIndex "2021"^^xsd:nonNegativeInteger ; - nif:endIndex "2029"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "10453"^^xsd:nonNegativeInteger ; - nif:endIndex "10455"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "yet" ; - nif:beginIndex "8786"^^xsd:nonNegativeInteger ; - nif:endIndex "8789"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "itself" ; - nif:beginIndex "8967"^^xsd:nonNegativeInteger ; - nif:endIndex "8973"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "long" ; - nif:beginIndex "10674"^^xsd:nonNegativeInteger ; - nif:endIndex "10678"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The jury praised the administration and operation of the Atlanta Police Department, the Fulton Tax Commissioner's Office, the Bellwood and Alpharetta prison farms, Grady Hospital and the Fulton Health Department." ; - nif:beginIndex "4106"^^xsd:nonNegativeInteger ; - nif:endIndex "4318"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "These" ; - nif:beginIndex "3135"^^xsd:nonNegativeInteger ; - nif:endIndex "3140"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "foster" ; - nif:beginIndex "2033"^^xsd:nonNegativeInteger ; - nif:endIndex "2039"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "1178"^^xsd:nonNegativeInteger ; - nif:endIndex "1180"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "lived" ; - nif:beginIndex "4879"^^xsd:nonNegativeInteger ; - nif:endIndex "4884"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "race" ; - nif:beginIndex "5515"^^xsd:nonNegativeInteger ; - nif:endIndex "5519"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "enter" ; - nif:beginIndex "6133"^^xsd:nonNegativeInteger ; - nif:endIndex "6138"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Rd." ; - nif:beginIndex "4972"^^xsd:nonNegativeInteger ; - nif:endIndex "4975"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "5590"^^xsd:nonNegativeInteger ; - nif:endIndex "5594"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "registered" ; - nif:beginIndex "6063"^^xsd:nonNegativeInteger ; - nif:endIndex "6073"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "subjected" ; - nif:beginIndex "11863"^^xsd:nonNegativeInteger ; - nif:endIndex "11872"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8796"^^xsd:nonNegativeInteger ; - nif:endIndex "8799"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "5059"^^xsd:nonNegativeInteger ; - nif:endIndex "5062"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "4412"^^xsd:nonNegativeInteger ; - nif:endIndex "4414"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1767"^^xsd:nonNegativeInteger ; - nif:endIndex "1770"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "He" ; - nif:beginIndex "6953"^^xsd:nonNegativeInteger ; - nif:endIndex "6955"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Pelham" ; - nif:beginIndex "8867"^^xsd:nonNegativeInteger ; - nif:endIndex "8873"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "11306"^^xsd:nonNegativeInteger ; - nif:endIndex "11308"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "provided" ; - nif:beginIndex "10041"^^xsd:nonNegativeInteger ; - nif:endIndex "10049"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "9515"^^xsd:nonNegativeInteger ; - nif:endIndex "9518"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "new" ; - nif:beginIndex "8400"^^xsd:nonNegativeInteger ; - nif:endIndex "8403"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Attorneys for the mayor said that an amicable property settlement has been agreed upon." ; - nif:beginIndex "4603"^^xsd:nonNegativeInteger ; - nif:endIndex "4690"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "10448"^^xsd:nonNegativeInteger ; - nif:endIndex "10452"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6045"^^xsd:nonNegativeInteger ; - nif:endIndex "6048"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "fire" ; - nif:beginIndex "2791"^^xsd:nonNegativeInteger ; - nif:endIndex "2795"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "2357"^^xsd:nonNegativeInteger ; - nif:endIndex "2360"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "city" ; - nif:beginIndex "1617"^^xsd:nonNegativeInteger ; - nif:endIndex "1621"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3443"^^xsd:nonNegativeInteger ; - nif:endIndex "3446"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "4737"^^xsd:nonNegativeInteger ; - nif:endIndex "4739"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "6224"^^xsd:nonNegativeInteger ; - nif:endIndex "6226"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Bush" ; - nif:beginIndex "11774"^^xsd:nonNegativeInteger ; - nif:endIndex "11778"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "costs" ; - nif:beginIndex "3224"^^xsd:nonNegativeInteger ; - nif:endIndex "3229"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced ``no evidence'' that any irregularities took place." ; - nif:beginIndex "0"^^xsd:nonNegativeInteger ; - nif:endIndex "155"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "both" ; - nif:beginIndex "7557"^^xsd:nonNegativeInteger ; - nif:endIndex "7561"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "generally" ; - nif:beginIndex "1217"^^xsd:nonNegativeInteger ; - nif:endIndex "1226"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4159"^^xsd:nonNegativeInteger ; - nif:endIndex "4162"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "ever" ; - nif:beginIndex "10804"^^xsd:nonNegativeInteger ; - nif:endIndex "10808"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Gov." ; - nif:beginIndex "6867"^^xsd:nonNegativeInteger ; - nif:endIndex "6871"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "4901"^^xsd:nonNegativeInteger ; - nif:endIndex "4904"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "school" ; - nif:beginIndex "10839"^^xsd:nonNegativeInteger ; - nif:endIndex "10845"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "6718"^^xsd:nonNegativeInteger ; - nif:endIndex "6719"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department" ; - nif:beginIndex "7792"^^xsd:nonNegativeInteger ; - nif:endIndex "7802"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "4734"^^xsd:nonNegativeInteger ; - nif:endIndex "4736"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf ")" ; - nif:beginIndex "10274"^^xsd:nonNegativeInteger ; - nif:endIndex "10275"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "sessions" ; - nif:beginIndex "9741"^^xsd:nonNegativeInteger ; - nif:endIndex "9749"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Monday" ; - nif:beginIndex "7212"^^xsd:nonNegativeInteger ; - nif:endIndex "7218"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "allowed" ; - nif:beginIndex "6576"^^xsd:nonNegativeInteger ; - nif:endIndex "6583"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "had" ; - nif:beginIndex "6920"^^xsd:nonNegativeInteger ; - nif:endIndex "6923"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "an" ; - nif:beginIndex "4637"^^xsd:nonNegativeInteger ; - nif:endIndex "4639"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "approve" ; - nif:beginIndex "7373"^^xsd:nonNegativeInteger ; - nif:endIndex "7380"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bonds" ; - nif:beginIndex "7891"^^xsd:nonNegativeInteger ; - nif:endIndex "7896"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "8722"^^xsd:nonNegativeInteger ; - nif:endIndex "8725"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "7308"^^xsd:nonNegativeInteger ; - nif:endIndex "7309"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "follow" ; - nif:beginIndex "7294"^^xsd:nonNegativeInteger ; - nif:endIndex "7300"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1961" ; - nif:beginIndex "7199"^^xsd:nonNegativeInteger ; - nif:endIndex "7203"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "788"^^xsd:nonNegativeInteger ; - nif:endIndex "789"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "being" ; - nif:beginIndex "9064"^^xsd:nonNegativeInteger ; - nif:endIndex "9069"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "effect" ; - nif:beginIndex "3188"^^xsd:nonNegativeInteger ; - nif:endIndex "3194"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9944"^^xsd:nonNegativeInteger ; - nif:endIndex "9947"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Colquitt" ; - nif:beginIndex "11227"^^xsd:nonNegativeInteger ; - nif:endIndex "11235"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Monday" ; - nif:beginIndex "10470"^^xsd:nonNegativeInteger ; - nif:endIndex "10476"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "whether" ; - nif:beginIndex "9078"^^xsd:nonNegativeInteger ; - nif:endIndex "9085"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "6727"^^xsd:nonNegativeInteger ; - nif:endIndex "6732"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Committee" ; - nif:beginIndex "11007"^^xsd:nonNegativeInteger ; - nif:endIndex "11016"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "1510"^^xsd:nonNegativeInteger ; - nif:endIndex "1512"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "exception" ; - nif:beginIndex "5091"^^xsd:nonNegativeInteger ; - nif:endIndex "5100"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "1935"^^xsd:nonNegativeInteger ; - nif:endIndex "1937"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Executive" ; - nif:beginIndex "10997"^^xsd:nonNegativeInteger ; - nif:endIndex "11006"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "5913"^^xsd:nonNegativeInteger ; - nif:endIndex "5916"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "3820"^^xsd:nonNegativeInteger ; - nif:endIndex "3826"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11762"^^xsd:nonNegativeInteger ; - nif:endIndex "11763"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "weekend" ; - nif:beginIndex "3906"^^xsd:nonNegativeInteger ; - nif:endIndex "3913"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "2386"^^xsd:nonNegativeInteger ; - nif:endIndex "2388"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "him" ; - nif:beginIndex "10353"^^xsd:nonNegativeInteger ; - nif:endIndex "10356"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Jan." ; - nif:beginIndex "3436"^^xsd:nonNegativeInteger ; - nif:endIndex "3440"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "saw" ; - nif:beginIndex "11220"^^xsd:nonNegativeInteger ; - nif:endIndex "11223"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "2622"^^xsd:nonNegativeInteger ; - nif:endIndex "2624"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "attended" ; - nif:beginIndex "6181"^^xsd:nonNegativeInteger ; - nif:endIndex "6189"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "He will be succeeded by Rob Ledford of Gainesville, who has been an assistant more than three years." ; - nif:beginIndex "6953"^^xsd:nonNegativeInteger ; - nif:endIndex "7053"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "339"^^xsd:nonNegativeInteger ; - nif:endIndex "342"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Barber" ; - nif:beginIndex "9915"^^xsd:nonNegativeInteger ; - nif:endIndex "9921"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "2632"^^xsd:nonNegativeInteger ; - nif:endIndex "2634"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Georgia" ; - nif:beginIndex "9610"^^xsd:nonNegativeInteger ; - nif:endIndex "9617"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "7219"^^xsd:nonNegativeInteger ; - nif:endIndex "7222"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department" ; - nif:beginIndex "8126"^^xsd:nonNegativeInteger ; - nif:endIndex "8136"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "10127"^^xsd:nonNegativeInteger ; - nif:endIndex "10130"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "received" ; - nif:beginIndex "11791"^^xsd:nonNegativeInteger ; - nif:endIndex "11799"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "9853"^^xsd:nonNegativeInteger ; - nif:endIndex "9854"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "1131"^^xsd:nonNegativeInteger ; - nif:endIndex "1134"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "listed" ; - nif:beginIndex "5000"^^xsd:nonNegativeInteger ; - nif:endIndex "5006"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "two" ; - nif:beginIndex "3043"^^xsd:nonNegativeInteger ; - nif:endIndex "3046"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "an" ; - nif:beginIndex "12068"^^xsd:nonNegativeInteger ; - nif:endIndex "12070"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "he" ; - nif:beginIndex "10290"^^xsd:nonNegativeInteger ; - nif:endIndex "10292"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "1846"^^xsd:nonNegativeInteger ; - nif:endIndex "1850"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "8410"^^xsd:nonNegativeInteger ; - nif:endIndex "8415"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "petition" ; - nif:beginIndex "5014"^^xsd:nonNegativeInteger ; - nif:endIndex "5022"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "platform" ; - nif:beginIndex "6758"^^xsd:nonNegativeInteger ; - nif:endIndex "6766"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Highway" ; - nif:beginIndex "8691"^^xsd:nonNegativeInteger ; - nif:endIndex "8698"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "more" ; - nif:beginIndex "7031"^^xsd:nonNegativeInteger ; - nif:endIndex "7035"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "place" ; - nif:beginIndex "4804"^^xsd:nonNegativeInteger ; - nif:endIndex "4809"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Carey" ; - nif:beginIndex "11096"^^xsd:nonNegativeInteger ; - nif:endIndex "11101"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "but" ; - nif:beginIndex "2146"^^xsd:nonNegativeInteger ; - nif:endIndex "2149"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "petition" ; - nif:beginIndex "4695"^^xsd:nonNegativeInteger ; - nif:endIndex "4703"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Pelham" ; - nif:beginIndex "4965"^^xsd:nonNegativeInteger ; - nif:endIndex "4971"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "widespread" ; - nif:beginIndex "703"^^xsd:nonNegativeInteger ; - nif:endIndex "713"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "It" ; - nif:beginIndex "10478"^^xsd:nonNegativeInteger ; - nif:endIndex "10480"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "1635"^^xsd:nonNegativeInteger ; - nif:endIndex "1637"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "work" ; - nif:beginIndex "7579"^^xsd:nonNegativeInteger ; - nif:endIndex "7583"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Pelham" ; - nif:beginIndex "9586"^^xsd:nonNegativeInteger ; - nif:endIndex "9592"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "6206"^^xsd:nonNegativeInteger ; - nif:endIndex "6209"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "6315"^^xsd:nonNegativeInteger ; - nif:endIndex "6316"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "property" ; - nif:beginIndex "4649"^^xsd:nonNegativeInteger ; - nif:endIndex "4657"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "have" ; - nif:beginIndex "11364"^^xsd:nonNegativeInteger ; - nif:endIndex "11368"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "2" ; - nif:beginIndex "4510"^^xsd:nonNegativeInteger ; - nif:endIndex "4511"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "chambers" ; - nif:beginIndex "7562"^^xsd:nonNegativeInteger ; - nif:endIndex "7570"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11635"^^xsd:nonNegativeInteger ; - nif:endIndex "11636"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "10107"^^xsd:nonNegativeInteger ; - nif:endIndex "10108"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "His petition charged mental cruelty." ; - nif:beginIndex "4445"^^xsd:nonNegativeInteger ; - nif:endIndex "4481"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "12191"^^xsd:nonNegativeInteger ; - nif:endIndex "12192"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "2770"^^xsd:nonNegativeInteger ; - nif:endIndex "2775"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "consistently" ; - nif:beginIndex "9872"^^xsd:nonNegativeInteger ; - nif:endIndex "9884"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Georgia" ; - nif:beginIndex "7162"^^xsd:nonNegativeInteger ; - nif:endIndex "7169"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "obtain" ; - nif:beginIndex "6038"^^xsd:nonNegativeInteger ; - nif:endIndex "6044"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "7701"^^xsd:nonNegativeInteger ; - nif:endIndex "7703"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10742"^^xsd:nonNegativeInteger ; - nif:endIndex "10743"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "are" ; - nif:beginIndex "872"^^xsd:nonNegativeInteger ; - nif:endIndex "875"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "1463"^^xsd:nonNegativeInteger ; - nif:endIndex "1466"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "featured" ; - nif:beginIndex "5868"^^xsd:nonNegativeInteger ; - nif:endIndex "5876"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "proportionate" ; - nif:beginIndex "2390"^^xsd:nonNegativeInteger ; - nif:endIndex "2403"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "hard-fought" ; - nif:beginIndex "544"^^xsd:nonNegativeInteger ; - nif:endIndex "555"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Williams" ; - nif:beginIndex "4391"^^xsd:nonNegativeInteger ; - nif:endIndex "4399"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "9265"^^xsd:nonNegativeInteger ; - nif:endIndex "9269"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "shortly" ; - nif:beginIndex "7301"^^xsd:nonNegativeInteger ; - nif:endIndex "7308"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4410"^^xsd:nonNegativeInteger ; - nif:endIndex "4411"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7532"^^xsd:nonNegativeInteger ; - nif:endIndex "7535"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4835"^^xsd:nonNegativeInteger ; - nif:endIndex "4836"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2150"^^xsd:nonNegativeInteger ; - nif:endIndex "2153"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "6951"^^xsd:nonNegativeInteger ; - nif:endIndex "6952"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Caldwell" ; - nif:beginIndex "6772"^^xsd:nonNegativeInteger ; - nif:endIndex "6780"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ridge" ; - nif:beginIndex "5898"^^xsd:nonNegativeInteger ; - nif:endIndex "5903"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "fully" ; - nif:beginIndex "9297"^^xsd:nonNegativeInteger ; - nif:endIndex "9302"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "actions" ; - nif:beginIndex "3141"^^xsd:nonNegativeInteger ; - nif:endIndex "3148"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "4691"^^xsd:nonNegativeInteger ; - nif:endIndex "4694"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "program" ; - nif:beginIndex "2451"^^xsd:nonNegativeInteger ; - nif:endIndex "2458"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "State Party Chairman James W. Dorsey added that enthusiasm was picking up for a state rally to be held Sept. 8 in Savannah at which newly elected Texas Sen. John Tower will be the featured speaker." ; - nif:beginIndex "5688"^^xsd:nonNegativeInteger ; - nif:endIndex "5885"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "petition" ; - nif:beginIndex "4449"^^xsd:nonNegativeInteger ; - nif:endIndex "4457"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Caldwell" ; - nif:beginIndex "7094"^^xsd:nonNegativeInteger ; - nif:endIndex "7102"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "succeeded" ; - nif:beginIndex "5271"^^xsd:nonNegativeInteger ; - nif:endIndex "5280"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "order" ; - nif:beginIndex "11165"^^xsd:nonNegativeInteger ; - nif:endIndex "11170"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "promised" ; - nif:beginIndex "12059"^^xsd:nonNegativeInteger ; - nif:endIndex "12067"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Commerce" ; - nif:beginIndex "9925"^^xsd:nonNegativeInteger ; - nif:endIndex "9933"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "should" ; - nif:beginIndex "2549"^^xsd:nonNegativeInteger ; - nif:endIndex "2555"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Party" ; - nif:beginIndex "5694"^^xsd:nonNegativeInteger ; - nif:endIndex "5699"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Williams" ; - nif:beginIndex "11250"^^xsd:nonNegativeInteger ; - nif:endIndex "11258"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "portion" ; - nif:beginIndex "8439"^^xsd:nonNegativeInteger ; - nif:endIndex "8446"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "1181"^^xsd:nonNegativeInteger ; - nif:endIndex "1185"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "improving" ; - nif:beginIndex "1036"^^xsd:nonNegativeInteger ; - nif:endIndex "1045"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "were" ; - nif:beginIndex "9288"^^xsd:nonNegativeInteger ; - nif:endIndex "9292"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "9177"^^xsd:nonNegativeInteger ; - nif:endIndex "9179"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "asking" ; - nif:beginIndex "9937"^^xsd:nonNegativeInteger ; - nif:endIndex "9943"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Many local citizens feared that there would be irregularities at the polls, and Williams got himself a permit to carry a gun and promised an orderly election." ; - nif:beginIndex "11930"^^xsd:nonNegativeInteger ; - nif:endIndex "12088"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "8887"^^xsd:nonNegativeInteger ; - nif:endIndex "8897"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "quiet" ; - nif:beginIndex "11437"^^xsd:nonNegativeInteger ; - nif:endIndex "11442"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "stood" ; - nif:beginIndex "11133"^^xsd:nonNegativeInteger ; - nif:endIndex "11138"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "pass" ; - nif:beginIndex "3987"^^xsd:nonNegativeInteger ; - nif:endIndex "3991"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "hot" ; - nif:beginIndex "10680"^^xsd:nonNegativeInteger ; - nif:endIndex "10683"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "night" ; - nif:beginIndex "3896"^^xsd:nonNegativeInteger ; - nif:endIndex "3901"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "His political career goes back to his election to city council in 1923." ; - nif:beginIndex "5137"^^xsd:nonNegativeInteger ; - nif:endIndex "5208"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , . - - - a nif:Sentence ; - nif:anchorOf "It says that ``in the event Congress does provide this increase in federal funds'', the State Board of Education should be directed to ``give priority'' to teacher pay raises." ; - nif:beginIndex "10478"^^xsd:nonNegativeInteger ; - nif:endIndex "10653"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "bonds" ; - nif:beginIndex "7742"^^xsd:nonNegativeInteger ; - nif:endIndex "7747"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "(" ; - nif:beginIndex "10259"^^xsd:nonNegativeInteger ; - nif:endIndex "10260"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "10952"^^xsd:nonNegativeInteger ; - nif:endIndex "10955"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Pelham" ; - nif:beginIndex "9315"^^xsd:nonNegativeInteger ; - nif:endIndex "9321"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "received" ; - nif:beginIndex "11024"^^xsd:nonNegativeInteger ; - nif:endIndex "11032"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "336"^^xsd:nonNegativeInteger ; - nif:endIndex "338"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "7999"^^xsd:nonNegativeInteger ; - nif:endIndex "8002"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11585"^^xsd:nonNegativeInteger ; - nif:endIndex "11586"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "raises" ; - nif:beginIndex "9711"^^xsd:nonNegativeInteger ; - nif:endIndex "9717"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "campaign" ; - nif:beginIndex "6887"^^xsd:nonNegativeInteger ; - nif:endIndex "6895"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "726"^^xsd:nonNegativeInteger ; - nif:endIndex "729"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "12232"^^xsd:nonNegativeInteger ; - nif:endIndex "12234"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "3037"^^xsd:nonNegativeInteger ; - nif:endIndex "3039"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "near" ; - nif:beginIndex "7811"^^xsd:nonNegativeInteger ; - nif:endIndex "7815"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "11145"^^xsd:nonNegativeInteger ; - nif:endIndex "11148"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "11467"^^xsd:nonNegativeInteger ; - nif:endIndex "11469"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "$10" ; - nif:beginIndex "8976"^^xsd:nonNegativeInteger ; - nif:endIndex "8979"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "or" ; - nif:beginIndex "3870"^^xsd:nonNegativeInteger ; - nif:endIndex "3872"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Georgia's" ; - nif:beginIndex "829"^^xsd:nonNegativeInteger ; - nif:endIndex "838"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6196"^^xsd:nonNegativeInteger ; - nif:endIndex "6199"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "3845"^^xsd:nonNegativeInteger ; - nif:endIndex "3846"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "protected" ; - nif:beginIndex "2926"^^xsd:nonNegativeInteger ; - nif:endIndex "2935"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2275"^^xsd:nonNegativeInteger ; - nif:endIndex "2278"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "announced" ; - nif:beginIndex "5370"^^xsd:nonNegativeInteger ; - nif:endIndex "5379"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "see" ; - nif:beginIndex "10252"^^xsd:nonNegativeInteger ; - nif:endIndex "10255"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Atlanta" ; - nif:beginIndex "3074"^^xsd:nonNegativeInteger ; - nif:endIndex "3081"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "8416"^^xsd:nonNegativeInteger ; - nif:endIndex "8418"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "He" ; - nif:beginIndex "5260"^^xsd:nonNegativeInteger ; - nif:endIndex "5262"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "plan" ; - nif:beginIndex "8165"^^xsd:nonNegativeInteger ; - nif:endIndex "8169"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11857"^^xsd:nonNegativeInteger ; - nif:endIndex "11858"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "2620"^^xsd:nonNegativeInteger ; - nif:endIndex "2621"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "2936"^^xsd:nonNegativeInteger ; - nif:endIndex "2939"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "3880"^^xsd:nonNegativeInteger ; - nif:endIndex "3882"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "1258"^^xsd:nonNegativeInteger ; - nif:endIndex "1260"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "1597"^^xsd:nonNegativeInteger ; - nif:endIndex "1598"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Byrd's" ; - nif:beginIndex "6880"^^xsd:nonNegativeInteger ; - nif:endIndex "6886"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "10558"^^xsd:nonNegativeInteger ; - nif:endIndex "10560"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "7148"^^xsd:nonNegativeInteger ; - nif:endIndex "7151"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "wife" ; - nif:beginIndex "4905"^^xsd:nonNegativeInteger ; - nif:endIndex "4909"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Schley County Rep. B. D. Pelham will offer a resolution Monday in the House to rescind the body's action of Friday in voting itself a $10 per day increase in expense allowances." ; - nif:beginIndex "8842"^^xsd:nonNegativeInteger ; - nif:endIndex "9019"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "urged" ; - nif:beginIndex "1602"^^xsd:nonNegativeInteger ; - nif:endIndex "1607"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "11223"^^xsd:nonNegativeInteger ; - nif:endIndex "11225"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "11139"^^xsd:nonNegativeInteger ; - nif:endIndex "11141"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "5811"^^xsd:nonNegativeInteger ; - nif:endIndex "5813"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "1557"^^xsd:nonNegativeInteger ; - nif:endIndex "1559"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "none" ; - nif:beginIndex "10304"^^xsd:nonNegativeInteger ; - nif:endIndex "10308"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "11276"^^xsd:nonNegativeInteger ; - nif:endIndex "11279"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "3673"^^xsd:nonNegativeInteger ; - nif:endIndex "3675"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "9045"^^xsd:nonNegativeInteger ; - nif:endIndex "9050"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "reports" ; - nif:beginIndex "646"^^xsd:nonNegativeInteger ; - nif:endIndex "653"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Roads" ; - nif:beginIndex "8355"^^xsd:nonNegativeInteger ; - nif:endIndex "8360"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "been" ; - nif:beginIndex "427"^^xsd:nonNegativeInteger ; - nif:endIndex "431"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "handling" ; - nif:beginIndex "1969"^^xsd:nonNegativeInteger ; - nif:endIndex "1977"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "unanimous" ; - nif:beginIndex "6115"^^xsd:nonNegativeInteger ; - nif:endIndex "6124"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11715"^^xsd:nonNegativeInteger ; - nif:endIndex "11716"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Ask jail deputies" ; - nif:beginIndex "3709"^^xsd:nonNegativeInteger ; - nif:endIndex "3726"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "4156"^^xsd:nonNegativeInteger ; - nif:endIndex "4158"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "391"^^xsd:nonNegativeInteger ; - nif:endIndex "394"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2959"^^xsd:nonNegativeInteger ; - nif:endIndex "2962"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "surveillance" ; - nif:beginIndex "3596"^^xsd:nonNegativeInteger ; - nif:endIndex "3608"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "per" ; - nif:beginIndex "8980"^^xsd:nonNegativeInteger ; - nif:endIndex "8983"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "317"^^xsd:nonNegativeInteger ; - nif:endIndex "320"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "8885"^^xsd:nonNegativeInteger ; - nif:endIndex "8886"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "But he added that none of Georgia's congressmen specifically asked him to offer the resolution." ; - nif:beginIndex "10286"^^xsd:nonNegativeInteger ; - nif:endIndex "10381"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "9718"^^xsd:nonNegativeInteger ; - nif:endIndex "9721"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "3523"^^xsd:nonNegativeInteger ; - nif:endIndex "3526"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "5010"^^xsd:nonNegativeInteger ; - nif:endIndex "5013"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "set" ; - nif:beginIndex "9356"^^xsd:nonNegativeInteger ; - nif:endIndex "9359"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "-- After a long, hot controversy, Miller County has a new school superintendent, elected, as a policeman put it, in the ``coolest election I ever saw in this county''." ; - nif:beginIndex "10663"^^xsd:nonNegativeInteger ; - nif:endIndex "10830"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "no" ; - nif:beginIndex "6304"^^xsd:nonNegativeInteger ; - nif:endIndex "6306"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "its" ; - nif:beginIndex "6754"^^xsd:nonNegativeInteger ; - nif:endIndex "6757"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "281"^^xsd:nonNegativeInteger ; - nif:endIndex "283"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "we" ; - nif:beginIndex "11354"^^xsd:nonNegativeInteger ; - nif:endIndex "11356"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "$100" ; - nif:beginIndex "7637"^^xsd:nonNegativeInteger ; - nif:endIndex "7641"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "6766"^^xsd:nonNegativeInteger ; - nif:endIndex "6767"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "been" ; - nif:beginIndex "4673"^^xsd:nonNegativeInteger ; - nif:endIndex "4677"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Meanwhile, it was learned the State Highway Department is very near being ready to issue the first $30 million worth of highway reconstruction bonds." ; - nif:beginIndex "7748"^^xsd:nonNegativeInteger ; - nif:endIndex "7897"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "time" ; - nif:beginIndex "6947"^^xsd:nonNegativeInteger ; - nif:endIndex "6951"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "2542"^^xsd:nonNegativeInteger ; - nif:endIndex "2548"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "praised" ; - nif:beginIndex "4115"^^xsd:nonNegativeInteger ; - nif:endIndex "4122"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "got" ; - nif:beginIndex "12019"^^xsd:nonNegativeInteger ; - nif:endIndex "12022"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "telephone" ; - nif:beginIndex "11810"^^xsd:nonNegativeInteger ; - nif:endIndex "11819"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "offer" ; - nif:beginIndex "10360"^^xsd:nonNegativeInteger ; - nif:endIndex "10365"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "9509"^^xsd:nonNegativeInteger ; - nif:endIndex "9514"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "9507"^^xsd:nonNegativeInteger ; - nif:endIndex "9508"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "10631"^^xsd:nonNegativeInteger ; - nif:endIndex "10633"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "534"^^xsd:nonNegativeInteger ; - nif:endIndex "536"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "believes" ; - nif:beginIndex "1344"^^xsd:nonNegativeInteger ; - nif:endIndex "1352"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "5263"^^xsd:nonNegativeInteger ; - nif:endIndex "5267"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Rep." ; - nif:beginIndex "8856"^^xsd:nonNegativeInteger ; - nif:endIndex "8860"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "2320"^^xsd:nonNegativeInteger ; - nif:endIndex "2321"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Jackson" ; - nif:beginIndex "9761"^^xsd:nonNegativeInteger ; - nif:endIndex "9768"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "1659"^^xsd:nonNegativeInteger ; - nif:endIndex "1660"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "11873"^^xsd:nonNegativeInteger ; - nif:endIndex "11875"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "some" ; - nif:beginIndex "2564"^^xsd:nonNegativeInteger ; - nif:endIndex "2568"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "provide" ; - nif:beginIndex "1790"^^xsd:nonNegativeInteger ; - nif:endIndex "1797"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "voters" ; - nif:beginIndex "6074"^^xsd:nonNegativeInteger ; - nif:endIndex "6080"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "10370"^^xsd:nonNegativeInteger ; - nif:endIndex "10380"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "8685"^^xsd:nonNegativeInteger ; - nif:endIndex "8686"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "career" ; - nif:beginIndex "5151"^^xsd:nonNegativeInteger ; - nif:endIndex "5157"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Sept." ; - nif:beginIndex "5330"^^xsd:nonNegativeInteger ; - nif:endIndex "5335"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "funds" ; - nif:beginIndex "2596"^^xsd:nonNegativeInteger ; - nif:endIndex "2601"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "departments" ; - nif:beginIndex "1160"^^xsd:nonNegativeInteger ; - nif:endIndex "1171"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "``Being at the polls was just like being at church." ; - nif:beginIndex "11265"^^xsd:nonNegativeInteger ; - nif:endIndex "11316"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "additional" ; - nif:beginIndex "8620"^^xsd:nonNegativeInteger ; - nif:endIndex "8630"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "governor" ; - nif:beginIndex "8525"^^xsd:nonNegativeInteger ; - nif:endIndex "8533"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "2719"^^xsd:nonNegativeInteger ; - nif:endIndex "2723"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "5864"^^xsd:nonNegativeInteger ; - nif:endIndex "5867"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "back" ; - nif:beginIndex "5163"^^xsd:nonNegativeInteger ; - nif:endIndex "5167"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "campaign" ; - nif:beginIndex "7077"^^xsd:nonNegativeInteger ; - nif:endIndex "7085"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "12040"^^xsd:nonNegativeInteger ; - nif:endIndex "12042"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "against" ; - nif:beginIndex "8575"^^xsd:nonNegativeInteger ; - nif:endIndex "8582"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "786"^^xsd:nonNegativeInteger ; - nif:endIndex "788"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Aug." ; - nif:beginIndex "4505"^^xsd:nonNegativeInteger ; - nif:endIndex "4509"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Cheshire" ; - nif:beginIndex "4582"^^xsd:nonNegativeInteger ; - nif:endIndex "4590"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "test" ; - nif:beginIndex "7969"^^xsd:nonNegativeInteger ; - nif:endIndex "7973"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "liquor" ; - nif:beginIndex "11342"^^xsd:nonNegativeInteger ; - nif:endIndex "11348"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Chairman" ; - nif:beginIndex "5700"^^xsd:nonNegativeInteger ; - nif:endIndex "5708"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Georgia" ; - nif:beginIndex "9800"^^xsd:nonNegativeInteger ; - nif:endIndex "9807"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "As of Sunday night, there was no word of a resolution being offered there to rescind the action." ; - nif:beginIndex "9489"^^xsd:nonNegativeInteger ; - nif:endIndex "9585"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "6780"^^xsd:nonNegativeInteger ; - nif:endIndex "6781"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "earlier" ; - nif:beginIndex "7678"^^xsd:nonNegativeInteger ; - nif:endIndex "7685"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "before" ; - nif:beginIndex "6402"^^xsd:nonNegativeInteger ; - nif:endIndex "6408"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "7156"^^xsd:nonNegativeInteger ; - nif:endIndex "7157"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "expected" ; - nif:beginIndex "7106"^^xsd:nonNegativeInteger ; - nif:endIndex "7114"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "His" ; - nif:beginIndex "4445"^^xsd:nonNegativeInteger ; - nif:endIndex "4448"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "been" ; - nif:beginIndex "5063"^^xsd:nonNegativeInteger ; - nif:endIndex "5067"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "starts" ; - nif:beginIndex "7086"^^xsd:nonNegativeInteger ; - nif:endIndex "7092"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "870"^^xsd:nonNegativeInteger ; - nif:endIndex "872"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "ask" ; - nif:beginIndex "9792"^^xsd:nonNegativeInteger ; - nif:endIndex "9795"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "1676"^^xsd:nonNegativeInteger ; - nif:endIndex "1678"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "next" ; - nif:beginIndex "1771"^^xsd:nonNegativeInteger ; - nif:endIndex "1775"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "become" ; - nif:beginIndex "7118"^^xsd:nonNegativeInteger ; - nif:endIndex "7124"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Constitution" ; - nif:beginIndex "8726"^^xsd:nonNegativeInteger ; - nif:endIndex "8738"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "3306"^^xsd:nonNegativeInteger ; - nif:endIndex "3310"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "guardians" ; - nif:beginIndex "2848"^^xsd:nonNegativeInteger ; - nif:endIndex "2857"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word , nif:Sentence ; - nif:anchorOf "Colquitt" ; - nif:beginIndex "10654"^^xsd:nonNegativeInteger ; - nif:endIndex "10662"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:sentence ; - nif:word . - - - a nif:Word ; - nif:anchorOf "must" ; - nif:beginIndex "6521"^^xsd:nonNegativeInteger ; - nif:endIndex "6525"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "wife's" ; - nif:beginIndex "4783"^^xsd:nonNegativeInteger ; - nif:endIndex "4789"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "2616"^^xsd:nonNegativeInteger ; - nif:endIndex "2620"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "support" ; - nif:beginIndex "10011"^^xsd:nonNegativeInteger ; - nif:endIndex "10018"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "``This was the coolest, calmest election I ever saw'', Colquitt Policeman Tom Williams said." ; - nif:beginIndex "11172"^^xsd:nonNegativeInteger ; - nif:endIndex "11264"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "re-set" ; - nif:beginIndex "1817"^^xsd:nonNegativeInteger ; - nif:endIndex "1823"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "12189"^^xsd:nonNegativeInteger ; - nif:endIndex "12191"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "from" ; - nif:beginIndex "4370"^^xsd:nonNegativeInteger ; - nif:endIndex "4374"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "our" ; - nif:beginIndex "10191"^^xsd:nonNegativeInteger ; - nif:endIndex "10194"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "3098"^^xsd:nonNegativeInteger ; - nif:endIndex "3101"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "3230"^^xsd:nonNegativeInteger ; - nif:endIndex "3233"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "638"^^xsd:nonNegativeInteger ; - nif:endIndex "640"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "funds" ; - nif:beginIndex "2214"^^xsd:nonNegativeInteger ; - nif:endIndex "2219"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "fund" ; - nif:beginIndex "8298"^^xsd:nonNegativeInteger ; - nif:endIndex "8302"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "further" ; - nif:beginIndex "165"^^xsd:nonNegativeInteger ; - nif:endIndex "172"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "668"^^xsd:nonNegativeInteger ; - nif:endIndex "669"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "with" ; - nif:beginIndex "5086"^^xsd:nonNegativeInteger ; - nif:endIndex "5090"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "governor's" ; - nif:beginIndex "5504"^^xsd:nonNegativeInteger ; - nif:endIndex "5514"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Friday" ; - nif:beginIndex "8950"^^xsd:nonNegativeInteger ; - nif:endIndex "8956"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "9532"^^xsd:nonNegativeInteger ; - nif:endIndex "9542"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "2516"^^xsd:nonNegativeInteger ; - nif:endIndex "2520"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "House" ; - nif:beginIndex "8912"^^xsd:nonNegativeInteger ; - nif:endIndex "8917"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "918"^^xsd:nonNegativeInteger ; - nif:endIndex "920"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Friday" ; - nif:beginIndex "34"^^xsd:nonNegativeInteger ; - nif:endIndex "40"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Republicans" ; - nif:beginIndex "6654"^^xsd:nonNegativeInteger ; - nif:endIndex "6665"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "with" ; - nif:beginIndex "11687"^^xsd:nonNegativeInteger ; - nif:endIndex "11691"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "counties" ; - nif:beginIndex "2263"^^xsd:nonNegativeInteger ; - nif:endIndex "2271"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "up" ; - nif:beginIndex "5759"^^xsd:nonNegativeInteger ; - nif:endIndex "5761"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Highway" ; - nif:beginIndex "6788"^^xsd:nonNegativeInteger ; - nif:endIndex "6795"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "6895"^^xsd:nonNegativeInteger ; - nif:endIndex "6896"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "airport" ; - nif:beginIndex "3447"^^xsd:nonNegativeInteger ; - nif:endIndex "3454"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "so" ; - nif:beginIndex "11434"^^xsd:nonNegativeInteger ; - nif:endIndex "11436"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "night" ; - nif:beginIndex "9502"^^xsd:nonNegativeInteger ; - nif:endIndex "9507"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "primary" ; - nif:beginIndex "6678"^^xsd:nonNegativeInteger ; - nif:endIndex "6685"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "approved" ; - nif:beginIndex "7280"^^xsd:nonNegativeInteger ; - nif:endIndex "7288"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "12049"^^xsd:nonNegativeInteger ; - nif:endIndex "12050"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "does" ; - nif:beginIndex "10515"^^xsd:nonNegativeInteger ; - nif:endIndex "10519"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "fees" ; - nif:beginIndex "2897"^^xsd:nonNegativeInteger ; - nif:endIndex "2901"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "4871"^^xsd:nonNegativeInteger ; - nif:endIndex "4874"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "about" ; - nif:beginIndex "8790"^^xsd:nonNegativeInteger ; - nif:endIndex "8795"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "candidate" ; - nif:beginIndex "6141"^^xsd:nonNegativeInteger ; - nif:endIndex "6150"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Vandiver" ; - nif:beginIndex "8754"^^xsd:nonNegativeInteger ; - nif:endIndex "8762"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "number" ; - nif:beginIndex "7396"^^xsd:nonNegativeInteger ; - nif:endIndex "7402"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "12160"^^xsd:nonNegativeInteger ; - nif:endIndex "12162"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10931"^^xsd:nonNegativeInteger ; - nif:endIndex "10932"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ala." ; - nif:beginIndex "4831"^^xsd:nonNegativeInteger ; - nif:endIndex "4835"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "source" ; - nif:beginIndex "8137"^^xsd:nonNegativeInteger ; - nif:endIndex "8143"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "58"^^xsd:nonNegativeInteger ; - nif:endIndex "60"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1433"^^xsd:nonNegativeInteger ; - nif:endIndex "1436"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "however" ; - nif:beginIndex "11444"^^xsd:nonNegativeInteger ; - nif:endIndex "11451"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Vandiver" ; - nif:beginIndex "8496"^^xsd:nonNegativeInteger ; - nif:endIndex "8504"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2259"^^xsd:nonNegativeInteger ; - nif:endIndex "2262"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "6938"^^xsd:nonNegativeInteger ; - nif:endIndex "6941"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "389"^^xsd:nonNegativeInteger ; - nif:endIndex "390"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "wait" ; - nif:beginIndex "6237"^^xsd:nonNegativeInteger ; - nif:endIndex "6241"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "10813"^^xsd:nonNegativeInteger ; - nif:endIndex "10815"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "4142"^^xsd:nonNegativeInteger ; - nif:endIndex "4145"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "10019"^^xsd:nonNegativeInteger ; - nif:endIndex "10022"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "officials" ; - nif:beginIndex "3974"^^xsd:nonNegativeInteger ; - nif:endIndex "3983"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "voting" ; - nif:beginIndex "6018"^^xsd:nonNegativeInteger ; - nif:endIndex "6024"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "voters" ; - nif:beginIndex "6499"^^xsd:nonNegativeInteger ; - nif:endIndex "6505"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Caldwell's" ; - nif:beginIndex "6897"^^xsd:nonNegativeInteger ; - nif:endIndex "6907"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "who" ; - nif:beginIndex "6177"^^xsd:nonNegativeInteger ; - nif:endIndex "6180"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "midnight" ; - nif:beginIndex "11493"^^xsd:nonNegativeInteger ; - nif:endIndex "11501"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Republicans" ; - nif:beginIndex "6561"^^xsd:nonNegativeInteger ; - nif:endIndex "6572"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "9426"^^xsd:nonNegativeInteger ; - nif:endIndex "9427"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "administration" ; - nif:beginIndex "1445"^^xsd:nonNegativeInteger ; - nif:endIndex "1459"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "3392"^^xsd:nonNegativeInteger ; - nif:endIndex "3394"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "rally" ; - nif:beginIndex "5774"^^xsd:nonNegativeInteger ; - nif:endIndex "5779"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "4910"^^xsd:nonNegativeInteger ; - nif:endIndex "4913"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "6189"^^xsd:nonNegativeInteger ; - nif:endIndex "6190"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "1512"^^xsd:nonNegativeInteger ; - nif:endIndex "1514"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11085"^^xsd:nonNegativeInteger ; - nif:endIndex "11086"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "principal" ; - nif:beginIndex "10942"^^xsd:nonNegativeInteger ; - nif:endIndex "10951"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "enthusiasm" ; - nif:beginIndex "5736"^^xsd:nonNegativeInteger ; - nif:endIndex "5746"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "4800"^^xsd:nonNegativeInteger ; - nif:endIndex "4803"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resignation" ; - nif:beginIndex "6908"^^xsd:nonNegativeInteger ; - nif:endIndex "6919"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7452"^^xsd:nonNegativeInteger ; - nif:endIndex "7454"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "have" ; - nif:beginIndex "4524"^^xsd:nonNegativeInteger ; - nif:endIndex "4528"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11745"^^xsd:nonNegativeInteger ; - nif:endIndex "11746"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10115"^^xsd:nonNegativeInteger ; - nif:endIndex "10116"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "took" ; - nif:beginIndex "1922"^^xsd:nonNegativeInteger ; - nif:endIndex "1926"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "3185"^^xsd:nonNegativeInteger ; - nif:endIndex "3187"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ledford" ; - nif:beginIndex "6981"^^xsd:nonNegativeInteger ; - nif:endIndex "6988"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "4018"^^xsd:nonNegativeInteger ; - nif:endIndex "4022"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Jail" ; - nif:beginIndex "3834"^^xsd:nonNegativeInteger ; - nif:endIndex "3838"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "11339"^^xsd:nonNegativeInteger ; - nif:endIndex "11341"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department" ; - nif:beginIndex "4307"^^xsd:nonNegativeInteger ; - nif:endIndex "4317"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department" ; - nif:beginIndex "4178"^^xsd:nonNegativeInteger ; - nif:endIndex "4188"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "9530"^^xsd:nonNegativeInteger ; - nif:endIndex "9531"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "$50" ; - nif:beginIndex "8599"^^xsd:nonNegativeInteger ; - nif:endIndex "8602"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "It" ; - nif:beginIndex "4769"^^xsd:nonNegativeInteger ; - nif:endIndex "4771"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "1380"^^xsd:nonNegativeInteger ; - nif:endIndex "1382"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "5480"^^xsd:nonNegativeInteger ; - nif:endIndex "5481"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "6109"^^xsd:nonNegativeInteger ; - nif:endIndex "6112"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "8840"^^xsd:nonNegativeInteger ; - nif:endIndex "8841"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "this" ; - nif:beginIndex "2638"^^xsd:nonNegativeInteger ; - nif:endIndex "2642"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "increased" ; - nif:beginIndex "9993"^^xsd:nonNegativeInteger ; - nif:endIndex "10002"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "no" ; - nif:beginIndex "9519"^^xsd:nonNegativeInteger ; - nif:endIndex "9521"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "notice" ; - nif:beginIndex "9159"^^xsd:nonNegativeInteger ; - nif:endIndex "9165"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "mayor's" ; - nif:beginIndex "5030"^^xsd:nonNegativeInteger ; - nif:endIndex "5037"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2817"^^xsd:nonNegativeInteger ; - nif:endIndex "2820"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "6130"^^xsd:nonNegativeInteger ; - nif:endIndex "6132"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "must" ; - nif:beginIndex "6666"^^xsd:nonNegativeInteger ; - nif:endIndex "6670"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "4051"^^xsd:nonNegativeInteger ; - nif:endIndex "4052"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "ready" ; - nif:beginIndex "7822"^^xsd:nonNegativeInteger ; - nif:endIndex "7827"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "1064"^^xsd:nonNegativeInteger ; - nif:endIndex "1068"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "4996"^^xsd:nonNegativeInteger ; - nif:endIndex "4999"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "per" ; - nif:beginIndex "6483"^^xsd:nonNegativeInteger ; - nif:endIndex "6486"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Miller" ; - nif:beginIndex "10972"^^xsd:nonNegativeInteger ; - nif:endIndex "10978"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "read" ; - nif:beginIndex "10465"^^xsd:nonNegativeInteger ; - nif:endIndex "10469"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "9893"^^xsd:nonNegativeInteger ; - nif:endIndex "9895"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "5988"^^xsd:nonNegativeInteger ; - nif:endIndex "5990"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "armed" ; - nif:beginIndex "11112"^^xsd:nonNegativeInteger ; - nif:endIndex "11117"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "out" ; - nif:beginIndex "6009"^^xsd:nonNegativeInteger ; - nif:endIndex "6012"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "told" ; - nif:beginIndex "8717"^^xsd:nonNegativeInteger ; - nif:endIndex "8721"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "serve" ; - nif:beginIndex "3156"^^xsd:nonNegativeInteger ; - nif:endIndex "3161"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11603"^^xsd:nonNegativeInteger ; - nif:endIndex "11604"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "7941"^^xsd:nonNegativeInteger ; - nif:endIndex "7944"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "mayor" ; - nif:beginIndex "5068"^^xsd:nonNegativeInteger ; - nif:endIndex "5073"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "keeping" ; - nif:beginIndex "3676"^^xsd:nonNegativeInteger ; - nif:endIndex "3683"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "session" ; - nif:beginIndex "7204"^^xsd:nonNegativeInteger ; - nif:endIndex "7211"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "8521"^^xsd:nonNegativeInteger ; - nif:endIndex "8524"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "cruelty" ; - nif:beginIndex "4473"^^xsd:nonNegativeInteger ; - nif:endIndex "4480"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "937"^^xsd:nonNegativeInteger ; - nif:endIndex "941"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "I" ; - nif:beginIndex "11317"^^xsd:nonNegativeInteger ; - nif:endIndex "11318"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10906"^^xsd:nonNegativeInteger ; - nif:endIndex "10907"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "9922"^^xsd:nonNegativeInteger ; - nif:endIndex "9924"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "7228"^^xsd:nonNegativeInteger ; - nif:endIndex "7231"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "354"^^xsd:nonNegativeInteger ; - nif:endIndex "356"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "2603"^^xsd:nonNegativeInteger ; - nif:endIndex "2604"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "criticisms" ; - nif:beginIndex "3288"^^xsd:nonNegativeInteger ; - nif:endIndex "3298"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "enter" ; - nif:beginIndex "5474"^^xsd:nonNegativeInteger ; - nif:endIndex "5479"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "grand" ; - nif:beginIndex "3056"^^xsd:nonNegativeInteger ; - nif:endIndex "3061"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "12211"^^xsd:nonNegativeInteger ; - nif:endIndex "12213"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "from" ; - nif:beginIndex "5669"^^xsd:nonNegativeInteger ; - nif:endIndex "5673"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department" ; - nif:beginIndex "1483"^^xsd:nonNegativeInteger ; - nif:endIndex "1493"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "all" ; - nif:beginIndex "2255"^^xsd:nonNegativeInteger ; - nif:endIndex "2258"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "7803"^^xsd:nonNegativeInteger ; - nif:endIndex "7805"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "271"^^xsd:nonNegativeInteger ; - nif:endIndex "279"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "3839"^^xsd:nonNegativeInteger ; - nif:endIndex "3842"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "238"^^xsd:nonNegativeInteger ; - nif:endIndex "243"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Pearl" ; - nif:beginIndex "4385"^^xsd:nonNegativeInteger ; - nif:endIndex "4390"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4383"^^xsd:nonNegativeInteger ; - nif:endIndex "4384"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "study" ; - nif:beginIndex "7383"^^xsd:nonNegativeInteger ; - nif:endIndex "7388"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "make" ; - nif:beginIndex "6259"^^xsd:nonNegativeInteger ; - nif:endIndex "6263"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "birth" ; - nif:beginIndex "4813"^^xsd:nonNegativeInteger ; - nif:endIndex "4818"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "2697"^^xsd:nonNegativeInteger ; - nif:endIndex "2703"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "10386"^^xsd:nonNegativeInteger ; - nif:endIndex "10396"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "details" ; - nif:beginIndex "9280"^^xsd:nonNegativeInteger ; - nif:endIndex "9287"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "face" ; - nif:beginIndex "6366"^^xsd:nonNegativeInteger ; - nif:endIndex "6370"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "when" ; - nif:beginIndex "3399"^^xsd:nonNegativeInteger ; - nif:endIndex "3403"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "But" ; - nif:beginIndex "10286"^^xsd:nonNegativeInteger ; - nif:endIndex "10289"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "man" ; - nif:beginIndex "4897"^^xsd:nonNegativeInteger ; - nif:endIndex "4900"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "11069"^^xsd:nonNegativeInteger ; - nif:endIndex "11072"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "1052"^^xsd:nonNegativeInteger ; - nif:endIndex "1053"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "10260"^^xsd:nonNegativeInteger ; - nif:endIndex "10263"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "540"^^xsd:nonNegativeInteger ; - nif:endIndex "543"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "4932"^^xsd:nonNegativeInteger ; - nif:endIndex "4935"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "9557"^^xsd:nonNegativeInteger ; - nif:endIndex "9562"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "236"^^xsd:nonNegativeInteger ; - nif:endIndex "237"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "7571"^^xsd:nonNegativeInteger ; - nif:endIndex "7573"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "names" ; - nif:beginIndex "6593"^^xsd:nonNegativeInteger ; - nif:endIndex "6598"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "effected" ; - nif:beginIndex "1895"^^xsd:nonNegativeInteger ; - nif:endIndex "1903"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "visit" ; - nif:beginIndex "7548"^^xsd:nonNegativeInteger ; - nif:endIndex "7553"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "71" ; - nif:beginIndex "4765"^^xsd:nonNegativeInteger ; - nif:endIndex "4767"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "crowd" ; - nif:beginIndex "6200"^^xsd:nonNegativeInteger ; - nif:endIndex "6205"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "enabling" ; - nif:beginIndex "1798"^^xsd:nonNegativeInteger ; - nif:endIndex "1806"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "reconstruction" ; - nif:beginIndex "7876"^^xsd:nonNegativeInteger ; - nif:endIndex "7890"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11451"^^xsd:nonNegativeInteger ; - nif:endIndex "11452"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "3569"^^xsd:nonNegativeInteger ; - nif:endIndex "3571"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "laws" ; - nif:beginIndex "865"^^xsd:nonNegativeInteger ; - nif:endIndex "869"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "637" ; - nif:beginIndex "4958"^^xsd:nonNegativeInteger ; - nif:endIndex "4961"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "12006"^^xsd:nonNegativeInteger ; - nif:endIndex "12009"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Tuesday" ; - nif:beginIndex "6843"^^xsd:nonNegativeInteger ; - nif:endIndex "6850"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "homes" ; - nif:beginIndex "2040"^^xsd:nonNegativeInteger ; - nif:endIndex "2045"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "5298"^^xsd:nonNegativeInteger ; - nif:endIndex "5299"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "reasonable" ; - nif:beginIndex "3695"^^xsd:nonNegativeInteger ; - nif:endIndex "3705"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "enthusiastic" ; - nif:beginIndex "5646"^^xsd:nonNegativeInteger ; - nif:endIndex "5658"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Commissioner's" ; - nif:beginIndex "4205"^^xsd:nonNegativeInteger ; - nif:endIndex "4219"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "chairman" ; - nif:beginIndex "10956"^^xsd:nonNegativeInteger ; - nif:endIndex "10964"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "something" ; - nif:beginIndex "9855"^^xsd:nonNegativeInteger ; - nif:endIndex "9864"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3371"^^xsd:nonNegativeInteger ; - nif:endIndex "3374"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "considering" ; - nif:beginIndex "687"^^xsd:nonNegativeInteger ; - nif:endIndex "698"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "system" ; - nif:beginIndex "6708"^^xsd:nonNegativeInteger ; - nif:endIndex "6714"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Legislature" ; - nif:beginIndex "7170"^^xsd:nonNegativeInteger ; - nif:endIndex "7181"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The jury said it did find that many of Georgia's registration and election laws ``are outmoded or inadequate and often ambiguous''." ; - nif:beginIndex "790"^^xsd:nonNegativeInteger ; - nif:endIndex "921"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "its" ; - nif:beginIndex "7195"^^xsd:nonNegativeInteger ; - nif:endIndex "7198"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "divorce" ; - nif:beginIndex "4362"^^xsd:nonNegativeInteger ; - nif:endIndex "4369"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The couple was married Aug. 2, 1913." ; - nif:beginIndex "4482"^^xsd:nonNegativeInteger ; - nif:endIndex "4518"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "anonymous" ; - nif:beginIndex "11876"^^xsd:nonNegativeInteger ; - nif:endIndex "11885"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11825"^^xsd:nonNegativeInteger ; - nif:endIndex "11826"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8908"^^xsd:nonNegativeInteger ; - nif:endIndex "8911"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "4087"^^xsd:nonNegativeInteger ; - nif:endIndex "4090"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "opposes" ; - nif:beginIndex "6743"^^xsd:nonNegativeInteger ; - nif:endIndex "6750"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "10610"^^xsd:nonNegativeInteger ; - nif:endIndex "10612"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "730"^^xsd:nonNegativeInteger ; - nif:endIndex "738"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "are" ; - nif:beginIndex "5438"^^xsd:nonNegativeInteger ; - nif:endIndex "5441"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "7913"^^xsd:nonNegativeInteger ; - nif:endIndex "7917"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Hartsfield" ; - nif:beginIndex "4336"^^xsd:nonNegativeInteger ; - nif:endIndex "4346"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "first" ; - nif:beginIndex "7841"^^xsd:nonNegativeInteger ; - nif:endIndex "7846"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10156"^^xsd:nonNegativeInteger ; - nif:endIndex "10157"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "attorney" ; - nif:beginIndex "5038"^^xsd:nonNegativeInteger ; - nif:endIndex "5046"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "such" ; - nif:beginIndex "10055"^^xsd:nonNegativeInteger ; - nif:endIndex "10059"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "4493"^^xsd:nonNegativeInteger ; - nif:endIndex "4496"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "grand" ; - nif:beginIndex "1911"^^xsd:nonNegativeInteger ; - nif:endIndex "1916"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "follow" ; - nif:beginIndex "1210"^^xsd:nonNegativeInteger ; - nif:endIndex "1216"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "7403"^^xsd:nonNegativeInteger ; - nif:endIndex "7405"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Byrd" ; - nif:beginIndex "7152"^^xsd:nonNegativeInteger ; - nif:endIndex "7156"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "5991"^^xsd:nonNegativeInteger ; - nif:endIndex "5993"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "age" ; - nif:beginIndex "4758"^^xsd:nonNegativeInteger ; - nif:endIndex "4761"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "3315"^^xsd:nonNegativeInteger ; - nif:endIndex "3316"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "topics" ; - nif:beginIndex "1100"^^xsd:nonNegativeInteger ; - nif:endIndex "1106"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Executive" ; - nif:beginIndex "217"^^xsd:nonNegativeInteger ; - nif:endIndex "226"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Bowden" ; - nif:beginIndex "4989"^^xsd:nonNegativeInteger ; - nif:endIndex "4995"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "personnel" ; - nif:beginIndex "1577"^^xsd:nonNegativeInteger ; - nif:endIndex "1586"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "8047"^^xsd:nonNegativeInteger ; - nif:endIndex "8050"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "6293"^^xsd:nonNegativeInteger ; - nif:endIndex "6298"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "154"^^xsd:nonNegativeInteger ; - nif:endIndex "155"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "6506"^^xsd:nonNegativeInteger ; - nif:endIndex "6508"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "petitions" ; - nif:beginIndex "6531"^^xsd:nonNegativeInteger ; - nif:endIndex "6540"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "courts" ; - nif:beginIndex "7934"^^xsd:nonNegativeInteger ; - nif:endIndex "7940"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "legislators" ; - nif:beginIndex "949"^^xsd:nonNegativeInteger ; - nif:endIndex "960"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "6613"^^xsd:nonNegativeInteger ; - nif:endIndex "6615"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4534"^^xsd:nonNegativeInteger ; - nif:endIndex "4535"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "House" ; - nif:beginIndex "9808"^^xsd:nonNegativeInteger ; - nif:endIndex "9813"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "suit" ; - nif:beginIndex "4353"^^xsd:nonNegativeInteger ; - nif:endIndex "4357"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "settlement" ; - nif:beginIndex "4658"^^xsd:nonNegativeInteger ; - nif:endIndex "4668"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "are" ; - nif:beginIndex "1188"^^xsd:nonNegativeInteger ; - nif:endIndex "1191"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "veteran" ; - nif:beginIndex "10879"^^xsd:nonNegativeInteger ; - nif:endIndex "10886"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "last" ; - nif:beginIndex "9625"^^xsd:nonNegativeInteger ; - nif:endIndex "9629"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "5937"^^xsd:nonNegativeInteger ; - nif:endIndex "5941"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Monday" ; - nif:beginIndex "8898"^^xsd:nonNegativeInteger ; - nif:endIndex "8904"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Authority" ; - nif:beginIndex "8236"^^xsd:nonNegativeInteger ; - nif:endIndex "8245"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "interim" ; - nif:beginIndex "3105"^^xsd:nonNegativeInteger ; - nif:endIndex "3112"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ridge" ; - nif:beginIndex "5632"^^xsd:nonNegativeInteger ; - nif:endIndex "5637"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "A similar resolution passed in the Senate by a vote of 29-5." ; - nif:beginIndex "9428"^^xsd:nonNegativeInteger ; - nif:endIndex "9488"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "Dorsey" ; - nif:beginIndex "5718"^^xsd:nonNegativeInteger ; - nif:endIndex "5724"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "12209"^^xsd:nonNegativeInteger ; - nif:endIndex "12210"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "9841"^^xsd:nonNegativeInteger ; - nif:endIndex "9843"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "wife" ; - nif:beginIndex "4379"^^xsd:nonNegativeInteger ; - nif:endIndex "4383"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "8947"^^xsd:nonNegativeInteger ; - nif:endIndex "8949"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "tax" ; - nif:beginIndex "8479"^^xsd:nonNegativeInteger ; - nif:endIndex "8482"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "operated" ; - nif:beginIndex "3458"^^xsd:nonNegativeInteger ; - nif:endIndex "3466"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8450"^^xsd:nonNegativeInteger ; - nif:endIndex "8453"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "666"^^xsd:nonNegativeInteger ; - nif:endIndex "668"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "It" ; - nif:beginIndex "11453"^^xsd:nonNegativeInteger ; - nif:endIndex "11455"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "6274"^^xsd:nonNegativeInteger ; - nif:endIndex "6276"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "10366"^^xsd:nonNegativeInteger ; - nif:endIndex "10369"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "3801"^^xsd:nonNegativeInteger ; - nif:endIndex "3803"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1613"^^xsd:nonNegativeInteger ; - nif:endIndex "1616"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "ordinary's" ; - nif:beginIndex "2753"^^xsd:nonNegativeInteger ; - nif:endIndex "2763"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11387"^^xsd:nonNegativeInteger ; - nif:endIndex "11388"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "irregularities" ; - nif:beginIndex "11977"^^xsd:nonNegativeInteger ; - nif:endIndex "11991"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "more" ; - nif:beginIndex "4914"^^xsd:nonNegativeInteger ; - nif:endIndex "4918"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "should" ; - nif:beginIndex "3577"^^xsd:nonNegativeInteger ; - nif:endIndex "3583"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "directed" ; - nif:beginIndex "10601"^^xsd:nonNegativeInteger ; - nif:endIndex "10609"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "4894"^^xsd:nonNegativeInteger ; - nif:endIndex "4896"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "8849"^^xsd:nonNegativeInteger ; - nif:endIndex "8855"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "leading" ; - nif:beginIndex "11402"^^xsd:nonNegativeInteger ; - nif:endIndex "11409"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "possible" ; - nif:beginIndex "509"^^xsd:nonNegativeInteger ; - nif:endIndex "517"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Lt." ; - nif:beginIndex "6863"^^xsd:nonNegativeInteger ; - nif:endIndex "6866"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "3040"^^xsd:nonNegativeInteger ; - nif:endIndex "3042"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "afternoon" ; - nif:beginIndex "7336"^^xsd:nonNegativeInteger ; - nif:endIndex "7345"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "we" ; - nif:beginIndex "2508"^^xsd:nonNegativeInteger ; - nif:endIndex "2510"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "word" ; - nif:beginIndex "9522"^^xsd:nonNegativeInteger ; - nif:endIndex "9526"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "expected" ; - nif:beginIndex "7515"^^xsd:nonNegativeInteger ; - nif:endIndex "7523"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "number" ; - nif:beginIndex "744"^^xsd:nonNegativeInteger ; - nif:endIndex "750"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "worked" ; - nif:beginIndex "9303"^^xsd:nonNegativeInteger ; - nif:endIndex "9309"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "3479"^^xsd:nonNegativeInteger ; - nif:endIndex "3483"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "lacking" ; - nif:beginIndex "1515"^^xsd:nonNegativeInteger ; - nif:endIndex "1522"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "2094"^^xsd:nonNegativeInteger ; - nif:endIndex "2100"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "over-all" ; - nif:beginIndex "248"^^xsd:nonNegativeInteger ; - nif:endIndex "256"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4190"^^xsd:nonNegativeInteger ; - nif:endIndex "4193"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "723"^^xsd:nonNegativeInteger ; - nif:endIndex "725"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "685"^^xsd:nonNegativeInteger ; - nif:endIndex "687"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "primary" ; - nif:beginIndex "5339"^^xsd:nonNegativeInteger ; - nif:endIndex "5346"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "9584"^^xsd:nonNegativeInteger ; - nif:endIndex "9585"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "7620"^^xsd:nonNegativeInteger ; - nif:endIndex "7624"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "2181"^^xsd:nonNegativeInteger ; - nif:endIndex "2184"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "4794"^^xsd:nonNegativeInteger ; - nif:endIndex "4796"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Blue" ; - nif:beginIndex "5627"^^xsd:nonNegativeInteger ; - nif:endIndex "5631"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "construction" ; - nif:beginIndex "8267"^^xsd:nonNegativeInteger ; - nif:endIndex "8279"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "extern" ; - nif:beginIndex "3873"^^xsd:nonNegativeInteger ; - nif:endIndex "3879"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "794"^^xsd:nonNegativeInteger ; - nif:endIndex "798"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "did" ; - nif:beginIndex "3532"^^xsd:nonNegativeInteger ; - nif:endIndex "3535"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "12147"^^xsd:nonNegativeInteger ; - nif:endIndex "12150"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Hartsfield" ; - nif:beginIndex "5048"^^xsd:nonNegativeInteger ; - nif:endIndex "5058"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "518"^^xsd:nonNegativeInteger ; - nif:endIndex "520"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "effective" ; - nif:beginIndex "1828"^^xsd:nonNegativeInteger ; - nif:endIndex "1837"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "4754"^^xsd:nonNegativeInteger ; - nif:endIndex "4757"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "contracts" ; - nif:beginIndex "8033"^^xsd:nonNegativeInteger ; - nif:endIndex "8042"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "852"^^xsd:nonNegativeInteger ; - nif:endIndex "855"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "out" ; - nif:beginIndex "9601"^^xsd:nonNegativeInteger ; - nif:endIndex "9604"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "City" ; - nif:beginIndex "1467"^^xsd:nonNegativeInteger ; - nif:endIndex "1471"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Robert Snodgrass, state GOP chairman, said a meeting held Tuesday night in Blue Ridge brought enthusiastic responses from the audience." ; - nif:beginIndex "5552"^^xsd:nonNegativeInteger ; - nif:endIndex "5687"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "6956"^^xsd:nonNegativeInteger ; - nif:endIndex "6960"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Jury" ; - nif:beginIndex "24"^^xsd:nonNegativeInteger ; - nif:endIndex "28"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "10861"^^xsd:nonNegativeInteger ; - nif:endIndex "10863"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "$3" ; - nif:beginIndex "8190"^^xsd:nonNegativeInteger ; - nif:endIndex "8192"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "let" ; - nif:beginIndex "8043"^^xsd:nonNegativeInteger ; - nif:endIndex "8046"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "7997"^^xsd:nonNegativeInteger ; - nif:endIndex "7998"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "friendly" ; - nif:beginIndex "7947"^^xsd:nonNegativeInteger ; - nif:endIndex "7955"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "However" ; - nif:beginIndex "1318"^^xsd:nonNegativeInteger ; - nif:endIndex "1325"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Despite the warning, there was a unanimous vote to enter a candidate, according to Republicans who attended." ; - nif:beginIndex "6082"^^xsd:nonNegativeInteger ; - nif:endIndex "6190"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "2814"^^xsd:nonNegativeInteger ; - nif:endIndex "2816"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "9322"^^xsd:nonNegativeInteger ; - nif:endIndex "9326"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "laws" ; - nif:beginIndex "981"^^xsd:nonNegativeInteger ; - nif:endIndex "985"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "1353"^^xsd:nonNegativeInteger ; - nif:endIndex "1355"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "then" ; - nif:beginIndex "8003"^^xsd:nonNegativeInteger ; - nif:endIndex "8007"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2742"^^xsd:nonNegativeInteger ; - nif:endIndex "2745"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "2179"^^xsd:nonNegativeInteger ; - nif:endIndex "2181"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "1206"^^xsd:nonNegativeInteger ; - nif:endIndex "1209"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "1172"^^xsd:nonNegativeInteger ; - nif:endIndex "1177"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "sheriff" ; - nif:beginIndex "12197"^^xsd:nonNegativeInteger ; - nif:endIndex "12204"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "duty" ; - nif:beginIndex "3914"^^xsd:nonNegativeInteger ; - nif:endIndex "3918"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "registration" ; - nif:beginIndex "839"^^xsd:nonNegativeInteger ; - nif:endIndex "851"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "2003"^^xsd:nonNegativeInteger ; - nif:endIndex "2006"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Congress" ; - nif:beginIndex "10506"^^xsd:nonNegativeInteger ; - nif:endIndex "10514"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jurors" ; - nif:beginIndex "2609"^^xsd:nonNegativeInteger ; - nif:endIndex "2615"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "allowances" ; - nif:beginIndex "9008"^^xsd:nonNegativeInteger ; - nif:endIndex "9018"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "funds" ; - nif:beginIndex "10060"^^xsd:nonNegativeInteger ; - nif:endIndex "10065"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "seek" ; - nif:beginIndex "9348"^^xsd:nonNegativeInteger ; - nif:endIndex "9352"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "ballot" ; - nif:beginIndex "6637"^^xsd:nonNegativeInteger ; - nif:endIndex "6643"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "apparently" ; - nif:beginIndex "8318"^^xsd:nonNegativeInteger ; - nif:endIndex "8328"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "1325"^^xsd:nonNegativeInteger ; - nif:endIndex "1326"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "11183"^^xsd:nonNegativeInteger ; - nif:endIndex "11186"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1" ; - nif:beginIndex "3441"^^xsd:nonNegativeInteger ; - nif:endIndex "3442"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "couple" ; - nif:beginIndex "4486"^^xsd:nonNegativeInteger ; - nif:endIndex "4492"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "9127"^^xsd:nonNegativeInteger ; - nif:endIndex "9129"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "alternative" ; - nif:beginIndex "6441"^^xsd:nonNegativeInteger ; - nif:endIndex "6452"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "6150"^^xsd:nonNegativeInteger ; - nif:endIndex "6151"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "5281"^^xsd:nonNegativeInteger ; - nif:endIndex "5283"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "issue" ; - nif:beginIndex "7663"^^xsd:nonNegativeInteger ; - nif:endIndex "7668"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "4591"^^xsd:nonNegativeInteger ; - nif:endIndex "4593"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "state" ; - nif:beginIndex "7928"^^xsd:nonNegativeInteger ; - nif:endIndex "7933"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "5184"^^xsd:nonNegativeInteger ; - nif:endIndex "5186"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "1279"^^xsd:nonNegativeInteger ; - nif:endIndex "1281"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "11962"^^xsd:nonNegativeInteger ; - nif:endIndex "11967"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "million" ; - nif:beginIndex "8603"^^xsd:nonNegativeInteger ; - nif:endIndex "8610"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "November" ; - nif:beginIndex "9630"^^xsd:nonNegativeInteger ; - nif:endIndex "9638"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "off" ; - nif:beginIndex "8472"^^xsd:nonNegativeInteger ; - nif:endIndex "8475"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "1892"^^xsd:nonNegativeInteger ; - nif:endIndex "1894"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Highway" ; - nif:beginIndex "7784"^^xsd:nonNegativeInteger ; - nif:endIndex "7791"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "``Only a relative handful of such reports was received'', the jury said, ``considering the widespread interest in the election, the number of voters and the size of this city''." ; - nif:beginIndex "612"^^xsd:nonNegativeInteger ; - nif:endIndex "789"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4689"^^xsd:nonNegativeInteger ; - nif:endIndex "4690"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "3248"^^xsd:nonNegativeInteger ; - nif:endIndex "3251"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "5074"^^xsd:nonNegativeInteger ; - nif:endIndex "5076"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11481"^^xsd:nonNegativeInteger ; - nif:endIndex "11482"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7115"^^xsd:nonNegativeInteger ; - nif:endIndex "7117"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "meeting" ; - nif:beginIndex "5904"^^xsd:nonNegativeInteger ; - nif:endIndex "5911"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "these" ; - nif:beginIndex "2420"^^xsd:nonNegativeInteger ; - nif:endIndex "2425"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "periodic" ; - nif:beginIndex "3587"^^xsd:nonNegativeInteger ; - nif:endIndex "3595"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "disproportionate" ; - nif:beginIndex "2668"^^xsd:nonNegativeInteger ; - nif:endIndex "2684"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "time" ; - nif:beginIndex "8432"^^xsd:nonNegativeInteger ; - nif:endIndex "8436"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "8337"^^xsd:nonNegativeInteger ; - nif:endIndex "8339"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "added" ; - nif:beginIndex "10293"^^xsd:nonNegativeInteger ; - nif:endIndex "10298"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "three" ; - nif:beginIndex "7041"^^xsd:nonNegativeInteger ; - nif:endIndex "7046"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "9313"^^xsd:nonNegativeInteger ; - nif:endIndex "9314"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "3930"^^xsd:nonNegativeInteger ; - nif:endIndex "3932"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10039"^^xsd:nonNegativeInteger ; - nif:endIndex "10040"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "5046"^^xsd:nonNegativeInteger ; - nif:endIndex "5047"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Schley" ; - nif:beginIndex "8842"^^xsd:nonNegativeInteger ; - nif:endIndex "8848"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "11533"^^xsd:nonNegativeInteger ; - nif:endIndex "11535"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "school" ; - nif:beginIndex "10935"^^xsd:nonNegativeInteger ; - nif:endIndex "10941"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "9327"^^xsd:nonNegativeInteger ; - nif:endIndex "9330"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Mayor William B. Hartsfield filed suit for divorce from his wife, Pearl Williams Hartsfield, in Fulton Superior Court Friday." ; - nif:beginIndex "4319"^^xsd:nonNegativeInteger ; - nif:endIndex "4444"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "ones" ; - nif:beginIndex "8458"^^xsd:nonNegativeInteger ; - nif:endIndex "8462"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "urged" ; - nif:beginIndex "1756"^^xsd:nonNegativeInteger ; - nif:endIndex "1761"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "calls" ; - nif:beginIndex "11820"^^xsd:nonNegativeInteger ; - nif:endIndex "11825"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "expended" ; - nif:beginIndex "10082"^^xsd:nonNegativeInteger ; - nif:endIndex "10090"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "5856"^^xsd:nonNegativeInteger ; - nif:endIndex "5860"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "4415"^^xsd:nonNegativeInteger ; - nif:endIndex "4421"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "James" ; - nif:beginIndex "5709"^^xsd:nonNegativeInteger ; - nif:endIndex "5714"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "praise" ; - nif:beginIndex "296"^^xsd:nonNegativeInteger ; - nif:endIndex "302"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "getting" ; - nif:beginIndex "5442"^^xsd:nonNegativeInteger ; - nif:endIndex "5449"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "reelection" ; - nif:beginIndex "5406"^^xsd:nonNegativeInteger ; - nif:endIndex "5416"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "general" ; - nif:beginIndex "2101"^^xsd:nonNegativeInteger ; - nif:endIndex "2108"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1" ; - nif:beginIndex "3773"^^xsd:nonNegativeInteger ; - nif:endIndex "3774"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "end" ; - nif:beginIndex "1013"^^xsd:nonNegativeInteger ; - nif:endIndex "1016"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "campaign" ; - nif:beginIndex "11737"^^xsd:nonNegativeInteger ; - nif:endIndex "11745"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "3375"^^xsd:nonNegativeInteger ; - nif:endIndex "3379"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7837"^^xsd:nonNegativeInteger ; - nif:endIndex "7840"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "1079"^^xsd:nonNegativeInteger ; - nif:endIndex "1081"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "issue" ; - nif:beginIndex "8179"^^xsd:nonNegativeInteger ; - nif:endIndex "8184"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Caldwell's resignation had been expected for some time." ; - nif:beginIndex "6897"^^xsd:nonNegativeInteger ; - nif:endIndex "6952"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "1907"^^xsd:nonNegativeInteger ; - nif:endIndex "1910"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "formally" ; - nif:beginIndex "10456"^^xsd:nonNegativeInteger ; - nif:endIndex "10464"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "purchasing" ; - nif:beginIndex "1149"^^xsd:nonNegativeInteger ; - nif:endIndex "1159"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "2272"^^xsd:nonNegativeInteger ; - nif:endIndex "2274"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "defeated" ; - nif:beginIndex "10912"^^xsd:nonNegativeInteger ; - nif:endIndex "10920"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3661"^^xsd:nonNegativeInteger ; - nif:endIndex "3664"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "wanted" ; - nif:beginIndex "6227"^^xsd:nonNegativeInteger ; - nif:endIndex "6233"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "10779"^^xsd:nonNegativeInteger ; - nif:endIndex "10782"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf ":" ; - nif:beginIndex "3770"^^xsd:nonNegativeInteger ; - nif:endIndex "3771"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "8284"^^xsd:nonNegativeInteger ; - nif:endIndex "8285"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "When" ; - nif:beginIndex "6191"^^xsd:nonNegativeInteger ; - nif:endIndex "6195"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "2304"^^xsd:nonNegativeInteger ; - nif:endIndex "2306"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "2971"^^xsd:nonNegativeInteger ; - nif:endIndex "2974"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "9456"^^xsd:nonNegativeInteger ; - nif:endIndex "9458"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ask" ; - nif:beginIndex "3709"^^xsd:nonNegativeInteger ; - nif:endIndex "3712"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "but" ; - nif:beginIndex "3551"^^xsd:nonNegativeInteger ; - nif:endIndex "3554"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "11546"^^xsd:nonNegativeInteger ; - nif:endIndex "11549"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "9563"^^xsd:nonNegativeInteger ; - nif:endIndex "9565"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "than" ; - nif:beginIndex "7036"^^xsd:nonNegativeInteger ; - nif:endIndex "7040"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "6289"^^xsd:nonNegativeInteger ; - nif:endIndex "6292"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "6961"^^xsd:nonNegativeInteger ; - nif:endIndex "6963"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "occupation" ; - nif:beginIndex "4723"^^xsd:nonNegativeInteger ; - nif:endIndex "4733"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7554"^^xsd:nonNegativeInteger ; - nif:endIndex "7556"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "4952"^^xsd:nonNegativeInteger ; - nif:endIndex "4954"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "10793"^^xsd:nonNegativeInteger ; - nif:endIndex "10801"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "A" ; - nif:beginIndex "9428"^^xsd:nonNegativeInteger ; - nif:endIndex "9429"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "8905"^^xsd:nonNegativeInteger ; - nif:endIndex "8907"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "6272"^^xsd:nonNegativeInteger ; - nif:endIndex "6273"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "8738"^^xsd:nonNegativeInteger ; - nif:endIndex "8739"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "also" ; - nif:beginIndex "2724"^^xsd:nonNegativeInteger ; - nif:endIndex "2728"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10875"^^xsd:nonNegativeInteger ; - nif:endIndex "10876"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "104"^^xsd:nonNegativeInteger ; - nif:endIndex "106"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "2833"^^xsd:nonNegativeInteger ; - nif:endIndex "2835"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "outright" ; - nif:beginIndex "9139"^^xsd:nonNegativeInteger ; - nif:endIndex "9147"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "charged" ; - nif:beginIndex "432"^^xsd:nonNegativeInteger ; - nif:endIndex "439"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "4924"^^xsd:nonNegativeInteger ; - nif:endIndex "4925"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Implementation" ; - nif:beginIndex "1661"^^xsd:nonNegativeInteger ; - nif:endIndex "1675"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "2144"^^xsd:nonNegativeInteger ; - nif:endIndex "2145"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "recommended" ; - nif:beginIndex "3754"^^xsd:nonNegativeInteger ; - nif:endIndex "3765"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "more" ; - nif:beginIndex "6246"^^xsd:nonNegativeInteger ; - nif:endIndex "6250"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "into" ; - nif:beginIndex "6013"^^xsd:nonNegativeInteger ; - nif:endIndex "6017"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "other" ; - nif:beginIndex "1094"^^xsd:nonNegativeInteger ; - nif:endIndex "1099"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "worth" ; - nif:beginIndex "8215"^^xsd:nonNegativeInteger ; - nif:endIndex "8220"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "363"^^xsd:nonNegativeInteger ; - nif:endIndex "366"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6692"^^xsd:nonNegativeInteger ; - nif:endIndex "6695"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "receive" ; - nif:beginIndex "2556"^^xsd:nonNegativeInteger ; - nif:endIndex "2563"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "9957"^^xsd:nonNegativeInteger ; - nif:endIndex "9958"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "As" ; - nif:beginIndex "9489"^^xsd:nonNegativeInteger ; - nif:endIndex "9491"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "worth" ; - nif:beginIndex "8611"^^xsd:nonNegativeInteger ; - nif:endIndex "8616"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "campaign" ; - nif:beginIndex "7127"^^xsd:nonNegativeInteger ; - nif:endIndex "7135"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "then" ; - nif:beginIndex "8661"^^xsd:nonNegativeInteger ; - nif:endIndex "8665"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "former" ; - nif:beginIndex "11550"^^xsd:nonNegativeInteger ; - nif:endIndex "11556"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "11179"^^xsd:nonNegativeInteger ; - nif:endIndex "11182"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "put" ; - nif:beginIndex "10768"^^xsd:nonNegativeInteger ; - nif:endIndex "10771"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "home" ; - nif:beginIndex "7232"^^xsd:nonNegativeInteger ; - nif:endIndex "7236"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "he" ; - nif:beginIndex "11653"^^xsd:nonNegativeInteger ; - nif:endIndex "11655"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "3311"^^xsd:nonNegativeInteger ; - nif:endIndex "3315"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "9470"^^xsd:nonNegativeInteger ; - nif:endIndex "9472"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department" ; - nif:beginIndex "2168"^^xsd:nonNegativeInteger ; - nif:endIndex "2178"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "city" ; - nif:beginIndex "3969"^^xsd:nonNegativeInteger ; - nif:endIndex "3973"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "future" ; - nif:beginIndex "2528"^^xsd:nonNegativeInteger ; - nif:endIndex "2534"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "just" ; - nif:beginIndex "11290"^^xsd:nonNegativeInteger ; - nif:endIndex "11294"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "178"^^xsd:nonNegativeInteger ; - nif:endIndex "180"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "3470"^^xsd:nonNegativeInteger ; - nif:endIndex "3471"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "its" ; - nif:beginIndex "2800"^^xsd:nonNegativeInteger ; - nif:endIndex "2803"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "office" ; - nif:beginIndex "5237"^^xsd:nonNegativeInteger ; - nif:endIndex "5243"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "1461"^^xsd:nonNegativeInteger ; - nif:endIndex "1462"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "an" ; - nif:beginIndex "41"^^xsd:nonNegativeInteger ; - nif:endIndex "43"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "reportedly" ; - nif:beginIndex "11780"^^xsd:nonNegativeInteger ; - nif:endIndex "11790"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "citizens" ; - nif:beginIndex "11941"^^xsd:nonNegativeInteger ; - nif:endIndex "11949"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "increase" ; - nif:beginIndex "8988"^^xsd:nonNegativeInteger ; - nif:endIndex "8996"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "elaborate" ; - nif:beginIndex "3540"^^xsd:nonNegativeInteger ; - nif:endIndex "3549"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "3298"^^xsd:nonNegativeInteger ; - nif:endIndex "3300"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Sunday" ; - nif:beginIndex "9032"^^xsd:nonNegativeInteger ; - nif:endIndex "9038"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "personnel" ; - nif:beginIndex "1547"^^xsd:nonNegativeInteger ; - nif:endIndex "1556"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "440"^^xsd:nonNegativeInteger ; - nif:endIndex "442"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "with" ; - nif:beginIndex "2285"^^xsd:nonNegativeInteger ; - nif:endIndex "2289"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "elected" ; - nif:beginIndex "5826"^^xsd:nonNegativeInteger ; - nif:endIndex "5833"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "6101"^^xsd:nonNegativeInteger ; - nif:endIndex "6102"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "This" ; - nif:beginIndex "11174"^^xsd:nonNegativeInteger ; - nif:endIndex "11178"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "5207"^^xsd:nonNegativeInteger ; - nif:endIndex "5208"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "10476"^^xsd:nonNegativeInteger ; - nif:endIndex "10477"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "765"^^xsd:nonNegativeInteger ; - nif:endIndex "768"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "10256"^^xsd:nonNegativeInteger ; - nif:endIndex "10258"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4517"^^xsd:nonNegativeInteger ; - nif:endIndex "4518"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "2535"^^xsd:nonNegativeInteger ; - nif:endIndex "2541"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "7251"^^xsd:nonNegativeInteger ; - nif:endIndex "7253"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "law" ; - nif:beginIndex "6382"^^xsd:nonNegativeInteger ; - nif:endIndex "6385"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "so" ; - nif:beginIndex "1843"^^xsd:nonNegativeInteger ; - nif:endIndex "1845"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "members" ; - nif:beginIndex "10180"^^xsd:nonNegativeInteger ; - nif:endIndex "10187"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "appraisers" ; - nif:beginIndex "2836"^^xsd:nonNegativeInteger ; - nif:endIndex "2846"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "seen" ; - nif:beginIndex "2185"^^xsd:nonNegativeInteger ; - nif:endIndex "2189"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "citizens" ; - nif:beginIndex "3113"^^xsd:nonNegativeInteger ; - nif:endIndex "3121"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8559"^^xsd:nonNegativeInteger ; - nif:endIndex "8562"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "take" ; - nif:beginIndex "1624"^^xsd:nonNegativeInteger ; - nif:endIndex "1628"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "should" ; - nif:beginIndex "1373"^^xsd:nonNegativeInteger ; - nif:endIndex "1379"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "4855"^^xsd:nonNegativeInteger ; - nif:endIndex "4859"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "1032"^^xsd:nonNegativeInteger ; - nif:endIndex "1035"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "its" ; - nif:beginIndex "2993"^^xsd:nonNegativeInteger ; - nif:endIndex "2996"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "number" ; - nif:beginIndex "1084"^^xsd:nonNegativeInteger ; - nif:endIndex "1090"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "pointed" ; - nif:beginIndex "9593"^^xsd:nonNegativeInteger ; - nif:endIndex "9600"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "remedy" ; - nif:beginIndex "1638"^^xsd:nonNegativeInteger ; - nif:endIndex "1644"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "heavily" ; - nif:beginIndex "8089"^^xsd:nonNegativeInteger ; - nif:endIndex "8096"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "5311"^^xsd:nonNegativeInteger ; - nif:endIndex "5312"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "technical" ; - nif:beginIndex "9270"^^xsd:nonNegativeInteger ; - nif:endIndex "9279"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "polls" ; - nif:beginIndex "11999"^^xsd:nonNegativeInteger ; - nif:endIndex "12004"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "9605"^^xsd:nonNegativeInteger ; - nif:endIndex "9609"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "priority" ; - nif:beginIndex "10620"^^xsd:nonNegativeInteger ; - nif:endIndex "10628"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "handful" ; - nif:beginIndex "630"^^xsd:nonNegativeInteger ; - nif:endIndex "637"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "3571"^^xsd:nonNegativeInteger ; - nif:endIndex "3576"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "3766"^^xsd:nonNegativeInteger ; - nif:endIndex "3770"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "9821"^^xsd:nonNegativeInteger ; - nif:endIndex "9823"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7633"^^xsd:nonNegativeInteger ; - nif:endIndex "7636"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "2194"^^xsd:nonNegativeInteger ; - nif:endIndex "2196"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Sheriff" ; - nif:beginIndex "12089"^^xsd:nonNegativeInteger ; - nif:endIndex "12096"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "with" ; - nif:beginIndex "11118"^^xsd:nonNegativeInteger ; - nif:endIndex "11122"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "race" ; - nif:beginIndex "8516"^^xsd:nonNegativeInteger ; - nif:endIndex "8520"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "2459"^^xsd:nonNegativeInteger ; - nif:endIndex "2461"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "two" ; - nif:beginIndex "6437"^^xsd:nonNegativeInteger ; - nif:endIndex "6440"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "1499"^^xsd:nonNegativeInteger ; - nif:endIndex "1503"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Highway" ; - nif:beginIndex "8118"^^xsd:nonNegativeInteger ; - nif:endIndex "8125"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "candidate" ; - nif:beginIndex "5482"^^xsd:nonNegativeInteger ; - nif:endIndex "5491"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "5747"^^xsd:nonNegativeInteger ; - nif:endIndex "5750"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Atlanta's" ; - nif:beginIndex "61"^^xsd:nonNegativeInteger ; - nif:endIndex "70"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "After" ; - nif:beginIndex "10666"^^xsd:nonNegativeInteger ; - nif:endIndex "10671"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "coolest" ; - nif:beginIndex "10785"^^xsd:nonNegativeInteger ; - nif:endIndex "10792"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "race" ; - nif:beginIndex "6268"^^xsd:nonNegativeInteger ; - nif:endIndex "6272"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "whether" ; - nif:beginIndex "6216"^^xsd:nonNegativeInteger ; - nif:endIndex "6223"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "8974"^^xsd:nonNegativeInteger ; - nif:endIndex "8975"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Bellwood" ; - nif:beginIndex "4232"^^xsd:nonNegativeInteger ; - nif:endIndex "4240"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "elected" ; - nif:beginIndex "3252"^^xsd:nonNegativeInteger ; - nif:endIndex "3259"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3612"^^xsd:nonNegativeInteger ; - nif:endIndex "3615"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Marvin" ; - nif:beginIndex "8671"^^xsd:nonNegativeInteger ; - nif:endIndex "8677"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Judge" ; - nif:beginIndex "465"^^xsd:nonNegativeInteger ; - nif:endIndex "470"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "new" ; - nif:beginIndex "10717"^^xsd:nonNegativeInteger ; - nif:endIndex "10720"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "1508"^^xsd:nonNegativeInteger ; - nif:endIndex "1509"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The jury further said in term-end presentments that the City Executive Committee, which had over-all charge of the election, ``deserves the praise and thanks of the City of Atlanta'' for the manner in which the election was conducted." ; - nif:beginIndex "156"^^xsd:nonNegativeInteger ; - nif:endIndex "390"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "efficiency" ; - nif:beginIndex "1411"^^xsd:nonNegativeInteger ; - nif:endIndex "1421"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "four" ; - nif:beginIndex "11637"^^xsd:nonNegativeInteger ; - nif:endIndex "11641"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "gubernatorial" ; - nif:beginIndex "7063"^^xsd:nonNegativeInteger ; - nif:endIndex "7076"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "major" ; - nif:beginIndex "2068"^^xsd:nonNegativeInteger ; - nif:endIndex "2073"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "2715"^^xsd:nonNegativeInteger ; - nif:endIndex "2718"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bonds" ; - nif:beginIndex "8643"^^xsd:nonNegativeInteger ; - nif:endIndex "8648"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "9492"^^xsd:nonNegativeInteger ; - nif:endIndex "9494"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "pay" ; - nif:beginIndex "9707"^^xsd:nonNegativeInteger ; - nif:endIndex "9710"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "2643"^^xsd:nonNegativeInteger ; - nif:endIndex "2647"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "2" ; - nif:beginIndex "6648"^^xsd:nonNegativeInteger ; - nif:endIndex "6649"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "8063"^^xsd:nonNegativeInteger ; - nif:endIndex "8065"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "292"^^xsd:nonNegativeInteger ; - nif:endIndex "295"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "hold" ; - nif:beginIndex "6671"^^xsd:nonNegativeInteger ; - nif:endIndex "6675"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "inure" ; - nif:beginIndex "1252"^^xsd:nonNegativeInteger ; - nif:endIndex "1257"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Georgia's" ; - nif:beginIndex "1679"^^xsd:nonNegativeInteger ; - nif:endIndex "1688"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "evidence" ; - nif:beginIndex "109"^^xsd:nonNegativeInteger ; - nif:endIndex "117"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "two" ; - nif:beginIndex "1361"^^xsd:nonNegativeInteger ; - nif:endIndex "1364"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "1569"^^xsd:nonNegativeInteger ; - nif:endIndex "1571"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "8617"^^xsd:nonNegativeInteger ; - nif:endIndex "8619"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4228"^^xsd:nonNegativeInteger ; - nif:endIndex "4231"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "In" ; - nif:beginIndex "5886"^^xsd:nonNegativeInteger ; - nif:endIndex "5888"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "insure" ; - nif:beginIndex "11158"^^xsd:nonNegativeInteger ; - nif:endIndex "11164"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "unit" ; - nif:beginIndex "6703"^^xsd:nonNegativeInteger ; - nif:endIndex "6707"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Texas" ; - nif:beginIndex "5834"^^xsd:nonNegativeInteger ; - nif:endIndex "5839"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Meanwhile" ; - nif:beginIndex "7748"^^xsd:nonNegativeInteger ; - nif:endIndex "7757"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "carry" ; - nif:beginIndex "12043"^^xsd:nonNegativeInteger ; - nif:endIndex "12048"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "1459"^^xsd:nonNegativeInteger ; - nif:endIndex "1461"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "740"^^xsd:nonNegativeInteger ; - nif:endIndex "743"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "well" ; - nif:beginIndex "1192"^^xsd:nonNegativeInteger ; - nif:endIndex "1196"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "(2)" ; - nif:beginIndex "3934"^^xsd:nonNegativeInteger ; - nif:endIndex "3937"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , . - - - a nif:Word ; - nif:anchorOf "--" ; - nif:beginIndex "6715"^^xsd:nonNegativeInteger ; - nif:endIndex "6717"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Five per cent of the voters in each county must sign petitions requesting that the Republicans be allowed to place names of candidates on the general election ballot, or 2" ; - nif:beginIndex "6478"^^xsd:nonNegativeInteger ; - nif:endIndex "6649"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "10672"^^xsd:nonNegativeInteger ; - nif:endIndex "10673"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "3984"^^xsd:nonNegativeInteger ; - nif:endIndex "3986"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "826"^^xsd:nonNegativeInteger ; - nif:endIndex "828"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "3521"^^xsd:nonNegativeInteger ; - nif:endIndex "3522"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "10284"^^xsd:nonNegativeInteger ; - nif:endIndex "10285"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "ambiguous" ; - nif:beginIndex "909"^^xsd:nonNegativeInteger ; - nif:endIndex "918"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "realize" ; - nif:beginIndex "2378"^^xsd:nonNegativeInteger ; - nif:endIndex "2385"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "There" ; - nif:beginIndex "12213"^^xsd:nonNegativeInteger ; - nif:endIndex "12218"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "7345"^^xsd:nonNegativeInteger ; - nif:endIndex "7346"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1261"^^xsd:nonNegativeInteger ; - nif:endIndex "1264"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "9230"^^xsd:nonNegativeInteger ; - nif:endIndex "9235"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3302"^^xsd:nonNegativeInteger ; - nif:endIndex "3305"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "them" ; - nif:beginIndex "1114"^^xsd:nonNegativeInteger ; - nif:endIndex "1118"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "2388"^^xsd:nonNegativeInteger ; - nif:endIndex "2389"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Callan" ; - nif:beginIndex "11597"^^xsd:nonNegativeInteger ; - nif:endIndex "11603"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "When" ; - nif:beginIndex "7054"^^xsd:nonNegativeInteger ; - nif:endIndex "7058"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "674"^^xsd:nonNegativeInteger ; - nif:endIndex "678"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "12055"^^xsd:nonNegativeInteger ; - nif:endIndex "12058"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "12004"^^xsd:nonNegativeInteger ; - nif:endIndex "12005"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Education" ; - nif:beginIndex "10581"^^xsd:nonNegativeInteger ; - nif:endIndex "10590"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "mental" ; - nif:beginIndex "4466"^^xsd:nonNegativeInteger ; - nif:endIndex "4472"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "future" ; - nif:beginIndex "9722"^^xsd:nonNegativeInteger ; - nif:endIndex "9728"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "vote" ; - nif:beginIndex "9699"^^xsd:nonNegativeInteger ; - nif:endIndex "9703"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "9107"^^xsd:nonNegativeInteger ; - nif:endIndex "9109"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "recommended" ; - nif:beginIndex "1719"^^xsd:nonNegativeInteger ; - nif:endIndex "1730"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "traditional" ; - nif:beginIndex "7536"^^xsd:nonNegativeInteger ; - nif:endIndex "7547"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "2030"^^xsd:nonNegativeInteger ; - nif:endIndex "2032"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "issued" ; - nif:beginIndex "8419"^^xsd:nonNegativeInteger ; - nif:endIndex "8425"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "calls" ; - nif:beginIndex "11508"^^xsd:nonNegativeInteger ; - nif:endIndex "11513"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "fund" ; - nif:beginIndex "8383"^^xsd:nonNegativeInteger ; - nif:endIndex "8387"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The largest hurdle the Republicans would have to face is a state law which says that before making a first race, one of two alternative courses must be taken: 1" ; - nif:beginIndex "6317"^^xsd:nonNegativeInteger ; - nif:endIndex "6477"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "meeting" ; - nif:beginIndex "5597"^^xsd:nonNegativeInteger ; - nif:endIndex "5604"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "reports" ; - nif:beginIndex "498"^^xsd:nonNegativeInteger ; - nif:endIndex "505"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "City" ; - nif:beginIndex "321"^^xsd:nonNegativeInteger ; - nif:endIndex "325"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Davis" ; - nif:beginIndex "10870"^^xsd:nonNegativeInteger ; - nif:endIndex "10875"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8345"^^xsd:nonNegativeInteger ; - nif:endIndex "8348"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "wind" ; - nif:beginIndex "7187"^^xsd:nonNegativeInteger ; - nif:endIndex "7191"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6336"^^xsd:nonNegativeInteger ; - nif:endIndex "6339"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "matters" ; - nif:beginIndex "3736"^^xsd:nonNegativeInteger ; - nif:endIndex "3743"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "11916"^^xsd:nonNegativeInteger ; - nif:endIndex "11919"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "done" ; - nif:beginIndex "9070"^^xsd:nonNegativeInteger ; - nif:endIndex "9074"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "I" ; - nif:beginIndex "10802"^^xsd:nonNegativeInteger ; - nif:endIndex "10803"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "son" ; - nif:beginIndex "4531"^^xsd:nonNegativeInteger ; - nif:endIndex "4534"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "1135"^^xsd:nonNegativeInteger ; - nif:endIndex "1141"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "employes" ; - nif:beginIndex "4096"^^xsd:nonNegativeInteger ; - nif:endIndex "4104"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "7762"^^xsd:nonNegativeInteger ; - nif:endIndex "7765"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "vote" ; - nif:beginIndex "9102"^^xsd:nonNegativeInteger ; - nif:endIndex "9106"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "promise" ; - nif:beginIndex "12151"^^xsd:nonNegativeInteger ; - nif:endIndex "12158"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "day" ; - nif:beginIndex "8984"^^xsd:nonNegativeInteger ; - nif:endIndex "8987"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "less" ; - nif:beginIndex "2466"^^xsd:nonNegativeInteger ; - nif:endIndex "2470"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "practices" ; - nif:beginIndex "3624"^^xsd:nonNegativeInteger ; - nif:endIndex "3633"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "W." ; - nif:beginIndex "5715"^^xsd:nonNegativeInteger ; - nif:endIndex "5717"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "3853"^^xsd:nonNegativeInteger ; - nif:endIndex "3854"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "county" ; - nif:beginIndex "6514"^^xsd:nonNegativeInteger ; - nif:endIndex "6520"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "steps" ; - nif:beginIndex "1629"^^xsd:nonNegativeInteger ; - nif:endIndex "1634"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "calmest" ; - nif:beginIndex "11196"^^xsd:nonNegativeInteger ; - nif:endIndex "11203"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "funds" ; - nif:beginIndex "2426"^^xsd:nonNegativeInteger ; - nif:endIndex "2431"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "million" ; - nif:beginIndex "8193"^^xsd:nonNegativeInteger ; - nif:endIndex "8200"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "sign" ; - nif:beginIndex "6526"^^xsd:nonNegativeInteger ; - nif:endIndex "6530"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "1788"^^xsd:nonNegativeInteger ; - nif:endIndex "1790"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "listed" ; - nif:beginIndex "4772"^^xsd:nonNegativeInteger ; - nif:endIndex "4778"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "3813"^^xsd:nonNegativeInteger ; - nif:endIndex "3815"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4569"^^xsd:nonNegativeInteger ; - nif:endIndex "4570"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "11410"^^xsd:nonNegativeInteger ; - nif:endIndex "11412"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "3467"^^xsd:nonNegativeInteger ; - nif:endIndex "3469"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "work" ; - nif:beginIndex "8280"^^xsd:nonNegativeInteger ; - nif:endIndex "8284"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9574"^^xsd:nonNegativeInteger ; - nif:endIndex "9577"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1119"^^xsd:nonNegativeInteger ; - nif:endIndex "1122"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "years" ; - nif:beginIndex "7047"^^xsd:nonNegativeInteger ; - nif:endIndex "7052"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "school" ; - nif:beginIndex "11564"^^xsd:nonNegativeInteger ; - nif:endIndex "11570"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11928"^^xsd:nonNegativeInteger ; - nif:endIndex "11929"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "calls" ; - nif:beginIndex "11886"^^xsd:nonNegativeInteger ; - nif:endIndex "11891"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "2488"^^xsd:nonNegativeInteger ; - nif:endIndex "2490"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4443"^^xsd:nonNegativeInteger ; - nif:endIndex "4444"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "10933"^^xsd:nonNegativeInteger ; - nif:endIndex "10934"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7059"^^xsd:nonNegativeInteger ; - nif:endIndex "7062"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "10968"^^xsd:nonNegativeInteger ; - nif:endIndex "10971"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "no" ; - nif:beginIndex "6283"^^xsd:nonNegativeInteger ; - nif:endIndex "6285"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "While emphasizing that technical details were not fully worked out, Pelham said his resolution would seek to set aside the privilege resolution which the House voted through 87-31." ; - nif:beginIndex "9247"^^xsd:nonNegativeInteger ; - nif:endIndex "9427"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "Nevertheless" ; - nif:beginIndex "2492"^^xsd:nonNegativeInteger ; - nif:endIndex "2504"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "483"^^xsd:nonNegativeInteger ; - nif:endIndex "485"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "conducted" ; - nif:beginIndex "380"^^xsd:nonNegativeInteger ; - nif:endIndex "389"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "recommended" ; - nif:beginIndex "3380"^^xsd:nonNegativeInteger ; - nif:endIndex "3391"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "8252"^^xsd:nonNegativeInteger ; - nif:endIndex "8255"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "manner" ; - nif:beginIndex "347"^^xsd:nonNegativeInteger ; - nif:endIndex "353"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "director" ; - nif:beginIndex "6824"^^xsd:nonNegativeInteger ; - nif:endIndex "6832"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Regarding" ; - nif:beginIndex "3317"^^xsd:nonNegativeInteger ; - nif:endIndex "3326"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "2252"^^xsd:nonNegativeInteger ; - nif:endIndex "2254"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "campaign" ; - nif:beginIndex "11393"^^xsd:nonNegativeInteger ; - nif:endIndex "11401"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "326"^^xsd:nonNegativeInteger ; - nif:endIndex "328"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7524"^^xsd:nonNegativeInteger ; - nif:endIndex "7526"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The jury said it found the court ``has incorporated into its operating procedures the recommendations'' of two previous grand juries, the Atlanta Bar Association and an interim citizens committee." ; - nif:beginIndex "2936"^^xsd:nonNegativeInteger ; - nif:endIndex "3132"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7427"^^xsd:nonNegativeInteger ; - nif:endIndex "7429"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2083"^^xsd:nonNegativeInteger ; - nif:endIndex "2086"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "8371"^^xsd:nonNegativeInteger ; - nif:endIndex "8372"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "commented" ; - nif:beginIndex "1069"^^xsd:nonNegativeInteger ; - nif:endIndex "1078"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "2140"^^xsd:nonNegativeInteger ; - nif:endIndex "2144"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "1608"^^xsd:nonNegativeInteger ; - nif:endIndex "1612"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Grand" ; - nif:beginIndex "18"^^xsd:nonNegativeInteger ; - nif:endIndex "23"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9896"^^xsd:nonNegativeInteger ; - nif:endIndex "9899"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "9195"^^xsd:nonNegativeInteger ; - nif:endIndex "9199"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "married" ; - nif:beginIndex "4497"^^xsd:nonNegativeInteger ; - nif:endIndex "4504"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "dispute" ; - nif:beginIndex "11679"^^xsd:nonNegativeInteger ; - nif:endIndex "11686"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "says" ; - nif:beginIndex "10481"^^xsd:nonNegativeInteger ; - nif:endIndex "10485"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4860"^^xsd:nonNegativeInteger ; - nif:endIndex "4863"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "2577"^^xsd:nonNegativeInteger ; - nif:endIndex "2579"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3195"^^xsd:nonNegativeInteger ; - nif:endIndex "3198"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "564"^^xsd:nonNegativeInteger ; - nif:endIndex "569"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "8221"^^xsd:nonNegativeInteger ; - nif:endIndex "8223"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "4779"^^xsd:nonNegativeInteger ; - nif:endIndex "4782"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Monday" ; - nif:beginIndex "7329"^^xsd:nonNegativeInteger ; - nif:endIndex "7335"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "incorporated" ; - nif:beginIndex "2975"^^xsd:nonNegativeInteger ; - nif:endIndex "2987"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "12226"^^xsd:nonNegativeInteger ; - nif:endIndex "12227"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "1422"^^xsd:nonNegativeInteger ; - nif:endIndex "1425"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "distribute" ; - nif:beginIndex "2197"^^xsd:nonNegativeInteger ; - nif:endIndex "2207"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "or" ; - nif:beginIndex "885"^^xsd:nonNegativeInteger ; - nif:endIndex "887"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "asked" ; - nif:beginIndex "10347"^^xsd:nonNegativeInteger ; - nif:endIndex "10352"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9796"^^xsd:nonNegativeInteger ; - nif:endIndex "9799"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "speaker" ; - nif:beginIndex "5877"^^xsd:nonNegativeInteger ; - nif:endIndex "5884"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "after" ; - nif:beginIndex "11897"^^xsd:nonNegativeInteger ; - nif:endIndex "11902"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "5536"^^xsd:nonNegativeInteger ; - nif:endIndex "5540"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "where" ; - nif:beginIndex "7240"^^xsd:nonNegativeInteger ; - nif:endIndex "7245"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "7896"^^xsd:nonNegativeInteger ; - nif:endIndex "7897"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "drop" ; - nif:beginIndex "11334"^^xsd:nonNegativeInteger ; - nif:endIndex "11338"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Wards protected" ; - nif:beginIndex "2920"^^xsd:nonNegativeInteger ; - nif:endIndex "2935"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "9236"^^xsd:nonNegativeInteger ; - nif:endIndex "9238"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "picking" ; - nif:beginIndex "5751"^^xsd:nonNegativeInteger ; - nif:endIndex "5758"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "are" ; - nif:beginIndex "10171"^^xsd:nonNegativeInteger ; - nif:endIndex "10174"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "9865"^^xsd:nonNegativeInteger ; - nif:endIndex "9867"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "under" ; - nif:beginIndex "6686"^^xsd:nonNegativeInteger ; - nif:endIndex "6691"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "11724"^^xsd:nonNegativeInteger ; - nif:endIndex "11727"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "additional" ; - nif:beginIndex "3781"^^xsd:nonNegativeInteger ; - nif:endIndex "3791"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Saturday's" ; - nif:beginIndex "11048"^^xsd:nonNegativeInteger ; - nif:endIndex "11058"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "such" ; - nif:beginIndex "641"^^xsd:nonNegativeInteger ; - nif:endIndex "645"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "10831"^^xsd:nonNegativeInteger ; - nif:endIndex "10834"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "deputies" ; - nif:beginIndex "3718"^^xsd:nonNegativeInteger ; - nif:endIndex "3726"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "--" ; - nif:beginIndex "10663"^^xsd:nonNegativeInteger ; - nif:endIndex "10665"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11852"^^xsd:nonNegativeInteger ; - nif:endIndex "11853"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1923" ; - nif:beginIndex "5203"^^xsd:nonNegativeInteger ; - nif:endIndex "5207"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "2739"^^xsd:nonNegativeInteger ; - nif:endIndex "2741"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "2506"^^xsd:nonNegativeInteger ; - nif:endIndex "2508"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "5884"^^xsd:nonNegativeInteger ; - nif:endIndex "5885"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "11142"^^xsd:nonNegativeInteger ; - nif:endIndex "11144"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "10398"^^xsd:nonNegativeInteger ; - nif:endIndex "10403"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1913" ; - nif:beginIndex "4513"^^xsd:nonNegativeInteger ; - nif:endIndex "4517"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Everything" ; - nif:beginIndex "12162"^^xsd:nonNegativeInteger ; - nif:endIndex "12172"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "raises" ; - nif:beginIndex "10646"^^xsd:nonNegativeInteger ; - nif:endIndex "10652"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "pistol" ; - nif:beginIndex "11125"^^xsd:nonNegativeInteger ; - nif:endIndex "11131"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "3369"^^xsd:nonNegativeInteger ; - nif:endIndex "3370"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Jr." ; - nif:beginIndex "4550"^^xsd:nonNegativeInteger ; - nif:endIndex "4553"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Hospital" ; - nif:beginIndex "4276"^^xsd:nonNegativeInteger ; - nif:endIndex "4284"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Democratic" ; - nif:beginIndex "10986"^^xsd:nonNegativeInteger ; - nif:endIndex "10996"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "term" ; - nif:beginIndex "5229"^^xsd:nonNegativeInteger ; - nif:endIndex "5233"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "11920"^^xsd:nonNegativeInteger ; - nif:endIndex "11928"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jail" ; - nif:beginIndex "3713"^^xsd:nonNegativeInteger ; - nif:endIndex "3717"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Legislature" ; - nif:beginIndex "9729"^^xsd:nonNegativeInteger ; - nif:endIndex "9740"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Republicans" ; - nif:beginIndex "6340"^^xsd:nonNegativeInteger ; - nif:endIndex "6351"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "under" ; - nif:beginIndex "8388"^^xsd:nonNegativeInteger ; - nif:endIndex "8393"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "2322"^^xsd:nonNegativeInteger ; - nif:endIndex "2327"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf ":" ; - nif:beginIndex "6474"^^xsd:nonNegativeInteger ; - nif:endIndex "6475"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "hurdle" ; - nif:beginIndex "6329"^^xsd:nonNegativeInteger ; - nif:endIndex "6335"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Authority" ; - nif:beginIndex "8361"^^xsd:nonNegativeInteger ; - nif:endIndex "8370"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9219"^^xsd:nonNegativeInteger ; - nif:endIndex "9222"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11348"^^xsd:nonNegativeInteger ; - nif:endIndex "11349"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "6397"^^xsd:nonNegativeInteger ; - nif:endIndex "6401"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "6434"^^xsd:nonNegativeInteger ; - nif:endIndex "6436"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "manner" ; - nif:beginIndex "3472"^^xsd:nonNegativeInteger ; - nif:endIndex "3478"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Hartsfield" ; - nif:beginIndex "4400"^^xsd:nonNegativeInteger ; - nif:endIndex "4410"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "12242"^^xsd:nonNegativeInteger ; - nif:endIndex "12244"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Miller" ; - nif:beginIndex "10697"^^xsd:nonNegativeInteger ; - nif:endIndex "10703"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "not" ; - nif:beginIndex "4875"^^xsd:nonNegativeInteger ; - nif:endIndex "4878"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "charge" ; - nif:beginIndex "257"^^xsd:nonNegativeInteger ; - nif:endIndex "263"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Davis" ; - nif:beginIndex "11764"^^xsd:nonNegativeInteger ; - nif:endIndex "11769"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "himself" ; - nif:beginIndex "11610"^^xsd:nonNegativeInteger ; - nif:endIndex "11617"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "city" ; - nif:beginIndex "1572"^^xsd:nonNegativeInteger ; - nif:endIndex "1576"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "11375"^^xsd:nonNegativeInteger ; - nif:endIndex "11377"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The new school superintendent is Harry Davis, a veteran agriculture teacher, who defeated Felix Bush, a school principal and chairman of the Miller County Democratic Executive Committee." ; - nif:beginIndex "10831"^^xsd:nonNegativeInteger ; - nif:endIndex "11017"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "3484"^^xsd:nonNegativeInteger ; - nif:endIndex "3488"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "official" ; - nif:beginIndex "5527"^^xsd:nonNegativeInteger ; - nif:endIndex "5535"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "no" ; - nif:beginIndex "106"^^xsd:nonNegativeInteger ; - nif:endIndex "108"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "state" ; - nif:beginIndex "6376"^^xsd:nonNegativeInteger ; - nif:endIndex "6381"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "purpose" ; - nif:beginIndex "3665"^^xsd:nonNegativeInteger ; - nif:endIndex "3672"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Georgia Republicans are getting strong encouragement to enter a candidate in the 1962 governor's race, a top official said Wednesday." ; - nif:beginIndex "5418"^^xsd:nonNegativeInteger ; - nif:endIndex "5551"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "encouragement" ; - nif:beginIndex "5457"^^xsd:nonNegativeInteger ; - nif:endIndex "5470"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "6374"^^xsd:nonNegativeInteger ; - nif:endIndex "6375"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "11204"^^xsd:nonNegativeInteger ; - nif:endIndex "11212"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "3749"^^xsd:nonNegativeInteger ; - nif:endIndex "3753"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "(" ; - nif:beginIndex "3772"^^xsd:nonNegativeInteger ; - nif:endIndex "3773"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "A" ; - nif:beginIndex "8286"^^xsd:nonNegativeInteger ; - nif:endIndex "8287"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "September-October" ; - nif:beginIndex "395"^^xsd:nonNegativeInteger ; - nif:endIndex "412"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "paid" ; - nif:beginIndex "8467"^^xsd:nonNegativeInteger ; - nif:endIndex "8471"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "petitions" ; - nif:beginIndex "5999"^^xsd:nonNegativeInteger ; - nif:endIndex "6008"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Context ; - nif:beginIndex "0"^^xsd:nonNegativeInteger ; - nif:endIndex "12246"^^xsd:nonNegativeInteger ; - nif:isString "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced ``no evidence'' that any irregularities took place.\nThe jury further said in term-end presentments that the City Executive Committee, which had over-all charge of the election, ``deserves the praise and thanks of the City of Atlanta'' for the manner in which the election was conducted.\nThe September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible ``irregularities'' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr..\n``Only a relative handful of such reports was received'', the jury said, ``considering the widespread interest in the election, the number of voters and the size of this city''.\nThe jury said it did find that many of Georgia's registration and election laws ``are outmoded or inadequate and often ambiguous''.\nIt recommended that Fulton legislators act ``to have these laws studied and revised to the end of modernizing and improving them''.\nThe grand jury commented on a number of other topics, among them the Atlanta and Fulton County purchasing departments which it said ``are well operated and follow generally accepted practices which inure to the best interest of both governments''.\nMerger proposed\nHowever, the jury said it believes ``these two offices should be combined to achieve greater efficiency and reduce the cost of administration''.\nThe City Purchasing Department, the jury said, ``is lacking in experienced clerical personnel as a result of city personnel policies''.\nIt urged that the city ``take steps to remedy'' this problem.\nImplementation of Georgia's automobile title law was also recommended by the outgoing jury.\nIt urged that the next Legislature ``provide enabling funds and re-set the effective date so that an orderly implementation of the law may be effected''.\nThe grand jury took a swipe at the State Welfare Department's handling of federal funds granted for child welfare services in foster homes.\n``This is one of the major items in the Fulton County general assistance program'', the jury said, but the State Welfare Department ``has seen fit to distribute these funds through the welfare departments of all the counties in the state with the exception of Fulton County, which receives none of this money.\nThe jurors said they realize ``a proportionate distribution of these funds might disable this program in our less populous counties''.\nNevertheless, ``we feel that in the future Fulton County should receive some portion of these available funds'', the jurors said.\n``Failure to do this will continue to place a disproportionate burden'' on Fulton taxpayers.\nThe jury also commented on the Fulton ordinary's court which has been under fire for its practices in the appointment of appraisers, guardians and administrators and the awarding of fees and compensation.\nWards protected\nThe jury said it found the court ``has incorporated into its operating procedures the recommendations'' of two previous grand juries, the Atlanta Bar Association and an interim citizens committee.\n``These actions should serve to protect in fact and in effect the court's wards from undue costs and its appointed and elected servants from unmeritorious criticisms'', the jury said.\nRegarding Atlanta's new multi-million-dollar airport, the jury recommended ``that when the new management takes charge Jan. 1 the airport be operated in a manner that will eliminate political influences''.\nThe jury did not elaborate, but it added that ``there should be periodic surveillance of the pricing practices of the concessionaires for the purpose of keeping the prices reasonable''.\nAsk jail deputies\nOn other matters, the jury recommended that: (1)\nFour additional deputies be employed at the Fulton County Jail and ``a doctor, medical intern or extern be employed for night and weekend duty at the jail''.\n(2)\nFulton legislators ``work with city officials to pass enabling legislation that will permit the establishment of a fair and equitable'' pension plan for city employes.\nThe jury praised the administration and operation of the Atlanta Police Department, the Fulton Tax Commissioner's Office, the Bellwood and Alpharetta prison farms, Grady Hospital and the Fulton Health Department.\nMayor William B. Hartsfield filed suit for divorce from his wife, Pearl Williams Hartsfield, in Fulton Superior Court Friday.\nHis petition charged mental cruelty.\nThe couple was married Aug. 2, 1913.\nThey have a son, William Berry Jr., and a daughter, Mrs. J. M. Cheshire of Griffin.\nAttorneys for the mayor said that an amicable property settlement has been agreed upon.\nThe petition listed the mayor's occupation as ``attorney'' and his age as 71.\nIt listed his wife's age as 74 and place of birth as Opelika, Ala..\nThe petition said that the couple has not lived together as man and wife for more than a year.\nThe Hartsfield home is at 637 E. Pelham Rd. Aj.\nHenry L. Bowden was listed on the petition as the mayor's attorney.\nHartsfield has been mayor of Atlanta, with exception of one brief interlude, since 1937.\nHis political career goes back to his election to city council in 1923.\nThe mayor's present term of office expires Jan. 1.\nHe will be succeeded by Ivan Allen Jr., who became a candidate in the Sept. 13 primary after Mayor Hartsfield announced that he would not run for reelection.\nGeorgia Republicans are getting strong encouragement to enter a candidate in the 1962 governor's race, a top official said Wednesday.\nRobert Snodgrass, state GOP chairman, said a meeting held Tuesday night in Blue Ridge brought enthusiastic responses from the audience.\nState Party Chairman James W. Dorsey added that enthusiasm was picking up for a state rally to be held Sept. 8 in Savannah at which newly elected Texas Sen. John Tower will be the featured speaker.\nIn the Blue Ridge meeting, the audience was warned that entering a candidate for governor would force it to take petitions out into voting precincts to obtain the signatures of registered voters.\nDespite the warning, there was a unanimous vote to enter a candidate, according to Republicans who attended.\nWhen the crowd was asked whether it wanted to wait one more term to make the race, it voted no -- and there were no dissents.\nThe largest hurdle the Republicans would have to face is a state law which says that before making a first race, one of two alternative courses must be taken: 1\nFive per cent of the voters in each county must sign petitions requesting that the Republicans be allowed to place names of candidates on the general election ballot, or 2\nThe Republicans must hold a primary under the county unit system -- a system which the party opposes in its platform.\nSam Caldwell, State Highway Department public relations director, resigned Tuesday to work for Lt. Gov. Garland Byrd's campaign.\nCaldwell's resignation had been expected for some time.\nHe will be succeeded by Rob Ledford of Gainesville, who has been an assistant more than three years.\nWhen the gubernatorial campaign starts, Caldwell is expected to become a campaign coordinator for Byrd.\nThe Georgia Legislature will wind up its 1961 session Monday and head for home -- where some of the highway bond money it approved will follow shortly.\nBefore adjournment Monday afternoon, the Senate is expected to approve a study of the number of legislators allotted to rural and urban areas to determine what adjustments should be made.\nGov. Vandiver is expected to make the traditional visit to both chambers as they work toward adjournment.\nVandiver likely will mention the $100 million highway bond issue approved earlier in the session as his first priority item.\nConstruction bonds\nMeanwhile, it was learned the State Highway Department is very near being ready to issue the first $30 million worth of highway reconstruction bonds.\nThe bond issue will go to the state courts for a friendly test suit to test the validity of the act, and then the sales will begin and contracts let for repair work on some of Georgia's most heavily traveled highways.\nA Highway Department source said there also is a plan there to issue some $3 million to $4 million worth of Rural Roads Authority bonds for rural road construction work.\nA revolving fund\nThe department apparently intends to make the Rural Roads Authority a revolving fund under which new bonds would be issued every time a portion of the old ones are paid off by tax authorities.\nVandiver opened his race for governor in 1958 with a battle in the Legislature against the issuance of $50 million worth of additional rural roads bonds proposed by then Gov. Marvin Griffin.\nThe Highway Department source told The Constitution, however, that Vandiver has not been consulted yet about the plans to issue the new rural roads bonds.\nSchley County Rep. B. D. Pelham will offer a resolution Monday in the House to rescind the body's action of Friday in voting itself a $10 per day increase in expense allowances.\nPelham said Sunday night there was research being done on whether the ``quickie'' vote on the increase can be repealed outright or whether notice would have to first be given that reconsideration of the action would be sought.\nWhile emphasizing that technical details were not fully worked out, Pelham said his resolution would seek to set aside the privilege resolution which the House voted through 87-31.\nA similar resolution passed in the Senate by a vote of 29-5.\nAs of Sunday night, there was no word of a resolution being offered there to rescind the action.\nPelham pointed out that Georgia voters last November rejected a constitutional amendment to allow legislators to vote on pay raises for future Legislature sessions.\nA veteran Jackson County legislator will ask the Georgia House Monday to back federal aid to education, something it has consistently opposed in the past.\nRep. Mac Barber of Commerce is asking the House in a privilege resolution to ``endorse increased federal support for public education, provided that such funds be received and expended'' as state funds.\nBarber, who is in his 13th year as a legislator, said there ``are some members of our congressional delegation in Washington who would like to see it (the resolution) passed''.\nBut he added that none of Georgia's congressmen specifically asked him to offer the resolution.\nThe resolution, which Barber tossed into the House hopper Friday, will be formally read Monday.\nIt says that ``in the event Congress does provide this increase in federal funds'', the State Board of Education should be directed to ``give priority'' to teacher pay raises.\nColquitt\n-- After a long, hot controversy, Miller County has a new school superintendent, elected, as a policeman put it, in the ``coolest election I ever saw in this county''.\nThe new school superintendent is Harry Davis, a veteran agriculture teacher, who defeated Felix Bush, a school principal and chairman of the Miller County Democratic Executive Committee.\nDavis received 1,119 votes in Saturday's election, and Bush got 402.\nOrdinary Carey Williams, armed with a pistol, stood by at the polls to insure order.\n``This was the coolest, calmest election I ever saw'', Colquitt Policeman Tom Williams said.\n``Being at the polls was just like being at church.\nI didn't smell a drop of liquor, and we didn't have a bit of trouble''.\nThe campaign leading to the election was not so quiet, however.\nIt was marked by controversy, anonymous midnight phone calls and veiled threats of violence.\nThe former county school superintendent, George P. Callan, shot himself to death March 18, four days after he resigned his post in a dispute with the county school board.\nDuring the election campaign, both candidates, Davis and Bush, reportedly received anonymous telephone calls.\nOrdinary Williams said he, too, was subjected to anonymous calls soon after he scheduled the election.\nMany local citizens feared that there would be irregularities at the polls, and Williams got himself a permit to carry a gun and promised an orderly election.\nSheriff Felix Tabb said the ordinary apparently made good his promise.\n``Everything went real smooth'', the sheriff said.\n``There wasn't a bit of trouble''.\n" . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4601"^^xsd:nonNegativeInteger ; - nif:endIndex "4602"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "work" ; - nif:beginIndex "6854"^^xsd:nonNegativeInteger ; - nif:endIndex "6858"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "city" ; - nif:beginIndex "782"^^xsd:nonNegativeInteger ; - nif:endIndex "786"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "go" ; - nif:beginIndex "7918"^^xsd:nonNegativeInteger ; - nif:endIndex "7920"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "8303"^^xsd:nonNegativeInteger ; - nif:endIndex "8306"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "State" ; - nif:beginIndex "6782"^^xsd:nonNegativeInteger ; - nif:endIndex "6787"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "rural" ; - nif:beginIndex "8823"^^xsd:nonNegativeInteger ; - nif:endIndex "8828"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "1504"^^xsd:nonNegativeInteger ; - nif:endIndex "1508"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "11369"^^xsd:nonNegativeInteger ; - nif:endIndex "11370"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "county" ; - nif:beginIndex "10821"^^xsd:nonNegativeInteger ; - nif:endIndex "10827"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Mayor" ; - nif:beginIndex "5353"^^xsd:nonNegativeInteger ; - nif:endIndex "5358"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "achieve" ; - nif:beginIndex "1395"^^xsd:nonNegativeInteger ; - nif:endIndex "1402"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "servants" ; - nif:beginIndex "3260"^^xsd:nonNegativeInteger ; - nif:endIndex "3268"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10560"^^xsd:nonNegativeInteger ; - nif:endIndex "10561"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "hopper" ; - nif:beginIndex "10433"^^xsd:nonNegativeInteger ; - nif:endIndex "10439"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "according" ; - nif:beginIndex "6152"^^xsd:nonNegativeInteger ; - nif:endIndex "6161"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "6989"^^xsd:nonNegativeInteger ; - nif:endIndex "6991"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "7496"^^xsd:nonNegativeInteger ; - nif:endIndex "7497"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "force" ; - nif:beginIndex "5982"^^xsd:nonNegativeInteger ; - nif:endIndex "5987"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "very" ; - nif:beginIndex "7806"^^xsd:nonNegativeInteger ; - nif:endIndex "7810"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "night" ; - nif:beginIndex "5618"^^xsd:nonNegativeInteger ; - nif:endIndex "5623"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "11286"^^xsd:nonNegativeInteger ; - nif:endIndex "11289"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "best" ; - nif:beginIndex "1265"^^xsd:nonNegativeInteger ; - nif:endIndex "1269"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "4747"^^xsd:nonNegativeInteger ; - nif:endIndex "4749"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "6234"^^xsd:nonNegativeInteger ; - nif:endIndex "6236"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "10756"^^xsd:nonNegativeInteger ; - nif:endIndex "10757"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "86"^^xsd:nonNegativeInteger ; - nif:endIndex "94"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2131"^^xsd:nonNegativeInteger ; - nif:endIndex "2134"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "10783"^^xsd:nonNegativeInteger ; - nif:endIndex "10785"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "2902"^^xsd:nonNegativeInteger ; - nif:endIndex "2905"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "recent" ; - nif:beginIndex "71"^^xsd:nonNegativeInteger ; - nif:endIndex "77"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "6317"^^xsd:nonNegativeInteger ; - nif:endIndex "6320"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "not" ; - nif:beginIndex "11430"^^xsd:nonNegativeInteger ; - nif:endIndex "11433"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "12205"^^xsd:nonNegativeInteger ; - nif:endIndex "12209"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "presentments" ; - nif:beginIndex "190"^^xsd:nonNegativeInteger ; - nif:endIndex "202"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "House" ; - nif:beginIndex "10427"^^xsd:nonNegativeInteger ; - nif:endIndex "10432"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "8747"^^xsd:nonNegativeInteger ; - nif:endIndex "8748"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "3957"^^xsd:nonNegativeInteger ; - nif:endIndex "3959"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "He will be succeeded by Ivan Allen Jr., who became a candidate in the Sept. 13 primary after Mayor Hartsfield announced that he would not run for reelection." ; - nif:beginIndex "5260"^^xsd:nonNegativeInteger ; - nif:endIndex "5417"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6616"^^xsd:nonNegativeInteger ; - nif:endIndex "6619"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "not" ; - nif:beginIndex "9293"^^xsd:nonNegativeInteger ; - nif:endIndex "9296"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "12113"^^xsd:nonNegativeInteger ; - nif:endIndex "12116"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Tom" ; - nif:beginIndex "11246"^^xsd:nonNegativeInteger ; - nif:endIndex "11249"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "4632"^^xsd:nonNegativeInteger ; - nif:endIndex "4636"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11170"^^xsd:nonNegativeInteger ; - nif:endIndex "11171"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "juries" ; - nif:beginIndex "3062"^^xsd:nonNegativeInteger ; - nif:endIndex "3068"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "5926"^^xsd:nonNegativeInteger ; - nif:endIndex "5929"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9397"^^xsd:nonNegativeInteger ; - nif:endIndex "9400"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "1560"^^xsd:nonNegativeInteger ; - nif:endIndex "1561"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "4110"^^xsd:nonNegativeInteger ; - nif:endIndex "4114"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "new" ; - nif:beginIndex "10835"^^xsd:nonNegativeInteger ; - nif:endIndex "10838"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "6832"^^xsd:nonNegativeInteger ; - nif:endIndex "6833"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "160"^^xsd:nonNegativeInteger ; - nif:endIndex "164"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "give" ; - nif:beginIndex "10615"^^xsd:nonNegativeInteger ; - nif:endIndex "10619"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "party" ; - nif:beginIndex "6737"^^xsd:nonNegativeInteger ; - nif:endIndex "6742"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "do" ; - nif:beginIndex "2635"^^xsd:nonNegativeInteger ; - nif:endIndex "2637"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9086"^^xsd:nonNegativeInteger ; - nif:endIndex "9089"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "376"^^xsd:nonNegativeInteger ; - nif:endIndex "379"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "school" ; - nif:beginIndex "11703"^^xsd:nonNegativeInteger ; - nif:endIndex "11709"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "these" ; - nif:beginIndex "1355"^^xsd:nonNegativeInteger ; - nif:endIndex "1360"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "5814"^^xsd:nonNegativeInteger ; - nif:endIndex "5819"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "scheduled" ; - nif:beginIndex "11906"^^xsd:nonNegativeInteger ; - nif:endIndex "11915"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "4837"^^xsd:nonNegativeInteger ; - nif:endIndex "4840"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11442"^^xsd:nonNegativeInteger ; - nif:endIndex "11443"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "10169"^^xsd:nonNegativeInteger ; - nif:endIndex "10171"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4930"^^xsd:nonNegativeInteger ; - nif:endIndex "4931"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "one" ; - nif:beginIndex "2057"^^xsd:nonNegativeInteger ; - nif:endIndex "2060"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "education" ; - nif:beginIndex "9844"^^xsd:nonNegativeInteger ; - nif:endIndex "9853"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11544"^^xsd:nonNegativeInteger ; - nif:endIndex "11545"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "from" ; - nif:beginIndex "3213"^^xsd:nonNegativeInteger ; - nif:endIndex "3217"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "requesting" ; - nif:beginIndex "6541"^^xsd:nonNegativeInteger ; - nif:endIndex "6551"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "feared" ; - nif:beginIndex "11950"^^xsd:nonNegativeInteger ; - nif:endIndex "11956"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "both" ; - nif:beginIndex "11747"^^xsd:nonNegativeInteger ; - nif:endIndex "11751"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "new" ; - nif:beginIndex "3408"^^xsd:nonNegativeInteger ; - nif:endIndex "3411"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4711"^^xsd:nonNegativeInteger ; - nif:endIndex "4714"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "5234"^^xsd:nonNegativeInteger ; - nif:endIndex "5236"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "3892"^^xsd:nonNegativeInteger ; - nif:endIndex "3895"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "term" ; - nif:beginIndex "413"^^xsd:nonNegativeInteger ; - nif:endIndex "417"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "8029"^^xsd:nonNegativeInteger ; - nif:endIndex "8032"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "10382"^^xsd:nonNegativeInteger ; - nif:endIndex "10385"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "operating" ; - nif:beginIndex "2997"^^xsd:nonNegativeInteger ; - nif:endIndex "3006"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "welfare" ; - nif:beginIndex "2232"^^xsd:nonNegativeInteger ; - nif:endIndex "2239"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bond" ; - nif:beginIndex "7902"^^xsd:nonNegativeInteger ; - nif:endIndex "7906"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "8997"^^xsd:nonNegativeInteger ; - nif:endIndex "8999"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "4375"^^xsd:nonNegativeInteger ; - nif:endIndex "4378"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "increase" ; - nif:beginIndex "10533"^^xsd:nonNegativeInteger ; - nif:endIndex "10541"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "apparently" ; - nif:beginIndex "12126"^^xsd:nonNegativeInteger ; - nif:endIndex "12136"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "10772"^^xsd:nonNegativeInteger ; - nif:endIndex "10774"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "2945"^^xsd:nonNegativeInteger ; - nif:endIndex "2949"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "8437"^^xsd:nonNegativeInteger ; - nif:endIndex "8438"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "10264"^^xsd:nonNegativeInteger ; - nif:endIndex "10274"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "traveled" ; - nif:beginIndex "8097"^^xsd:nonNegativeInteger ; - nif:endIndex "8105"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "4048"^^xsd:nonNegativeInteger ; - nif:endIndex "4050"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Ordinary Carey Williams, armed with a pistol, stood by at the polls to insure order." ; - nif:beginIndex "11087"^^xsd:nonNegativeInteger ; - nif:endIndex "11171"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "6428"^^xsd:nonNegativeInteger ; - nif:endIndex "6429"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1880"^^xsd:nonNegativeInteger ; - nif:endIndex "1883"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "event" ; - nif:beginIndex "10500"^^xsd:nonNegativeInteger ; - nif:endIndex "10505"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "establishment" ; - nif:beginIndex "4034"^^xsd:nonNegativeInteger ; - nif:endIndex "4047"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "farms" ; - nif:beginIndex "4263"^^xsd:nonNegativeInteger ; - nif:endIndex "4268"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Pelham pointed out that Georgia voters last November rejected a constitutional amendment to allow legislators to vote on pay raises for future Legislature sessions." ; - nif:beginIndex "9586"^^xsd:nonNegativeInteger ; - nif:endIndex "9750"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "1978"^^xsd:nonNegativeInteger ; - nif:endIndex "1980"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Rep. Mac Barber of Commerce is asking the House in a privilege resolution to ``endorse increased federal support for public education, provided that such funds be received and expended'' as state funds." ; - nif:beginIndex "9906"^^xsd:nonNegativeInteger ; - nif:endIndex "10108"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "offered" ; - nif:beginIndex "9549"^^xsd:nonNegativeInteger ; - nif:endIndex "9556"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "legislators" ; - nif:beginIndex "7406"^^xsd:nonNegativeInteger ; - nif:endIndex "7417"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Aj" ; - nif:beginIndex "4976"^^xsd:nonNegativeInteger ; - nif:endIndex "4978"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "session" ; - nif:beginIndex "7693"^^xsd:nonNegativeInteger ; - nif:endIndex "7700"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Senate" ; - nif:beginIndex "9463"^^xsd:nonNegativeInteger ; - nif:endIndex "9469"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "1006"^^xsd:nonNegativeInteger ; - nif:endIndex "1008"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "brief" ; - nif:beginIndex "5108"^^xsd:nonNegativeInteger ; - nif:endIndex "5113"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Pelham said Sunday night there was research being done on whether the ``quickie'' vote on the increase can be repealed outright or whether notice would have to first be given that reconsideration of the action would be sought." ; - nif:beginIndex "9020"^^xsd:nonNegativeInteger ; - nif:endIndex "9246"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "veteran" ; - nif:beginIndex "9753"^^xsd:nonNegativeInteger ; - nif:endIndex "9760"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "2504"^^xsd:nonNegativeInteger ; - nif:endIndex "2505"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "vote" ; - nif:beginIndex "9475"^^xsd:nonNegativeInteger ; - nif:endIndex "9479"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "privilege" ; - nif:beginIndex "9959"^^xsd:nonNegativeInteger ; - nif:endIndex "9968"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "voters" ; - nif:beginIndex "9618"^^xsd:nonNegativeInteger ; - nif:endIndex "9624"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "7125"^^xsd:nonNegativeInteger ; - nif:endIndex "7126"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "took" ; - nif:beginIndex "144"^^xsd:nonNegativeInteger ; - nif:endIndex "148"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "6256"^^xsd:nonNegativeInteger ; - nif:endIndex "6258"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "this" ; - nif:beginIndex "2446"^^xsd:nonNegativeInteger ; - nif:endIndex "2450"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "candidates" ; - nif:beginIndex "11752"^^xsd:nonNegativeInteger ; - nif:endIndex "11762"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Office" ; - nif:beginIndex "4220"^^xsd:nonNegativeInteger ; - nif:endIndex "4226"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "studied" ; - nif:beginIndex "986"^^xsd:nonNegativeInteger ; - nif:endIndex "993"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "3919"^^xsd:nonNegativeInteger ; - nif:endIndex "3921"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "11974"^^xsd:nonNegativeInteger ; - nif:endIndex "11976"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "6363"^^xsd:nonNegativeInteger ; - nif:endIndex "6365"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "tossed" ; - nif:beginIndex "10411"^^xsd:nonNegativeInteger ; - nif:endIndex "10417"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "pricing" ; - nif:beginIndex "3616"^^xsd:nonNegativeInteger ; - nif:endIndex "3623"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "interest" ; - nif:beginIndex "1270"^^xsd:nonNegativeInteger ; - nif:endIndex "1278"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "29"^^xsd:nonNegativeInteger ; - nif:endIndex "33"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "practices" ; - nif:beginIndex "2804"^^xsd:nonNegativeInteger ; - nif:endIndex "2813"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Court" ; - nif:beginIndex "459"^^xsd:nonNegativeInteger ; - nif:endIndex "464"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The department apparently intends to make the Rural Roads Authority a revolving fund under which new bonds would be issued every time a portion of the old ones are paid off by tax authorities." ; - nif:beginIndex "8303"^^xsd:nonNegativeInteger ; - nif:endIndex "8495"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Sentence ; - nif:anchorOf "Sheriff Felix Tabb said the ordinary apparently made good his promise." ; - nif:beginIndex "12089"^^xsd:nonNegativeInteger ; - nif:endIndex "12159"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "Georgia" ; - nif:beginIndex "5418"^^xsd:nonNegativeInteger ; - nif:endIndex "5425"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "11172"^^xsd:nonNegativeInteger ; - nif:endIndex "11174"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "recommendations" ; - nif:beginIndex "3022"^^xsd:nonNegativeInteger ; - nif:endIndex "3037"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "toward" ; - nif:beginIndex "7584"^^xsd:nonNegativeInteger ; - nif:endIndex "7590"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Senate" ; - nif:beginIndex "7351"^^xsd:nonNegativeInteger ; - nif:endIndex "7357"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "top" ; - nif:beginIndex "5523"^^xsd:nonNegativeInteger ; - nif:endIndex "5526"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "not" ; - nif:beginIndex "5394"^^xsd:nonNegativeInteger ; - nif:endIndex "5397"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10695"^^xsd:nonNegativeInteger ; - nif:endIndex "10696"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "with" ; - nif:beginIndex "3964"^^xsd:nonNegativeInteger ; - nif:endIndex "3968"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "10965"^^xsd:nonNegativeInteger ; - nif:endIndex "10967"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "than" ; - nif:beginIndex "4919"^^xsd:nonNegativeInteger ; - nif:endIndex "4923"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "taxpayers" ; - nif:beginIndex "2704"^^xsd:nonNegativeInteger ; - nif:endIndex "2713"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "equitable" ; - nif:beginIndex "4062"^^xsd:nonNegativeInteger ; - nif:endIndex "4071"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Ordinary Williams said he, too, was subjected to anonymous calls soon after he scheduled the election." ; - nif:beginIndex "11827"^^xsd:nonNegativeInteger ; - nif:endIndex "11929"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "Harry" ; - nif:beginIndex "10864"^^xsd:nonNegativeInteger ; - nif:endIndex "10869"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "offices" ; - nif:beginIndex "1365"^^xsd:nonNegativeInteger ; - nif:endIndex "1372"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "issuance" ; - nif:beginIndex "8587"^^xsd:nonNegativeInteger ; - nif:endIndex "8595"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bonds" ; - nif:beginIndex "8246"^^xsd:nonNegativeInteger ; - nif:endIndex "8251"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "On other matters, the jury recommended that: (1)" ; - nif:beginIndex "3727"^^xsd:nonNegativeInteger ; - nif:endIndex "3775"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "2657"^^xsd:nonNegativeInteger ; - nif:endIndex "2659"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3637"^^xsd:nonNegativeInteger ; - nif:endIndex "3640"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3404"^^xsd:nonNegativeInteger ; - nif:endIndex "3407"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "after" ; - nif:beginIndex "11647"^^xsd:nonNegativeInteger ; - nif:endIndex "11652"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "156"^^xsd:nonNegativeInteger ; - nif:endIndex "159"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "pension" ; - nif:beginIndex "4074"^^xsd:nonNegativeInteger ; - nif:endIndex "4081"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Tax" ; - nif:beginIndex "4201"^^xsd:nonNegativeInteger ; - nif:endIndex "4204"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "761"^^xsd:nonNegativeInteger ; - nif:endIndex "764"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "agreed" ; - nif:beginIndex "4678"^^xsd:nonNegativeInteger ; - nif:endIndex "4684"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "areas" ; - nif:beginIndex "7446"^^xsd:nonNegativeInteger ; - nif:endIndex "7451"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "county" ; - nif:beginIndex "11696"^^xsd:nonNegativeInteger ; - nif:endIndex "11702"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "run" ; - nif:beginIndex "5398"^^xsd:nonNegativeInteger ; - nif:endIndex "5401"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible ``irregularities'' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr.." ; - nif:beginIndex "391"^^xsd:nonNegativeInteger ; - nif:endIndex "611"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "3609"^^xsd:nonNegativeInteger ; - nif:endIndex "3611"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Nevertheless, ``we feel that in the future Fulton County should receive some portion of these available funds'', the jurors said." ; - nif:beginIndex "2492"^^xsd:nonNegativeInteger ; - nif:endIndex "2621"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "10704"^^xsd:nonNegativeInteger ; - nif:endIndex "10710"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "10249"^^xsd:nonNegativeInteger ; - nif:endIndex "10251"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Sam" ; - nif:beginIndex "6768"^^xsd:nonNegativeInteger ; - nif:endIndex "6771"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "candidate" ; - nif:beginIndex "5313"^^xsd:nonNegativeInteger ; - nif:endIndex "5322"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "year" ; - nif:beginIndex "4926"^^xsd:nonNegativeInteger ; - nif:endIndex "4930"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "1442"^^xsd:nonNegativeInteger ; - nif:endIndex "1444"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "endorse" ; - nif:beginIndex "9985"^^xsd:nonNegativeInteger ; - nif:endIndex "9992"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "343"^^xsd:nonNegativeInteger ; - nif:endIndex "346"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "these" ; - nif:beginIndex "2580"^^xsd:nonNegativeInteger ; - nif:endIndex "2585"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "8201"^^xsd:nonNegativeInteger ; - nif:endIndex "8203"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "mayor" ; - nif:beginIndex "4621"^^xsd:nonNegativeInteger ; - nif:endIndex "4626"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "3549"^^xsd:nonNegativeInteger ; - nif:endIndex "3550"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "7289"^^xsd:nonNegativeInteger ; - nif:endIndex "7293"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "have" ; - nif:beginIndex "9172"^^xsd:nonNegativeInteger ; - nif:endIndex "9176"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "audience" ; - nif:beginIndex "5678"^^xsd:nonNegativeInteger ; - nif:endIndex "5686"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "should" ; - nif:beginIndex "3149"^^xsd:nonNegativeInteger ; - nif:endIndex "3155"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "new" ; - nif:beginIndex "8819"^^xsd:nonNegativeInteger ; - nif:endIndex "8822"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "10158"^^xsd:nonNegativeInteger ; - nif:endIndex "10162"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "opposed" ; - nif:beginIndex "9885"^^xsd:nonNegativeInteger ; - nif:endIndex "9892"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "listed" ; - nif:beginIndex "4704"^^xsd:nonNegativeInteger ; - nif:endIndex "4710"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "enabling" ; - nif:beginIndex "3992"^^xsd:nonNegativeInteger ; - nif:endIndex "4000"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The campaign leading to the election was not so quiet, however." ; - nif:beginIndex "11389"^^xsd:nonNegativeInteger ; - nif:endIndex "11452"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "9216"^^xsd:nonNegativeInteger ; - nif:endIndex "9218"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "6060"^^xsd:nonNegativeInteger ; - nif:endIndex "6062"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "undue" ; - nif:beginIndex "3218"^^xsd:nonNegativeInteger ; - nif:endIndex "3223"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1824"^^xsd:nonNegativeInteger ; - nif:endIndex "1827"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "8687"^^xsd:nonNegativeInteger ; - nif:endIndex "8690"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "old" ; - nif:beginIndex "8454"^^xsd:nonNegativeInteger ; - nif:endIndex "8457"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "agriculture" ; - nif:beginIndex "10887"^^xsd:nonNegativeInteger ; - nif:endIndex "10898"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "learned" ; - nif:beginIndex "7766"^^xsd:nonNegativeInteger ; - nif:endIndex "7773"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "outmoded" ; - nif:beginIndex "876"^^xsd:nonNegativeInteger ; - nif:endIndex "884"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "this" ; - nif:beginIndex "2345"^^xsd:nonNegativeInteger ; - nif:endIndex "2349"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "11045"^^xsd:nonNegativeInteger ; - nif:endIndex "11047"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "signatures" ; - nif:beginIndex "6049"^^xsd:nonNegativeInteger ; - nif:endIndex "6059"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Atlanta" ; - nif:beginIndex "329"^^xsd:nonNegativeInteger ; - nif:endIndex "336"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "making" ; - nif:beginIndex "6409"^^xsd:nonNegativeInteger ; - nif:endIndex "6415"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "issue" ; - nif:beginIndex "7907"^^xsd:nonNegativeInteger ; - nif:endIndex "7912"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "expected" ; - nif:beginIndex "6929"^^xsd:nonNegativeInteger ; - nif:endIndex "6937"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Hartsfield" ; - nif:beginIndex "5359"^^xsd:nonNegativeInteger ; - nif:endIndex "5369"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "new" ; - nif:beginIndex "3337"^^xsd:nonNegativeInteger ; - nif:endIndex "3340"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "State" ; - nif:beginIndex "7778"^^xsd:nonNegativeInteger ; - nif:endIndex "7783"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "I didn't smell a drop of liquor, and we didn't have a bit of trouble''." ; - nif:beginIndex "11317"^^xsd:nonNegativeInteger ; - nif:endIndex "11388"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "veiled" ; - nif:beginIndex "11518"^^xsd:nonNegativeInteger ; - nif:endIndex "11524"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1958" ; - nif:beginIndex "8537"^^xsd:nonNegativeInteger ; - nif:endIndex "8541"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Bar" ; - nif:beginIndex "3082"^^xsd:nonNegativeInteger ; - nif:endIndex "3085"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Despite" ; - nif:beginIndex "6082"^^xsd:nonNegativeInteger ; - nif:endIndex "6089"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "succeeded" ; - nif:beginIndex "6964"^^xsd:nonNegativeInteger ; - nif:endIndex "6973"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "1903"^^xsd:nonNegativeInteger ; - nif:endIndex "1905"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "together" ; - nif:beginIndex "4885"^^xsd:nonNegativeInteger ; - nif:endIndex "4893"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jail" ; - nif:beginIndex "3926"^^xsd:nonNegativeInteger ; - nif:endIndex "3930"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "been" ; - nif:beginIndex "6924"^^xsd:nonNegativeInteger ; - nif:endIndex "6928"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "3564"^^xsd:nonNegativeInteger ; - nif:endIndex "3568"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "5799"^^xsd:nonNegativeInteger ; - nif:endIndex "5801"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "They" ; - nif:beginIndex "4519"^^xsd:nonNegativeInteger ; - nif:endIndex "4523"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "held" ; - nif:beginIndex "5605"^^xsd:nonNegativeInteger ; - nif:endIndex "5609"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "11123"^^xsd:nonNegativeInteger ; - nif:endIndex "11124"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "1917"^^xsd:nonNegativeInteger ; - nif:endIndex "1921"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "5135"^^xsd:nonNegativeInteger ; - nif:endIndex "5136"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "superintendent" ; - nif:beginIndex "10846"^^xsd:nonNegativeInteger ; - nif:endIndex "10860"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "commented" ; - nif:beginIndex "2729"^^xsd:nonNegativeInteger ; - nif:endIndex "2738"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "expected" ; - nif:beginIndex "7361"^^xsd:nonNegativeInteger ; - nif:endIndex "7369"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "7158"^^xsd:nonNegativeInteger ; - nif:endIndex "7161"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "William" ; - nif:beginIndex "4325"^^xsd:nonNegativeInteger ; - nif:endIndex "4332"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11225"^^xsd:nonNegativeInteger ; - nif:endIndex "11226"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "3705"^^xsd:nonNegativeInteger ; - nif:endIndex "3707"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "2368"^^xsd:nonNegativeInteger ; - nif:endIndex "2372"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "2135"^^xsd:nonNegativeInteger ; - nif:endIndex "2139"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1495"^^xsd:nonNegativeInteger ; - nif:endIndex "1498"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "11"^^xsd:nonNegativeInteger ; - nif:endIndex "17"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "not" ; - nif:beginIndex "3536"^^xsd:nonNegativeInteger ; - nif:endIndex "3539"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The grand jury took a swipe at the State Welfare Department's handling of federal funds granted for child welfare services in foster homes." ; - nif:beginIndex "1907"^^xsd:nonNegativeInteger ; - nif:endIndex "2046"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "appointment" ; - nif:beginIndex "2821"^^xsd:nonNegativeInteger ; - nif:endIndex "2832"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "1082"^^xsd:nonNegativeInteger ; - nif:endIndex "1083"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "9051"^^xsd:nonNegativeInteger ; - nif:endIndex "9054"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "them" ; - nif:beginIndex "1046"^^xsd:nonNegativeInteger ; - nif:endIndex "1050"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The bond issue will go to the state courts for a friendly test suit to test the validity of the act, and then the sales will begin and contracts let for repair work on some of Georgia's most heavily traveled highways." ; - nif:beginIndex "7898"^^xsd:nonNegativeInteger ; - nif:endIndex "8115"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "7759"^^xsd:nonNegativeInteger ; - nif:endIndex "7761"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "filed" ; - nif:beginIndex "4347"^^xsd:nonNegativeInteger ; - nif:endIndex "4352"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "made" ; - nif:beginIndex "7492"^^xsd:nonNegativeInteger ; - nif:endIndex "7496"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Barber, who is in his 13th year as a legislator, said there ``are some members of our congressional delegation in Washington who would like to see it (the resolution) passed''." ; - nif:beginIndex "10109"^^xsd:nonNegativeInteger ; - nif:endIndex "10285"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "9027"^^xsd:nonNegativeInteger ; - nif:endIndex "9031"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9366"^^xsd:nonNegativeInteger ; - nif:endIndex "9369"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "10877"^^xsd:nonNegativeInteger ; - nif:endIndex "10878"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Griffin" ; - nif:beginIndex "8678"^^xsd:nonNegativeInteger ; - nif:endIndex "8685"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "highways" ; - nif:beginIndex "8106"^^xsd:nonNegativeInteger ; - nif:endIndex "8114"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "2047"^^xsd:nonNegativeInteger ; - nif:endIndex "2049"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Policeman" ; - nif:beginIndex "11236"^^xsd:nonNegativeInteger ; - nif:endIndex "11245"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "5624"^^xsd:nonNegativeInteger ; - nif:endIndex "5626"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "one" ; - nif:beginIndex "6242"^^xsd:nonNegativeInteger ; - nif:endIndex "6245"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "teacher" ; - nif:beginIndex "10899"^^xsd:nonNegativeInteger ; - nif:endIndex "10906"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "council" ; - nif:beginIndex "5192"^^xsd:nonNegativeInteger ; - nif:endIndex "5199"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "precincts" ; - nif:beginIndex "6025"^^xsd:nonNegativeInteger ; - nif:endIndex "6034"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "10141"^^xsd:nonNegativeInteger ; - nif:endIndex "10143"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4767"^^xsd:nonNegativeInteger ; - nif:endIndex "4768"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "he" ; - nif:beginIndex "5385"^^xsd:nonNegativeInteger ; - nif:endIndex "5387"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "anonymous" ; - nif:beginIndex "11483"^^xsd:nonNegativeInteger ; - nif:endIndex "11492"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "saw" ; - nif:beginIndex "10809"^^xsd:nonNegativeInteger ; - nif:endIndex "10812"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "10827"^^xsd:nonNegativeInteger ; - nif:endIndex "10829"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "``This is one of the major items in the Fulton County general assistance program'', the jury said, but the State Welfare Department ``has seen fit to distribute these funds through the welfare departments of all the counties in the state with the exception of Fulton County, which receives none of this money." ; - nif:beginIndex "2047"^^xsd:nonNegativeInteger ; - nif:endIndex "2356"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8815"^^xsd:nonNegativeInteger ; - nif:endIndex "8818"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4123"^^xsd:nonNegativeInteger ; - nif:endIndex "4126"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "11728"^^xsd:nonNegativeInteger ; - nif:endIndex "11736"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Welfare" ; - nif:beginIndex "1948"^^xsd:nonNegativeInteger ; - nif:endIndex "1955"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "5492"^^xsd:nonNegativeInteger ; - nif:endIndex "5494"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "legislation" ; - nif:beginIndex "4001"^^xsd:nonNegativeInteger ; - nif:endIndex "4012"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "roads" ; - nif:beginIndex "8829"^^xsd:nonNegativeInteger ; - nif:endIndex "8834"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Mayor-nominate" ; - nif:beginIndex "581"^^xsd:nonNegativeInteger ; - nif:endIndex "595"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "plans" ; - nif:beginIndex "8800"^^xsd:nonNegativeInteger ; - nif:endIndex "8805"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "often" ; - nif:beginIndex "903"^^xsd:nonNegativeInteger ; - nif:endIndex "908"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "present" ; - nif:beginIndex "5221"^^xsd:nonNegativeInteger ; - nif:endIndex "5228"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Durwood" ; - nif:beginIndex "471"^^xsd:nonNegativeInteger ; - nif:endIndex "478"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "8163"^^xsd:nonNegativeInteger ; - nif:endIndex "8164"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "6676"^^xsd:nonNegativeInteger ; - nif:endIndex "6677"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "10050"^^xsd:nonNegativeInteger ; - nif:endIndex "10054"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "2054"^^xsd:nonNegativeInteger ; - nif:endIndex "2056"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "7602"^^xsd:nonNegativeInteger ; - nif:endIndex "7603"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "burden" ; - nif:beginIndex "2685"^^xsd:nonNegativeInteger ; - nif:endIndex "2691"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "5380"^^xsd:nonNegativeInteger ; - nif:endIndex "5384"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7254"^^xsd:nonNegativeInteger ; - nif:endIndex "7257"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "ever" ; - nif:beginIndex "11215"^^xsd:nonNegativeInteger ; - nif:endIndex "11219"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "10598"^^xsd:nonNegativeInteger ; - nif:endIndex "10600"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "10188"^^xsd:nonNegativeInteger ; - nif:endIndex "10190"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "7182"^^xsd:nonNegativeInteger ; - nif:endIndex "7186"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "year" ; - nif:beginIndex "10136"^^xsd:nonNegativeInteger ; - nif:endIndex "10140"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The former county school superintendent, George P. Callan, shot himself to death March 18, four days after he resigned his post in a dispute with the county school board." ; - nif:beginIndex "11546"^^xsd:nonNegativeInteger ; - nif:endIndex "11716"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "5175"^^xsd:nonNegativeInteger ; - nif:endIndex "5183"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "4555"^^xsd:nonNegativeInteger ; - nif:endIndex "4558"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "wasn't" ; - nif:beginIndex "12219"^^xsd:nonNegativeInteger ; - nif:endIndex "12225"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "administrators" ; - nif:beginIndex "2862"^^xsd:nonNegativeInteger ; - nif:endIndex "2876"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "federal" ; - nif:beginIndex "9829"^^xsd:nonNegativeInteger ; - nif:endIndex "9836"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "funds" ; - nif:beginIndex "10102"^^xsd:nonNegativeInteger ; - nif:endIndex "10107"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7990"^^xsd:nonNegativeInteger ; - nif:endIndex "7993"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1" ; - nif:beginIndex "5257"^^xsd:nonNegativeInteger ; - nif:endIndex "5258"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8583"^^xsd:nonNegativeInteger ; - nif:endIndex "8586"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "welfare" ; - nif:beginIndex "2013"^^xsd:nonNegativeInteger ; - nif:endIndex "2020"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "expires" ; - nif:beginIndex "5244"^^xsd:nonNegativeInteger ; - nif:endIndex "5251"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "disable" ; - nif:beginIndex "2438"^^xsd:nonNegativeInteger ; - nif:endIndex "2445"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "2061"^^xsd:nonNegativeInteger ; - nif:endIndex "2063"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "9769"^^xsd:nonNegativeInteger ; - nif:endIndex "9775"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "compensation" ; - nif:beginIndex "2906"^^xsd:nonNegativeInteger ; - nif:endIndex "2918"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "House" ; - nif:beginIndex "9948"^^xsd:nonNegativeInteger ; - nif:endIndex "9953"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4030"^^xsd:nonNegativeInteger ; - nif:endIndex "4033"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "place" ; - nif:beginIndex "149"^^xsd:nonNegativeInteger ; - nif:endIndex "154"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "says" ; - nif:beginIndex "6392"^^xsd:nonNegativeInteger ; - nif:endIndex "6396"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Gov." ; - nif:beginIndex "8666"^^xsd:nonNegativeInteger ; - nif:endIndex "8670"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "5519"^^xsd:nonNegativeInteger ; - nif:endIndex "5520"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1" ; - nif:beginIndex "6476"^^xsd:nonNegativeInteger ; - nif:endIndex "6477"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "7865"^^xsd:nonNegativeInteger ; - nif:endIndex "7867"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "3173"^^xsd:nonNegativeInteger ; - nif:endIndex "3175"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "race" ; - nif:beginIndex "6424"^^xsd:nonNegativeInteger ; - nif:endIndex "6428"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4829"^^xsd:nonNegativeInteger ; - nif:endIndex "4830"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "On" ; - nif:beginIndex "3727"^^xsd:nonNegativeInteger ; - nif:endIndex "3729"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "found" ; - nif:beginIndex "2953"^^xsd:nonNegativeInteger ; - nif:endIndex "2958"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Four additional deputies be employed at the Fulton County Jail and ``a doctor, medical intern or extern be employed for night and weekend duty at the jail''." ; - nif:beginIndex "3776"^^xsd:nonNegativeInteger ; - nif:endIndex "3933"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "6352"^^xsd:nonNegativeInteger ; - nif:endIndex "6357"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Garland" ; - nif:beginIndex "6872"^^xsd:nonNegativeInteger ; - nif:endIndex "6879"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "result" ; - nif:beginIndex "1562"^^xsd:nonNegativeInteger ; - nif:endIndex "1568"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "9438"^^xsd:nonNegativeInteger ; - nif:endIndex "9448"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "revolving" ; - nif:beginIndex "8373"^^xsd:nonNegativeInteger ; - nif:endIndex "8382"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Bush" ; - nif:beginIndex "10927"^^xsd:nonNegativeInteger ; - nif:endIndex "10931"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "5674"^^xsd:nonNegativeInteger ; - nif:endIndex "5677"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "court" ; - nif:beginIndex "2963"^^xsd:nonNegativeInteger ; - nif:endIndex "2968"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "5007"^^xsd:nonNegativeInteger ; - nif:endIndex "5009"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "5731"^^xsd:nonNegativeInteger ; - nif:endIndex "5735"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "8512"^^xsd:nonNegativeInteger ; - nif:endIndex "8515"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf ")" ; - nif:beginIndex "3774"^^xsd:nonNegativeInteger ; - nif:endIndex "3775"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "State" ; - nif:beginIndex "5688"^^xsd:nonNegativeInteger ; - nif:endIndex "5693"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Vandiver likely will mention the $100 million highway bond issue approved earlier in the session as his first priority item." ; - nif:beginIndex "7604"^^xsd:nonNegativeInteger ; - nif:endIndex "7728"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "5686"^^xsd:nonNegativeInteger ; - nif:endIndex "5687"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "7727"^^xsd:nonNegativeInteger ; - nif:endIndex "7728"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "controversy" ; - nif:beginIndex "10684"^^xsd:nonNegativeInteger ; - nif:endIndex "10695"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "voting" ; - nif:beginIndex "8960"^^xsd:nonNegativeInteger ; - nif:endIndex "8966"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Atlanta" ; - nif:beginIndex "1123"^^xsd:nonNegativeInteger ; - nif:endIndex "1130"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "were" ; - nif:beginIndex "6299"^^xsd:nonNegativeInteger ; - nif:endIndex "6303"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "537"^^xsd:nonNegativeInteger ; - nif:endIndex "539"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "implementation" ; - nif:beginIndex "1862"^^xsd:nonNegativeInteger ; - nif:endIndex "1876"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "799"^^xsd:nonNegativeInteger ; - nif:endIndex "803"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "State" ; - nif:beginIndex "10566"^^xsd:nonNegativeInteger ; - nif:endIndex "10571"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "12031"^^xsd:nonNegativeInteger ; - nif:endIndex "12032"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Construction bonds" ; - nif:beginIndex "7729"^^xsd:nonNegativeInteger ; - nif:endIndex "7747"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6264"^^xsd:nonNegativeInteger ; - nif:endIndex "6267"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Health" ; - nif:beginIndex "4300"^^xsd:nonNegativeInteger ; - nif:endIndex "4306"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Police" ; - nif:beginIndex "4171"^^xsd:nonNegativeInteger ; - nif:endIndex "4177"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4617"^^xsd:nonNegativeInteger ; - nif:endIndex "4620"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "2307"^^xsd:nonNegativeInteger ; - nif:endIndex "2313"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "314"^^xsd:nonNegativeInteger ; - nif:endIndex "316"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1962" ; - nif:beginIndex "5499"^^xsd:nonNegativeInteger ; - nif:endIndex "5503"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "attorney" ; - nif:beginIndex "4739"^^xsd:nonNegativeInteger ; - nif:endIndex "4747"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "an" ; - nif:beginIndex "3102"^^xsd:nonNegativeInteger ; - nif:endIndex "3104"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "2355"^^xsd:nonNegativeInteger ; - nif:endIndex "2356"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "violence" ; - nif:beginIndex "11536"^^xsd:nonNegativeInteger ; - nif:endIndex "11544"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "trouble" ; - nif:beginIndex "12235"^^xsd:nonNegativeInteger ; - nif:endIndex "12242"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "751"^^xsd:nonNegativeInteger ; - nif:endIndex "753"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "make" ; - nif:beginIndex "8340"^^xsd:nonNegativeInteger ; - nif:endIndex "8344"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "every" ; - nif:beginIndex "8426"^^xsd:nonNegativeInteger ; - nif:endIndex "8431"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "11859"^^xsd:nonNegativeInteger ; - nif:endIndex "11862"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "3634"^^xsd:nonNegativeInteger ; - nif:endIndex "3636"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "ordinary" ; - nif:beginIndex "12117"^^xsd:nonNegativeInteger ; - nif:endIndex "12125"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "court's" ; - nif:beginIndex "3199"^^xsd:nonNegativeInteger ; - nif:endIndex "3206"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "appointed" ; - nif:beginIndex "3238"^^xsd:nonNegativeInteger ; - nif:endIndex "3247"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "million" ; - nif:beginIndex "8207"^^xsd:nonNegativeInteger ; - nif:endIndex "8214"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "7009"^^xsd:nonNegativeInteger ; - nif:endIndex "7012"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "charge" ; - nif:beginIndex "3429"^^xsd:nonNegativeInteger ; - nif:endIndex "3435"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "4529"^^xsd:nonNegativeInteger ; - nif:endIndex "4530"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "2314"^^xsd:nonNegativeInteger ; - nif:endIndex "2320"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "942"^^xsd:nonNegativeInteger ; - nif:endIndex "948"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "irregularities" ; - nif:beginIndex "129"^^xsd:nonNegativeInteger ; - nif:endIndex "143"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "1622"^^xsd:nonNegativeInteger ; - nif:endIndex "1624"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "automobile" ; - nif:beginIndex "1689"^^xsd:nonNegativeInteger ; - nif:endIndex "1699"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "18" ; - nif:beginIndex "11633"^^xsd:nonNegativeInteger ; - nif:endIndex "11635"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "this" ; - nif:beginIndex "10528"^^xsd:nonNegativeInteger ; - nif:endIndex "10532"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "11413"^^xsd:nonNegativeInteger ; - nif:endIndex "11416"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "8806"^^xsd:nonNegativeInteger ; - nif:endIndex "8808"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "management" ; - nif:beginIndex "3412"^^xsd:nonNegativeInteger ; - nif:endIndex "3422"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "7987"^^xsd:nonNegativeInteger ; - nif:endIndex "7989"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Henry L. Bowden was listed on the petition as the mayor's attorney." ; - nif:beginIndex "4980"^^xsd:nonNegativeInteger ; - nif:endIndex "5047"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "13" ; - nif:beginIndex "5336"^^xsd:nonNegativeInteger ; - nif:endIndex "5338"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "funds" ; - nif:beginIndex "1989"^^xsd:nonNegativeInteger ; - nif:endIndex "1994"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "combined" ; - nif:beginIndex "1383"^^xsd:nonNegativeInteger ; - nif:endIndex "1391"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "5388"^^xsd:nonNegativeInteger ; - nif:endIndex "5393"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "practices" ; - nif:beginIndex "1236"^^xsd:nonNegativeInteger ; - nif:endIndex "1245"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "authorities" ; - nif:beginIndex "8483"^^xsd:nonNegativeInteger ; - nif:endIndex "8494"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Many" ; - nif:beginIndex "11930"^^xsd:nonNegativeInteger ; - nif:endIndex "11934"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "P." ; - nif:beginIndex "11594"^^xsd:nonNegativeInteger ; - nif:endIndex "11596"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "coolest" ; - nif:beginIndex "11187"^^xsd:nonNegativeInteger ; - nif:endIndex "11194"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "money" ; - nif:beginIndex "7271"^^xsd:nonNegativeInteger ; - nif:endIndex "7276"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "who" ; - nif:beginIndex "10234"^^xsd:nonNegativeInteger ; - nif:endIndex "10237"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "concessionaires" ; - nif:beginIndex "3641"^^xsd:nonNegativeInteger ; - nif:endIndex "3656"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "10066"^^xsd:nonNegativeInteger ; - nif:endIndex "10068"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "2045"^^xsd:nonNegativeInteger ; - nif:endIndex "2046"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "board" ; - nif:beginIndex "11710"^^xsd:nonNegativeInteger ; - nif:endIndex "11715"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "place" ; - nif:beginIndex "2660"^^xsd:nonNegativeInteger ; - nif:endIndex "2665"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "2746"^^xsd:nonNegativeInteger ; - nif:endIndex "2752"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "this" ; - nif:beginIndex "777"^^xsd:nonNegativeInteger ; - nif:endIndex "781"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "8114"^^xsd:nonNegativeInteger ; - nif:endIndex "8115"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "mayor's" ; - nif:beginIndex "4715"^^xsd:nonNegativeInteger ; - nif:endIndex "4722"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "2080"^^xsd:nonNegativeInteger ; - nif:endIndex "2082"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "first" ; - nif:beginIndex "9180"^^xsd:nonNegativeInteger ; - nif:endIndex "9185"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "1186"^^xsd:nonNegativeInteger ; - nif:endIndex "1188"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "revolving" ; - nif:beginIndex "8288"^^xsd:nonNegativeInteger ; - nif:endIndex "8297"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "exception" ; - nif:beginIndex "2294"^^xsd:nonNegativeInteger ; - nif:endIndex "2303"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "constitutional" ; - nif:beginIndex "9650"^^xsd:nonNegativeInteger ; - nif:endIndex "9664"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "However, the jury said it believes ``these two offices should be combined to achieve greater efficiency and reduce the cost of administration''." ; - nif:beginIndex "1318"^^xsd:nonNegativeInteger ; - nif:endIndex "1462"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "deputies" ; - nif:beginIndex "3792"^^xsd:nonNegativeInteger ; - nif:endIndex "3800"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "10144"^^xsd:nonNegativeInteger ; - nif:endIndex "10145"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "4293"^^xsd:nonNegativeInteger ; - nif:endIndex "4299"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Attorneys" ; - nif:beginIndex "4603"^^xsd:nonNegativeInteger ; - nif:endIndex "4612"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "173"^^xsd:nonNegativeInteger ; - nif:endIndex "177"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Sept." ; - nif:beginIndex "5791"^^xsd:nonNegativeInteger ; - nif:endIndex "5796"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "teacher" ; - nif:beginIndex "10634"^^xsd:nonNegativeInteger ; - nif:endIndex "10641"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Allen" ; - nif:beginIndex "601"^^xsd:nonNegativeInteger ; - nif:endIndex "606"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3816"^^xsd:nonNegativeInteger ; - nif:endIndex "3819"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "not" ; - nif:beginIndex "8767"^^xsd:nonNegativeInteger ; - nif:endIndex "8770"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "some" ; - nif:beginIndex "8066"^^xsd:nonNegativeInteger ; - nif:endIndex "8070"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "11995"^^xsd:nonNegativeInteger ; - nif:endIndex "11998"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Being" ; - nif:beginIndex "11267"^^xsd:nonNegativeInteger ; - nif:endIndex "11272"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "4613"^^xsd:nonNegativeInteger ; - nif:endIndex "4616"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "994"^^xsd:nonNegativeInteger ; - nif:endIndex "997"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "revised" ; - nif:beginIndex "998"^^xsd:nonNegativeInteger ; - nif:endIndex "1005"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "6080"^^xsd:nonNegativeInteger ; - nif:endIndex "6081"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "During the election campaign, both candidates, Davis and Bush, reportedly received anonymous telephone calls." ; - nif:beginIndex "11717"^^xsd:nonNegativeInteger ; - nif:endIndex "11826"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "2713"^^xsd:nonNegativeInteger ; - nif:endIndex "2714"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "size" ; - nif:beginIndex "769"^^xsd:nonNegativeInteger ; - nif:endIndex "773"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resigned" ; - nif:beginIndex "11656"^^xsd:nonNegativeInteger ; - nif:endIndex "11664"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "our" ; - nif:beginIndex "2462"^^xsd:nonNegativeInteger ; - nif:endIndex "2465"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "intends" ; - nif:beginIndex "8329"^^xsd:nonNegativeInteger ; - nif:endIndex "8336"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "It" ; - nif:beginIndex "922"^^xsd:nonNegativeInteger ; - nif:endIndex "924"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bond" ; - nif:beginIndex "7266"^^xsd:nonNegativeInteger ; - nif:endIndex "7270"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "2" ; - nif:beginIndex "3935"^^xsd:nonNegativeInteger ; - nif:endIndex "3936"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3684"^^xsd:nonNegativeInteger ; - nif:endIndex "3687"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "who" ; - nif:beginIndex "5300"^^xsd:nonNegativeInteger ; - nif:endIndex "5303"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "5171"^^xsd:nonNegativeInteger ; - nif:endIndex "5174"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "newly" ; - nif:beginIndex "5820"^^xsd:nonNegativeInteger ; - nif:endIndex "5825"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "1336"^^xsd:nonNegativeInteger ; - nif:endIndex "1340"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "9696"^^xsd:nonNegativeInteger ; - nif:endIndex "9698"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The petition listed the mayor's occupation as ``attorney'' and his age as 71." ; - nif:beginIndex "4691"^^xsd:nonNegativeInteger ; - nif:endIndex "4768"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "didn't" ; - nif:beginIndex "11319"^^xsd:nonNegativeInteger ; - nif:endIndex "11325"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "money" ; - nif:beginIndex "2350"^^xsd:nonNegativeInteger ; - nif:endIndex "2355"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1937" ; - nif:beginIndex "5131"^^xsd:nonNegativeInteger ; - nif:endIndex "5135"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "expense" ; - nif:beginIndex "9000"^^xsd:nonNegativeInteger ; - nif:endIndex "9007"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "203"^^xsd:nonNegativeInteger ; - nif:endIndex "207"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "M." ; - nif:beginIndex "4579"^^xsd:nonNegativeInteger ; - nif:endIndex "4581"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "When the crowd was asked whether it wanted to wait one more term to make the race, it voted no -- and there were no dissents." ; - nif:beginIndex "6191"^^xsd:nonNegativeInteger ; - nif:endIndex "6316"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "10496"^^xsd:nonNegativeInteger ; - nif:endIndex "10499"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "3932"^^xsd:nonNegativeInteger ; - nif:endIndex "3933"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "who" ; - nif:beginIndex "7005"^^xsd:nonNegativeInteger ; - nif:endIndex "7008"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "opened" ; - nif:beginIndex "8505"^^xsd:nonNegativeInteger ; - nif:endIndex "8511"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "10613"^^xsd:nonNegativeInteger ; - nif:endIndex "10615"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "departments" ; - nif:beginIndex "2240"^^xsd:nonNegativeInteger ; - nif:endIndex "2251"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "an" ; - nif:beginIndex "1851"^^xsd:nonNegativeInteger ; - nif:endIndex "1853"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "5951"^^xsd:nonNegativeInteger ; - nif:endIndex "5952"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "790"^^xsd:nonNegativeInteger ; - nif:endIndex "793"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "7092"^^xsd:nonNegativeInteger ; - nif:endIndex "7093"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "also" ; - nif:beginIndex "8155"^^xsd:nonNegativeInteger ; - nif:endIndex "8159"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "3300"^^xsd:nonNegativeInteger ; - nif:endIndex "3301"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "held" ; - nif:beginIndex "5786"^^xsd:nonNegativeInteger ; - nif:endIndex "5790"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "state" ; - nif:beginIndex "2279"^^xsd:nonNegativeInteger ; - nif:endIndex "2284"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "9904"^^xsd:nonNegativeInteger ; - nif:endIndex "9905"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "``There wasn't a bit of trouble''." ; - nif:beginIndex "12211"^^xsd:nonNegativeInteger ; - nif:endIndex "12245"^^xsd:nonNegativeInteger ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "what" ; - nif:beginIndex "7465"^^xsd:nonNegativeInteger ; - nif:endIndex "7469"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "road" ; - nif:beginIndex "8262"^^xsd:nonNegativeInteger ; - nif:endIndex "8266"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "1644"^^xsd:nonNegativeInteger ; - nif:endIndex "1646"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "4194"^^xsd:nonNegativeInteger ; - nif:endIndex "4200"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ordinary" ; - nif:beginIndex "11087"^^xsd:nonNegativeInteger ; - nif:endIndex "11095"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "10380"^^xsd:nonNegativeInteger ; - nif:endIndex "10381"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "11155"^^xsd:nonNegativeInteger ; - nif:endIndex "11157"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "2127"^^xsd:nonNegativeInteger ; - nif:endIndex "2129"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10678"^^xsd:nonNegativeInteger ; - nif:endIndex "10679"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "578"^^xsd:nonNegativeInteger ; - nif:endIndex "580"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "reconsideration" ; - nif:beginIndex "9200"^^xsd:nonNegativeInteger ; - nif:endIndex "9215"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "6103"^^xsd:nonNegativeInteger ; - nif:endIndex "6108"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "10652"^^xsd:nonNegativeInteger ; - nif:endIndex "10653"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "asked" ; - nif:beginIndex "6210"^^xsd:nonNegativeInteger ; - nif:endIndex "6215"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11131"^^xsd:nonNegativeInteger ; - nif:endIndex "11132"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Rob" ; - nif:beginIndex "6977"^^xsd:nonNegativeInteger ; - nif:endIndex "6980"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "4106"^^xsd:nonNegativeInteger ; - nif:endIndex "4109"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "these" ; - nif:beginIndex "2208"^^xsd:nonNegativeInteger ; - nif:endIndex "2213"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "11674"^^xsd:nonNegativeInteger ; - nif:endIndex "11676"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "sales" ; - nif:beginIndex "8012"^^xsd:nonNegativeInteger ; - nif:endIndex "8017"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "general" ; - nif:beginIndex "6620"^^xsd:nonNegativeInteger ; - nif:endIndex "6627"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "being" ; - nif:beginIndex "11300"^^xsd:nonNegativeInteger ; - nif:endIndex "11305"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "8763"^^xsd:nonNegativeInteger ; - nif:endIndex "8766"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Vandiver" ; - nif:beginIndex "7503"^^xsd:nonNegativeInteger ; - nif:endIndex "7511"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "produced" ; - nif:beginIndex "95"^^xsd:nonNegativeInteger ; - nif:endIndex "103"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "8596"^^xsd:nonNegativeInteger ; - nif:endIndex "8598"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "7277"^^xsd:nonNegativeInteger ; - nif:endIndex "7279"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "12087"^^xsd:nonNegativeInteger ; - nif:endIndex "12088"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "6162"^^xsd:nonNegativeInteger ; - nif:endIndex "6164"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "``Failure to do this will continue to place a disproportionate burden'' on Fulton taxpayers." ; - nif:beginIndex "2622"^^xsd:nonNegativeInteger ; - nif:endIndex "2714"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "accepted" ; - nif:beginIndex "1227"^^xsd:nonNegativeInteger ; - nif:endIndex "1235"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6090"^^xsd:nonNegativeInteger ; - nif:endIndex "6093"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "issue" ; - nif:beginIndex "8809"^^xsd:nonNegativeInteger ; - nif:endIndex "8814"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11194"^^xsd:nonNegativeInteger ; - nif:endIndex "11195"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11778"^^xsd:nonNegativeInteger ; - nif:endIndex "11779"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "highway" ; - nif:beginIndex "7868"^^xsd:nonNegativeInteger ; - nif:endIndex "7875"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The jury also commented on the Fulton ordinary's court which has been under fire for its practices in the appointment of appraisers, guardians and administrators and the awarding of fees and compensation." ; - nif:beginIndex "2715"^^xsd:nonNegativeInteger ; - nif:endIndex "2919"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "make" ; - nif:beginIndex "7527"^^xsd:nonNegativeInteger ; - nif:endIndex "7531"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "679"^^xsd:nonNegativeInteger ; - nif:endIndex "683"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "9186"^^xsd:nonNegativeInteger ; - nif:endIndex "9188"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "8749"^^xsd:nonNegativeInteger ; - nif:endIndex "8753"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "8071"^^xsd:nonNegativeInteger ; - nif:endIndex "8073"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "6584"^^xsd:nonNegativeInteger ; - nif:endIndex "6586"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "greater" ; - nif:beginIndex "1403"^^xsd:nonNegativeInteger ; - nif:endIndex "1410"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Jr." ; - nif:beginIndex "607"^^xsd:nonNegativeInteger ; - nif:endIndex "610"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "post" ; - nif:beginIndex "11669"^^xsd:nonNegativeInteger ; - nif:endIndex "11673"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "find" ; - nif:beginIndex "811"^^xsd:nonNegativeInteger ; - nif:endIndex "815"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "eliminate" ; - nif:beginIndex "3489"^^xsd:nonNegativeInteger ; - nif:endIndex "3498"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "11618"^^xsd:nonNegativeInteger ; - nif:endIndex "11620"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "9954"^^xsd:nonNegativeInteger ; - nif:endIndex "9956"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "act" ; - nif:beginIndex "961"^^xsd:nonNegativeInteger ; - nif:endIndex "964"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "1341"^^xsd:nonNegativeInteger ; - nif:endIndex "1343"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "should" ; - nif:beginIndex "10591"^^xsd:nonNegativeInteger ; - nif:endIndex "10597"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "million" ; - nif:beginIndex "7642"^^xsd:nonNegativeInteger ; - nif:endIndex "7649"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "2129"^^xsd:nonNegativeInteger ; - nif:endIndex "2130"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "3181"^^xsd:nonNegativeInteger ; - nif:endIndex "3184"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "amendment" ; - nif:beginIndex "9665"^^xsd:nonNegativeInteger ; - nif:endIndex "9674"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The Georgia Legislature will wind up its 1961 session Monday and head for home -- where some of the highway bond money it approved will follow shortly." ; - nif:beginIndex "7158"^^xsd:nonNegativeInteger ; - nif:endIndex "7309"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "who" ; - nif:beginIndex "10117"^^xsd:nonNegativeInteger ; - nif:endIndex "10120"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "570"^^xsd:nonNegativeInteger ; - nif:endIndex "573"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Barber" ; - nif:beginIndex "10404"^^xsd:nonNegativeInteger ; - nif:endIndex "10410"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "In the Blue Ridge meeting, the audience was warned that entering a candidate for governor would force it to take petitions out into voting precincts to obtain the signatures of registered voters." ; - nif:beginIndex "5886"^^xsd:nonNegativeInteger ; - nif:endIndex "6081"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "deserves" ; - nif:beginIndex "283"^^xsd:nonNegativeInteger ; - nif:endIndex "291"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "11968"^^xsd:nonNegativeInteger ; - nif:endIndex "11973"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "10829"^^xsd:nonNegativeInteger ; - nif:endIndex "10830"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Mac" ; - nif:beginIndex "9911"^^xsd:nonNegativeInteger ; - nif:endIndex "9914"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "body's" ; - nif:beginIndex "8933"^^xsd:nonNegativeInteger ; - nif:endIndex "8939"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "are" ; - nif:beginIndex "8463"^^xsd:nonNegativeInteger ; - nif:endIndex "8466"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "test" ; - nif:beginIndex "7956"^^xsd:nonNegativeInteger ; - nif:endIndex "7960"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "item" ; - nif:beginIndex "7723"^^xsd:nonNegativeInteger ; - nif:endIndex "7727"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3745"^^xsd:nonNegativeInteger ; - nif:endIndex "3748"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "1327"^^xsd:nonNegativeInteger ; - nif:endIndex "1330"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Hartsfield has been mayor of Atlanta, with exception of one brief interlude, since 1937." ; - nif:beginIndex "5048"^^xsd:nonNegativeInteger ; - nif:endIndex "5136"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "employed" ; - nif:beginIndex "3804"^^xsd:nonNegativeInteger ; - nif:endIndex "3812"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "state" ; - nif:beginIndex "5768"^^xsd:nonNegativeInteger ; - nif:endIndex "5773"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "It urged that the city ``take steps to remedy'' this problem." ; - nif:beginIndex "1599"^^xsd:nonNegativeInteger ; - nif:endIndex "1660"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "5766"^^xsd:nonNegativeInteger ; - nif:endIndex "5767"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "12108"^^xsd:nonNegativeInteger ; - nif:endIndex "12112"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "8658"^^xsd:nonNegativeInteger ; - nif:endIndex "8660"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "10078"^^xsd:nonNegativeInteger ; - nif:endIndex "10081"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "other" ; - nif:beginIndex "3730"^^xsd:nonNegativeInteger ; - nif:endIndex "3735"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "among" ; - nif:beginIndex "1108"^^xsd:nonNegativeInteger ; - nif:endIndex "1113"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Welfare" ; - nif:beginIndex "2160"^^xsd:nonNegativeInteger ; - nif:endIndex "2167"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "1106"^^xsd:nonNegativeInteger ; - nif:endIndex "1107"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "208"^^xsd:nonNegativeInteger ; - nif:endIndex "211"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "superintendent" ; - nif:beginIndex "11571"^^xsd:nonNegativeInteger ; - nif:endIndex "11585"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "8957"^^xsd:nonNegativeInteger ; - nif:endIndex "8959"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "should" ; - nif:beginIndex "7482"^^xsd:nonNegativeInteger ; - nif:endIndex "7488"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "assistant" ; - nif:beginIndex "7021"^^xsd:nonNegativeInteger ; - nif:endIndex "7030"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "any" ; - nif:beginIndex "125"^^xsd:nonNegativeInteger ; - nif:endIndex "128"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "A veteran Jackson County legislator will ask the Georgia House Monday to back federal aid to education, something it has consistently opposed in the past." ; - nif:beginIndex "9751"^^xsd:nonNegativeInteger ; - nif:endIndex "9905"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "courses" ; - nif:beginIndex "6453"^^xsd:nonNegativeInteger ; - nif:endIndex "6460"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "legislator" ; - nif:beginIndex "10146"^^xsd:nonNegativeInteger ; - nif:endIndex "10156"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "7103"^^xsd:nonNegativeInteger ; - nif:endIndex "7105"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "school" ; - nif:beginIndex "10721"^^xsd:nonNegativeInteger ; - nif:endIndex "10727"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "funds" ; - nif:beginIndex "10553"^^xsd:nonNegativeInteger ; - nif:endIndex "10558"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "9090"^^xsd:nonNegativeInteger ; - nif:endIndex "9092"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "primary" ; - nif:beginIndex "556"^^xsd:nonNegativeInteger ; - nif:endIndex "563"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "74" ; - nif:beginIndex "4797"^^xsd:nonNegativeInteger ; - nif:endIndex "4799"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11067"^^xsd:nonNegativeInteger ; - nif:endIndex "11068"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7689"^^xsd:nonNegativeInteger ; - nif:endIndex "7692"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "It recommended that Fulton legislators act ``to have these laws studied and revised to the end of modernizing and improving them''." ; - nif:beginIndex "922"^^xsd:nonNegativeInteger ; - nif:endIndex "1053"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "awarding" ; - nif:beginIndex "2885"^^xsd:nonNegativeInteger ; - nif:endIndex "2893"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "action" ; - nif:beginIndex "8940"^^xsd:nonNegativeInteger ; - nif:endIndex "8946"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "made" ; - nif:beginIndex "12137"^^xsd:nonNegativeInteger ; - nif:endIndex "12141"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "being" ; - nif:beginIndex "7816"^^xsd:nonNegativeInteger ; - nif:endIndex "7821"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "who" ; - nif:beginIndex "10908"^^xsd:nonNegativeInteger ; - nif:endIndex "10911"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ordinary" ; - nif:beginIndex "11827"^^xsd:nonNegativeInteger ; - nif:endIndex "11835"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "allotted" ; - nif:beginIndex "7418"^^xsd:nonNegativeInteger ; - nif:endIndex "7426"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "aid" ; - nif:beginIndex "9837"^^xsd:nonNegativeInteger ; - nif:endIndex "9840"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "federal" ; - nif:beginIndex "10003"^^xsd:nonNegativeInteger ; - nif:endIndex "10010"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Sunday" ; - nif:beginIndex "9495"^^xsd:nonNegativeInteger ; - nif:endIndex "9501"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "upon" ; - nif:beginIndex "4685"^^xsd:nonNegativeInteger ; - nif:endIndex "4689"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "most" ; - nif:beginIndex "8084"^^xsd:nonNegativeInteger ; - nif:endIndex "8088"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "some" ; - nif:beginIndex "10175"^^xsd:nonNegativeInteger ; - nif:endIndex "10179"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "takes" ; - nif:beginIndex "3423"^^xsd:nonNegativeInteger ; - nif:endIndex "3428"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "legislators" ; - nif:beginIndex "3945"^^xsd:nonNegativeInteger ; - nif:endIndex "3956"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "or" ; - nif:beginIndex "9148"^^xsd:nonNegativeInteger ; - nif:endIndex "9150"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "9487"^^xsd:nonNegativeInteger ; - nif:endIndex "9488"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Wednesday" ; - nif:beginIndex "5541"^^xsd:nonNegativeInteger ; - nif:endIndex "5550"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "daughter" ; - nif:beginIndex "4561"^^xsd:nonNegativeInteger ; - nif:endIndex "4569"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "4358"^^xsd:nonNegativeInteger ; - nif:endIndex "4361"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The resolution, which Barber tossed into the House hopper Friday, will be formally read Monday." ; - nif:beginIndex "10382"^^xsd:nonNegativeInteger ; - nif:endIndex "10477"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "received" ; - nif:beginIndex "658"^^xsd:nonNegativeInteger ; - nif:endIndex "666"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "10124"^^xsd:nonNegativeInteger ; - nif:endIndex "10126"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10446"^^xsd:nonNegativeInteger ; - nif:endIndex "10447"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Gov." ; - nif:beginIndex "7498"^^xsd:nonNegativeInteger ; - nif:endIndex "7502"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "fair" ; - nif:beginIndex "4053"^^xsd:nonNegativeInteger ; - nif:endIndex "4057"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "965"^^xsd:nonNegativeInteger ; - nif:endIndex "967"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "passed" ; - nif:beginIndex "9449"^^xsd:nonNegativeInteger ; - nif:endIndex "9455"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "920"^^xsd:nonNegativeInteger ; - nif:endIndex "921"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "5101"^^xsd:nonNegativeInteger ; - nif:endIndex "5103"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "investigate" ; - nif:beginIndex "486"^^xsd:nonNegativeInteger ; - nif:endIndex "497"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "home" ; - nif:beginIndex "4947"^^xsd:nonNegativeInteger ; - nif:endIndex "4951"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "6628"^^xsd:nonNegativeInteger ; - nif:endIndex "6636"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "emphasizing" ; - nif:beginIndex "9253"^^xsd:nonNegativeInteger ; - nif:endIndex "9264"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "612"^^xsd:nonNegativeInteger ; - nif:endIndex "614"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "9749"^^xsd:nonNegativeInteger ; - nif:endIndex "9750"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "D." ; - nif:beginIndex "8864"^^xsd:nonNegativeInteger ; - nif:endIndex "8866"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "been" ; - nif:beginIndex "2780"^^xsd:nonNegativeInteger ; - nif:endIndex "2784"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "aside" ; - nif:beginIndex "9360"^^xsd:nonNegativeInteger ; - nif:endIndex "9365"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Legislature" ; - nif:beginIndex "8563"^^xsd:nonNegativeInteger ; - nif:endIndex "8574"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Pye" ; - nif:beginIndex "479"^^xsd:nonNegativeInteger ; - nif:endIndex "482"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "county" ; - nif:beginIndex "6696"^^xsd:nonNegativeInteger ; - nif:endIndex "6702"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "highway" ; - nif:beginIndex "7258"^^xsd:nonNegativeInteger ; - nif:endIndex "7265"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "good" ; - nif:beginIndex "12142"^^xsd:nonNegativeInteger ; - nif:endIndex "12146"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "10423"^^xsd:nonNegativeInteger ; - nif:endIndex "10426"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "sought" ; - nif:beginIndex "9239"^^xsd:nonNegativeInteger ; - nif:endIndex "9245"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "B." ; - nif:beginIndex "8861"^^xsd:nonNegativeInteger ; - nif:endIndex "8863"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "699"^^xsd:nonNegativeInteger ; - nif:endIndex "702"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "action" ; - nif:beginIndex "9223"^^xsd:nonNegativeInteger ; - nif:endIndex "9229"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "responses" ; - nif:beginIndex "5659"^^xsd:nonNegativeInteger ; - nif:endIndex "5668"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "10357"^^xsd:nonNegativeInteger ; - nif:endIndex "10359"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "11665"^^xsd:nonNegativeInteger ; - nif:endIndex "11668"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "battle" ; - nif:beginIndex "8549"^^xsd:nonNegativeInteger ; - nif:endIndex "8555"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "whether" ; - nif:beginIndex "9151"^^xsd:nonNegativeInteger ; - nif:endIndex "9158"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "11259"^^xsd:nonNegativeInteger ; - nif:endIndex "11263"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "``Everything went real smooth'', the sheriff said." ; - nif:beginIndex "12160"^^xsd:nonNegativeInteger ; - nif:endIndex "12210"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "votes" ; - nif:beginIndex "11039"^^xsd:nonNegativeInteger ; - nif:endIndex "11044"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "3938"^^xsd:nonNegativeInteger ; - nif:endIndex "3944"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "1142"^^xsd:nonNegativeInteger ; - nif:endIndex "1148"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "church" ; - nif:beginIndex "11309"^^xsd:nonNegativeInteger ; - nif:endIndex "11315"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bonds" ; - nif:beginIndex "8835"^^xsd:nonNegativeInteger ; - nif:endIndex "8840"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "or" ; - nif:beginIndex "6645"^^xsd:nonNegativeInteger ; - nif:endIndex "6647"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "political" ; - nif:beginIndex "3499"^^xsd:nonNegativeInteger ; - nif:endIndex "3508"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "9983"^^xsd:nonNegativeInteger ; - nif:endIndex "9985"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Snodgrass" ; - nif:beginIndex "5559"^^xsd:nonNegativeInteger ; - nif:endIndex "5568"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "through" ; - nif:beginIndex "9413"^^xsd:nonNegativeInteger ; - nif:endIndex "9420"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "given" ; - nif:beginIndex "9189"^^xsd:nonNegativeInteger ; - nif:endIndex "9194"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department" ; - nif:beginIndex "8699"^^xsd:nonNegativeInteger ; - nif:endIndex "8709"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "with" ; - nif:beginIndex "8542"^^xsd:nonNegativeInteger ; - nif:endIndex "8546"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "8447"^^xsd:nonNegativeInteger ; - nif:endIndex "8449"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7924"^^xsd:nonNegativeInteger ; - nif:endIndex "7927"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "head" ; - nif:beginIndex "7223"^^xsd:nonNegativeInteger ; - nif:endIndex "7227"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "place" ; - nif:beginIndex "6587"^^xsd:nonNegativeInteger ; - nif:endIndex "6592"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "entering" ; - nif:beginIndex "5942"^^xsd:nonNegativeInteger ; - nif:endIndex "5950"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "chairman" ; - nif:beginIndex "5580"^^xsd:nonNegativeInteger ; - nif:endIndex "5588"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "4058"^^xsd:nonNegativeInteger ; - nif:endIndex "4061"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "11677"^^xsd:nonNegativeInteger ; - nif:endIndex "11678"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11016"^^xsd:nonNegativeInteger ; - nif:endIndex "11017"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "5323"^^xsd:nonNegativeInteger ; - nif:endIndex "5325"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "orderly" ; - nif:beginIndex "1854"^^xsd:nonNegativeInteger ; - nif:endIndex "1861"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jurors" ; - nif:beginIndex "2361"^^xsd:nonNegativeInteger ; - nif:endIndex "2367"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "357"^^xsd:nonNegativeInteger ; - nif:endIndex "362"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "experienced" ; - nif:beginIndex "1526"^^xsd:nonNegativeInteger ; - nif:endIndex "1537"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The Highway Department source told The Constitution, however, that Vandiver has not been consulted yet about the plans to issue the new rural roads bonds." ; - nif:beginIndex "8687"^^xsd:nonNegativeInteger ; - nif:endIndex "8841"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "11770"^^xsd:nonNegativeInteger ; - nif:endIndex "11773"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "committee" ; - nif:beginIndex "3122"^^xsd:nonNegativeInteger ; - nif:endIndex "3131"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "policeman" ; - nif:beginIndex "10758"^^xsd:nonNegativeInteger ; - nif:endIndex "10767"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10396"^^xsd:nonNegativeInteger ; - nif:endIndex "10397"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "they" ; - nif:beginIndex "2373"^^xsd:nonNegativeInteger ; - nif:endIndex "2377"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "9980"^^xsd:nonNegativeInteger ; - nif:endIndex "9982"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "7358"^^xsd:nonNegativeInteger ; - nif:endIndex "7360"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Hartsfield" ; - nif:beginIndex "4936"^^xsd:nonNegativeInteger ; - nif:endIndex "4946"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "multi-million-dollar" ; - nif:beginIndex "3341"^^xsd:nonNegativeInteger ; - nif:endIndex "3361"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "11263"^^xsd:nonNegativeInteger ; - nif:endIndex "11264"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "8494"^^xsd:nonNegativeInteger ; - nif:endIndex "8495"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "governments" ; - nif:beginIndex "1287"^^xsd:nonNegativeInteger ; - nif:endIndex "1298"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Gainesville" ; - nif:beginIndex "6992"^^xsd:nonNegativeInteger ; - nif:endIndex "7003"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "8008"^^xsd:nonNegativeInteger ; - nif:endIndex "8011"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "political" ; - nif:beginIndex "5141"^^xsd:nonNegativeInteger ; - nif:endIndex "5150"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "856"^^xsd:nonNegativeInteger ; - nif:endIndex "864"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Sen." ; - nif:beginIndex "5840"^^xsd:nonNegativeInteger ; - nif:endIndex "5844"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "2417"^^xsd:nonNegativeInteger ; - nif:endIndex "2419"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "9969"^^xsd:nonNegativeInteger ; - nif:endIndex "9979"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "267"^^xsd:nonNegativeInteger ; - nif:endIndex "270"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "some" ; - nif:beginIndex "7246"^^xsd:nonNegativeInteger ; - nif:endIndex "7250"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "been" ; - nif:beginIndex "7013"^^xsd:nonNegativeInteger ; - nif:endIndex "7017"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "date" ; - nif:beginIndex "1838"^^xsd:nonNegativeInteger ; - nif:endIndex "1842"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Pelham" ; - nif:beginIndex "9020"^^xsd:nonNegativeInteger ; - nif:endIndex "9026"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "past" ; - nif:beginIndex "9900"^^xsd:nonNegativeInteger ; - nif:endIndex "9904"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Regarding Atlanta's new multi-million-dollar airport, the jury recommended ``that when the new management takes charge Jan. 1 the airport be operated in a manner that will eliminate political influences''." ; - nif:beginIndex "3317"^^xsd:nonNegativeInteger ; - nif:endIndex "3522"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "GOP" ; - nif:beginIndex "5576"^^xsd:nonNegativeInteger ; - nif:endIndex "5579"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "may" ; - nif:beginIndex "1888"^^xsd:nonNegativeInteger ; - nif:endIndex "1891"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "3843"^^xsd:nonNegativeInteger ; - nif:endIndex "3845"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "9787"^^xsd:nonNegativeInteger ; - nif:endIndex "9791"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "10491"^^xsd:nonNegativeInteger ; - nif:endIndex "10493"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Merger proposed" ; - nif:beginIndex "1302"^^xsd:nonNegativeInteger ; - nif:endIndex "1317"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "5588"^^xsd:nonNegativeInteger ; - nif:endIndex "5589"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "11417"^^xsd:nonNegativeInteger ; - nif:endIndex "11425"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "1595"^^xsd:nonNegativeInteger ; - nif:endIndex "1597"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "elected" ; - nif:beginIndex "10744"^^xsd:nonNegativeInteger ; - nif:endIndex "10751"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "4955"^^xsd:nonNegativeInteger ; - nif:endIndex "4957"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "$30" ; - nif:beginIndex "7847"^^xsd:nonNegativeInteger ; - nif:endIndex "7850"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "2521"^^xsd:nonNegativeInteger ; - nif:endIndex "2523"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "5209"^^xsd:nonNegativeInteger ; - nif:endIndex "5212"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "voted" ; - nif:beginIndex "6277"^^xsd:nonNegativeInteger ; - nif:endIndex "6282"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "9648"^^xsd:nonNegativeInteger ; - nif:endIndex "9649"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "will" ; - nif:beginIndex "8874"^^xsd:nonNegativeInteger ; - nif:endIndex "8878"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "action" ; - nif:beginIndex "9578"^^xsd:nonNegativeInteger ; - nif:endIndex "9584"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "6974"^^xsd:nonNegativeInteger ; - nif:endIndex "6976"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "real" ; - nif:beginIndex "12178"^^xsd:nonNegativeInteger ; - nif:endIndex "12182"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "738"^^xsd:nonNegativeInteger ; - nif:endIndex "739"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Monday" ; - nif:beginIndex "9814"^^xsd:nonNegativeInteger ; - nif:endIndex "9820"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "local" ; - nif:beginIndex "11935"^^xsd:nonNegativeInteger ; - nif:endIndex "11940"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "adjustments" ; - nif:beginIndex "7470"^^xsd:nonNegativeInteger ; - nif:endIndex "7481"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "superintendent" ; - nif:beginIndex "10728"^^xsd:nonNegativeInteger ; - nif:endIndex "10742"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "relations" ; - nif:beginIndex "6814"^^xsd:nonNegativeInteger ; - nif:endIndex "6823"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "8170"^^xsd:nonNegativeInteger ; - nif:endIndex "8175"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "2918"^^xsd:nonNegativeInteger ; - nif:endIndex "2919"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "5595"^^xsd:nonNegativeInteger ; - nif:endIndex "5596"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "5023"^^xsd:nonNegativeInteger ; - nif:endIndex "5025"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "anonymous" ; - nif:beginIndex "11800"^^xsd:nonNegativeInteger ; - nif:endIndex "11809"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4480"^^xsd:nonNegativeInteger ; - nif:endIndex "4481"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "10486"^^xsd:nonNegativeInteger ; - nif:endIndex "10490"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Gov. Vandiver is expected to make the traditional visit to both chambers as they work toward adjournment." ; - nif:beginIndex "7498"^^xsd:nonNegativeInteger ; - nif:endIndex "7603"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "It" ; - nif:beginIndex "1599"^^xsd:nonNegativeInteger ; - nif:endIndex "1601"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "B." ; - nif:beginIndex "4333"^^xsd:nonNegativeInteger ; - nif:endIndex "4335"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "plan" ; - nif:beginIndex "4082"^^xsd:nonNegativeInteger ; - nif:endIndex "4086"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "1731"^^xsd:nonNegativeInteger ; - nif:endIndex "1733"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bit" ; - nif:beginIndex "11371"^^xsd:nonNegativeInteger ; - nif:endIndex "11374"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "E." ; - nif:beginIndex "4962"^^xsd:nonNegativeInteger ; - nif:endIndex "4964"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "0"^^xsd:nonNegativeInteger ; - nif:endIndex "3"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "2969"^^xsd:nonNegativeInteger ; - nif:endIndex "2971"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "populous" ; - nif:beginIndex "2471"^^xsd:nonNegativeInteger ; - nif:endIndex "2479"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "11389"^^xsd:nonNegativeInteger ; - nif:endIndex "11392"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Fulton" ; - nif:beginIndex "2087"^^xsd:nonNegativeInteger ; - nif:endIndex "2093"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2881"^^xsd:nonNegativeInteger ; - nif:endIndex "2884"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "1,119" ; - nif:beginIndex "11033"^^xsd:nonNegativeInteger ; - nif:endIndex "11038"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "governor" ; - nif:beginIndex "5967"^^xsd:nonNegativeInteger ; - nif:endIndex "5975"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "They have a son, William Berry Jr., and a daughter, Mrs. J. M. Cheshire of Griffin." ; - nif:beginIndex "4519"^^xsd:nonNegativeInteger ; - nif:endIndex "4602"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "specifically" ; - nif:beginIndex "10334"^^xsd:nonNegativeInteger ; - nif:endIndex "10346"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "been" ; - nif:beginIndex "8771"^^xsd:nonNegativeInteger ; - nif:endIndex "8775"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "portion" ; - nif:beginIndex "2569"^^xsd:nonNegativeInteger ; - nif:endIndex "2576"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "4850"^^xsd:nonNegativeInteger ; - nif:endIndex "4854"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "A revolving fund" ; - nif:beginIndex "8286"^^xsd:nonNegativeInteger ; - nif:endIndex "8302"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , . - - - a nif:Word ; - nif:anchorOf "polls" ; - nif:beginIndex "11280"^^xsd:nonNegativeInteger ; - nif:endIndex "11285"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "County" ; - nif:beginIndex "10979"^^xsd:nonNegativeInteger ; - nif:endIndex "10985"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "87-31" ; - nif:beginIndex "9421"^^xsd:nonNegativeInteger ; - nif:endIndex "9426"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "A" ; - nif:beginIndex "8116"^^xsd:nonNegativeInteger ; - nif:endIndex "8117"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "2950"^^xsd:nonNegativeInteger ; - nif:endIndex "2952"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "couple" ; - nif:beginIndex "4864"^^xsd:nonNegativeInteger ; - nif:endIndex "4870"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Atlanta" ; - nif:beginIndex "4163"^^xsd:nonNegativeInteger ; - nif:endIndex "4170"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "670"^^xsd:nonNegativeInteger ; - nif:endIndex "673"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "under" ; - nif:beginIndex "2785"^^xsd:nonNegativeInteger ; - nif:endIndex "2790"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "legislator" ; - nif:beginIndex "9776"^^xsd:nonNegativeInteger ; - nif:endIndex "9786"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "11110"^^xsd:nonNegativeInteger ; - nif:endIndex "11111"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7392"^^xsd:nonNegativeInteger ; - nif:endIndex "7395"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "age" ; - nif:beginIndex "4790"^^xsd:nonNegativeInteger ; - nif:endIndex "4793"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "6035"^^xsd:nonNegativeInteger ; - nif:endIndex "6037"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "1392"^^xsd:nonNegativeInteger ; - nif:endIndex "1394"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "repealed" ; - nif:beginIndex "9130"^^xsd:nonNegativeInteger ; - nif:endIndex "9138"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "work" ; - nif:beginIndex "8058"^^xsd:nonNegativeInteger ; - nif:endIndex "8062"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The City Purchasing Department, the jury said, ``is lacking in experienced clerical personnel as a result of city personnel policies''." ; - nif:beginIndex "1463"^^xsd:nonNegativeInteger ; - nif:endIndex "1598"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "reduce" ; - nif:beginIndex "1426"^^xsd:nonNegativeInteger ; - nif:endIndex "1432"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "like" ; - nif:beginIndex "10244"^^xsd:nonNegativeInteger ; - nif:endIndex "10248"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "418"^^xsd:nonNegativeInteger ; - nif:endIndex "422"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "11265"^^xsd:nonNegativeInteger ; - nif:endIndex "11267"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "5402"^^xsd:nonNegativeInteger ; - nif:endIndex "5405"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "granted" ; - nif:beginIndex "1995"^^xsd:nonNegativeInteger ; - nif:endIndex "2002"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9459"^^xsd:nonNegativeInteger ; - nif:endIndex "9462"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6495"^^xsd:nonNegativeInteger ; - nif:endIndex "6498"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "approved" ; - nif:beginIndex "7669"^^xsd:nonNegativeInteger ; - nif:endIndex "7677"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "4762"^^xsd:nonNegativeInteger ; - nif:endIndex "4764"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "rural" ; - nif:beginIndex "8631"^^xsd:nonNegativeInteger ; - nif:endIndex "8636"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "9480"^^xsd:nonNegativeInteger ; - nif:endIndex "9482"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "683"^^xsd:nonNegativeInteger ; - nif:endIndex "684"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "506"^^xsd:nonNegativeInteger ; - nif:endIndex "508"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "6552"^^xsd:nonNegativeInteger ; - nif:endIndex "6556"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "law" ; - nif:beginIndex "1884"^^xsd:nonNegativeInteger ; - nif:endIndex "1887"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Superior" ; - nif:beginIndex "450"^^xsd:nonNegativeInteger ; - nif:endIndex "458"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "5521"^^xsd:nonNegativeInteger ; - nif:endIndex "5522"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "10542"^^xsd:nonNegativeInteger ; - nif:endIndex "10544"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "procedures" ; - nif:beginIndex "3007"^^xsd:nonNegativeInteger ; - nif:endIndex "3017"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "by" ; - nif:beginIndex "8476"^^xsd:nonNegativeInteger ; - nif:endIndex "8478"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "issue" ; - nif:beginIndex "7831"^^xsd:nonNegativeInteger ; - nif:endIndex "7836"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "A Highway Department source said there also is a plan there to issue some $3 million to $4 million worth of Rural Roads Authority bonds for rural road construction work." ; - nif:beginIndex "8116"^^xsd:nonNegativeInteger ; - nif:endIndex "8285"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "3922"^^xsd:nonNegativeInteger ; - nif:endIndex "3925"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "smell" ; - nif:beginIndex "11326"^^xsd:nonNegativeInteger ; - nif:endIndex "11331"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "10628"^^xsd:nonNegativeInteger ; - nif:endIndex "10630"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "9934"^^xsd:nonNegativeInteger ; - nif:endIndex "9936"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "610"^^xsd:nonNegativeInteger ; - nif:endIndex "611"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "(" ; - nif:beginIndex "3934"^^xsd:nonNegativeInteger ; - nif:endIndex "3935"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "modernizing" ; - nif:beginIndex "1020"^^xsd:nonNegativeInteger ; - nif:endIndex "1031"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "petition" ; - nif:beginIndex "4841"^^xsd:nonNegativeInteger ; - nif:endIndex "4849"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "had" ; - nif:beginIndex "244"^^xsd:nonNegativeInteger ; - nif:endIndex "247"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "While" ; - nif:beginIndex "9247"^^xsd:nonNegativeInteger ; - nif:endIndex "9252"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "9245"^^xsd:nonNegativeInteger ; - nif:endIndex "9246"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "can" ; - nif:beginIndex "9123"^^xsd:nonNegativeInteger ; - nif:endIndex "9126"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "6643"^^xsd:nonNegativeInteger ; - nif:endIndex "6644"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "7686"^^xsd:nonNegativeInteger ; - nif:endIndex "7688"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "7347"^^xsd:nonNegativeInteger ; - nif:endIndex "7350"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Five" ; - nif:beginIndex "6478"^^xsd:nonNegativeInteger ; - nif:endIndex "6482"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "12079"^^xsd:nonNegativeInteger ; - nif:endIndex "12087"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "5268"^^xsd:nonNegativeInteger ; - nif:endIndex "5270"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "law" ; - nif:beginIndex "1706"^^xsd:nonNegativeInteger ; - nif:endIndex "1709"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "264"^^xsd:nonNegativeInteger ; - nif:endIndex "266"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "13th" ; - nif:beginIndex "10131"^^xsd:nonNegativeInteger ; - nif:endIndex "10135"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "654"^^xsd:nonNegativeInteger ; - nif:endIndex "657"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "wards" ; - nif:beginIndex "3207"^^xsd:nonNegativeInteger ; - nif:endIndex "3212"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Board" ; - nif:beginIndex "10572"^^xsd:nonNegativeInteger ; - nif:endIndex "10577"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The mayor's present term of office expires Jan. 1." ; - nif:beginIndex "5209"^^xsd:nonNegativeInteger ; - nif:endIndex "5259"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "1813"^^xsd:nonNegativeInteger ; - nif:endIndex "1816"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "1017"^^xsd:nonNegativeInteger ; - nif:endIndex "1019"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "orderly" ; - nif:beginIndex "12071"^^xsd:nonNegativeInteger ; - nif:endIndex "12078"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "provide" ; - nif:beginIndex "10520"^^xsd:nonNegativeInteger ; - nif:endIndex "10527"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "term-end" ; - nif:beginIndex "181"^^xsd:nonNegativeInteger ; - nif:endIndex "189"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "367"^^xsd:nonNegativeInteger ; - nif:endIndex "375"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "they" ; - nif:beginIndex "7574"^^xsd:nonNegativeInteger ; - nif:endIndex "7578"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "``" ; - nif:beginIndex "3133"^^xsd:nonNegativeInteger ; - nif:endIndex "3135"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Tuesday" ; - nif:beginIndex "5610"^^xsd:nonNegativeInteger ; - nif:endIndex "5617"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "5495"^^xsd:nonNegativeInteger ; - nif:endIndex "5498"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "state" ; - nif:beginIndex "10096"^^xsd:nonNegativeInteger ; - nif:endIndex "10101"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "being" ; - nif:beginIndex "9543"^^xsd:nonNegativeInteger ; - nif:endIndex "9548"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Rural" ; - nif:beginIndex "8224"^^xsd:nonNegativeInteger ; - nif:endIndex "8229"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Four" ; - nif:beginIndex "3776"^^xsd:nonNegativeInteger ; - nif:endIndex "3780"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Mayor" ; - nif:beginIndex "4319"^^xsd:nonNegativeInteger ; - nif:endIndex "4324"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "It" ; - nif:beginIndex "1753"^^xsd:nonNegativeInteger ; - nif:endIndex "1755"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4188"^^xsd:nonNegativeInteger ; - nif:endIndex "4189"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "many" ; - nif:beginIndex "821"^^xsd:nonNegativeInteger ; - nif:endIndex "825"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4317"^^xsd:nonNegativeInteger ; - nif:endIndex "4318"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "5168"^^xsd:nonNegativeInteger ; - nif:endIndex "5170"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "2796"^^xsd:nonNegativeInteger ; - nif:endIndex "2799"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "6557"^^xsd:nonNegativeInteger ; - nif:endIndex "6560"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "airport" ; - nif:beginIndex "3362"^^xsd:nonNegativeInteger ; - nif:endIndex "3369"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "March" ; - nif:beginIndex "11627"^^xsd:nonNegativeInteger ; - nif:endIndex "11632"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "10751"^^xsd:nonNegativeInteger ; - nif:endIndex "10752"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "adjournment" ; - nif:beginIndex "7317"^^xsd:nonNegativeInteger ; - nif:endIndex "7328"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "3555"^^xsd:nonNegativeInteger ; - nif:endIndex "3557"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "29-5" ; - nif:beginIndex "9483"^^xsd:nonNegativeInteger ; - nif:endIndex "9487"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "11332"^^xsd:nonNegativeInteger ; - nif:endIndex "11333"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "1300"^^xsd:nonNegativeInteger ; - nif:endIndex "1301"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "2490"^^xsd:nonNegativeInteger ; - nif:endIndex "2491"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "offer" ; - nif:beginIndex "8879"^^xsd:nonNegativeInteger ; - nif:endIndex "8884"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "soon" ; - nif:beginIndex "11892"^^xsd:nonNegativeInteger ; - nif:endIndex "11896"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "8176"^^xsd:nonNegativeInteger ; - nif:endIndex "8178"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "7436"^^xsd:nonNegativeInteger ; - nif:endIndex "7439"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "continue" ; - nif:beginIndex "2648"^^xsd:nonNegativeInteger ; - nif:endIndex "2656"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "11385"^^xsd:nonNegativeInteger ; - nif:endIndex "11387"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "9527"^^xsd:nonNegativeInteger ; - nif:endIndex "9529"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "6371"^^xsd:nonNegativeInteger ; - nif:endIndex "6373"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Wards" ; - nif:beginIndex "2920"^^xsd:nonNegativeInteger ; - nif:endIndex "2925"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "1710"^^xsd:nonNegativeInteger ; - nif:endIndex "1713"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "have" ; - nif:beginIndex "970"^^xsd:nonNegativeInteger ; - nif:endIndex "974"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "strong" ; - nif:beginIndex "5450"^^xsd:nonNegativeInteger ; - nif:endIndex "5456"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "had" ; - nif:beginIndex "423"^^xsd:nonNegativeInteger ; - nif:endIndex "426"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "3455"^^xsd:nonNegativeInteger ; - nif:endIndex "3457"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "permit" ; - nif:beginIndex "12033"^^xsd:nonNegativeInteger ; - nif:endIndex "12039"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "relative" ; - nif:beginIndex "621"^^xsd:nonNegativeInteger ; - nif:endIndex "629"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2064"^^xsd:nonNegativeInteger ; - nif:endIndex "2067"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "available" ; - nif:beginIndex "2586"^^xsd:nonNegativeInteger ; - nif:endIndex "2595"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Jr." ; - nif:beginIndex "5295"^^xsd:nonNegativeInteger ; - nif:endIndex "5298"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "shot" ; - nif:beginIndex "11605"^^xsd:nonNegativeInteger ; - nif:endIndex "11609"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "10578"^^xsd:nonNegativeInteger ; - nif:endIndex "10580"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "thanks" ; - nif:beginIndex "307"^^xsd:nonNegativeInteger ; - nif:endIndex "313"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "congressmen" ; - nif:beginIndex "10322"^^xsd:nonNegativeInteger ; - nif:endIndex "10333"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "quickie" ; - nif:beginIndex "9092"^^xsd:nonNegativeInteger ; - nif:endIndex "9099"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "take" ; - nif:beginIndex "5994"^^xsd:nonNegativeInteger ; - nif:endIndex "5998"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "intern" ; - nif:beginIndex "3863"^^xsd:nonNegativeInteger ; - nif:endIndex "3869"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "source" ; - nif:beginIndex "8710"^^xsd:nonNegativeInteger ; - nif:endIndex "8716"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "9380"^^xsd:nonNegativeInteger ; - nif:endIndex "9390"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "1493"^^xsd:nonNegativeInteger ; - nif:endIndex "1494"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "5471"^^xsd:nonNegativeInteger ; - nif:endIndex "5473"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "7512"^^xsd:nonNegativeInteger ; - nif:endIndex "7514"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "he" ; - nif:beginIndex "11903"^^xsd:nonNegativeInteger ; - nif:endIndex "11905"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Williams" ; - nif:beginIndex "12010"^^xsd:nonNegativeInteger ; - nif:endIndex "12018"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "too" ; - nif:beginIndex "11854"^^xsd:nonNegativeInteger ; - nif:endIndex "11857"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "million" ; - nif:beginIndex "7851"^^xsd:nonNegativeInteger ; - nif:endIndex "7858"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Jan." ; - nif:beginIndex "5252"^^xsd:nonNegativeInteger ; - nif:endIndex "5256"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "election" ; - nif:beginIndex "11059"^^xsd:nonNegativeInteger ; - nif:endIndex "11067"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "1331"^^xsd:nonNegativeInteger ; - nif:endIndex "1335"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "2858"^^xsd:nonNegativeInteger ; - nif:endIndex "2861"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "candidate" ; - nif:beginIndex "5953"^^xsd:nonNegativeInteger ; - nif:endIndex "5962"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "education" ; - nif:beginIndex "10030"^^xsd:nonNegativeInteger ; - nif:endIndex "10039"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "for" ; - nif:beginIndex "5963"^^xsd:nonNegativeInteger ; - nif:endIndex "5966"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Failure" ; - nif:beginIndex "2624"^^xsd:nonNegativeInteger ; - nif:endIndex "2631"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "from" ; - nif:beginIndex "3269"^^xsd:nonNegativeInteger ; - nif:endIndex "3273"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "5550"^^xsd:nonNegativeInteger ; - nif:endIndex "5551"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "4810"^^xsd:nonNegativeInteger ; - nif:endIndex "4812"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "irregularities" ; - nif:beginIndex "520"^^xsd:nonNegativeInteger ; - nif:endIndex "534"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "William" ; - nif:beginIndex "4536"^^xsd:nonNegativeInteger ; - nif:endIndex "4543"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "threats" ; - nif:beginIndex "11525"^^xsd:nonNegativeInteger ; - nif:endIndex "11532"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "child" ; - nif:beginIndex "2007"^^xsd:nonNegativeInteger ; - nif:endIndex "2012"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "didn't" ; - nif:beginIndex "11357"^^xsd:nonNegativeInteger ; - nif:endIndex "11363"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Davis" ; - nif:beginIndex "11018"^^xsd:nonNegativeInteger ; - nif:endIndex "11023"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "10093"^^xsd:nonNegativeInteger ; - nif:endIndex "10095"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "6650"^^xsd:nonNegativeInteger ; - nif:endIndex "6653"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Construction" ; - nif:beginIndex "7729"^^xsd:nonNegativeInteger ; - nif:endIndex "7741"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "interlude" ; - nif:beginIndex "5114"^^xsd:nonNegativeInteger ; - nif:endIndex "5123"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Alpharetta" ; - nif:beginIndex "4245"^^xsd:nonNegativeInteger ; - nif:endIndex "4255"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "rejected" ; - nif:beginIndex "9639"^^xsd:nonNegativeInteger ; - nif:endIndex "9647"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "11426"^^xsd:nonNegativeInteger ; - nif:endIndex "11429"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "as" ; - nif:beginIndex "10753"^^xsd:nonNegativeInteger ; - nif:endIndex "10755"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The grand jury commented on a number of other topics, among them the Atlanta and Fulton County purchasing departments which it said ``are well operated and follow generally accepted practices which inure to the best interest of both governments''." ; - nif:beginIndex "1054"^^xsd:nonNegativeInteger ; - nif:endIndex "1301"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "8534"^^xsd:nonNegativeInteger ; - nif:endIndex "8536"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "death" ; - nif:beginIndex "11621"^^xsd:nonNegativeInteger ; - nif:endIndex "11626"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "likely" ; - nif:beginIndex "7613"^^xsd:nonNegativeInteger ; - nif:endIndex "7619"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "5889"^^xsd:nonNegativeInteger ; - nif:endIndex "5892"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department's" ; - nif:beginIndex "1956"^^xsd:nonNegativeInteger ; - nif:endIndex "1968"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "did" ; - nif:beginIndex "807"^^xsd:nonNegativeInteger ; - nif:endIndex "810"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "would" ; - nif:beginIndex "9166"^^xsd:nonNegativeInteger ; - nif:endIndex "9171"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "4627"^^xsd:nonNegativeInteger ; - nif:endIndex "4631"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "polls" ; - nif:beginIndex "11149"^^xsd:nonNegativeInteger ; - nif:endIndex "11154"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "increase" ; - nif:beginIndex "9114"^^xsd:nonNegativeInteger ; - nif:endIndex "9122"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "967"^^xsd:nonNegativeInteger ; - nif:endIndex "969"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "--" ; - nif:beginIndex "6286"^^xsd:nonNegativeInteger ; - nif:endIndex "6288"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4553"^^xsd:nonNegativeInteger ; - nif:endIndex "4554"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "unmeritorious" ; - nif:beginIndex "3274"^^xsd:nonNegativeInteger ; - nif:endIndex "3287"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "When the gubernatorial campaign starts, Caldwell is expected to become a campaign coordinator for Byrd." ; - nif:beginIndex "7054"^^xsd:nonNegativeInteger ; - nif:endIndex "7157"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "4978"^^xsd:nonNegativeInteger ; - nif:endIndex "4979"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "3162"^^xsd:nonNegativeInteger ; - nif:endIndex "3164"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "is" ; - nif:beginIndex "10121"^^xsd:nonNegativeInteger ; - nif:endIndex "10123"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "2601"^^xsd:nonNegativeInteger ; - nif:endIndex "2603"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "10562"^^xsd:nonNegativeInteger ; - nif:endIndex "10565"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "9099"^^xsd:nonNegativeInteger ; - nif:endIndex "9101"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "fact" ; - nif:beginIndex "3176"^^xsd:nonNegativeInteger ; - nif:endIndex "3180"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "items" ; - nif:beginIndex "2074"^^xsd:nonNegativeInteger ; - nif:endIndex "2079"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "first" ; - nif:beginIndex "7708"^^xsd:nonNegativeInteger ; - nif:endIndex "7713"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "congressional" ; - nif:beginIndex "10195"^^xsd:nonNegativeInteger ; - nif:endIndex "10208"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "federal" ; - nif:beginIndex "10545"^^xsd:nonNegativeInteger ; - nif:endIndex "10552"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "his" ; - nif:beginIndex "7704"^^xsd:nonNegativeInteger ; - nif:endIndex "7707"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "5568"^^xsd:nonNegativeInteger ; - nif:endIndex "5569"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "2290"^^xsd:nonNegativeInteger ; - nif:endIndex "2293"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "determine" ; - nif:beginIndex "7455"^^xsd:nonNegativeInteger ; - nif:endIndex "7464"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "5200"^^xsd:nonNegativeInteger ; - nif:endIndex "5202"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "3584"^^xsd:nonNegativeInteger ; - nif:endIndex "3586"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "this" ; - nif:beginIndex "10816"^^xsd:nonNegativeInteger ; - nif:endIndex "10820"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "10715"^^xsd:nonNegativeInteger ; - nif:endIndex "10716"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "10309"^^xsd:nonNegativeInteger ; - nif:endIndex "10311"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "2894"^^xsd:nonNegativeInteger ; - nif:endIndex "2896"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Rep." ; - nif:beginIndex "9906"^^xsd:nonNegativeInteger ; - nif:endIndex "9910"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "some" ; - nif:beginIndex "6942"^^xsd:nonNegativeInteger ; - nif:endIndex "6946"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "operated" ; - nif:beginIndex "1197"^^xsd:nonNegativeInteger ; - nif:endIndex "1205"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "5911"^^xsd:nonNegativeInteger ; - nif:endIndex "5912"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "7052"^^xsd:nonNegativeInteger ; - nif:endIndex "7053"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resigned" ; - nif:beginIndex "6834"^^xsd:nonNegativeInteger ; - nif:endIndex "6842"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "warning" ; - nif:beginIndex "6094"^^xsd:nonNegativeInteger ; - nif:endIndex "6101"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Atlanta's" ; - nif:beginIndex "3327"^^xsd:nonNegativeInteger ; - nif:endIndex "3336"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "3131"^^xsd:nonNegativeInteger ; - nif:endIndex "3132"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "added" ; - nif:beginIndex "3558"^^xsd:nonNegativeInteger ; - nif:endIndex "3563"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "``These actions should serve to protect in fact and in effect the court's wards from undue costs and its appointed and elected servants from unmeritorious criticisms'', the jury said." ; - nif:beginIndex "3133"^^xsd:nonNegativeInteger ; - nif:endIndex "3316"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "bond" ; - nif:beginIndex "7658"^^xsd:nonNegativeInteger ; - nif:endIndex "7662"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "fit" ; - nif:beginIndex "2190"^^xsd:nonNegativeInteger ; - nif:endIndex "2193"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "proposed" ; - nif:beginIndex "8649"^^xsd:nonNegativeInteger ; - nif:endIndex "8657"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "1751"^^xsd:nonNegativeInteger ; - nif:endIndex "1752"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "1050"^^xsd:nonNegativeInteger ; - nif:endIndex "1052"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "coordinator" ; - nif:beginIndex "7136"^^xsd:nonNegativeInteger ; - nif:endIndex "7147"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Association" ; - nif:beginIndex "3086"^^xsd:nonNegativeInteger ; - nif:endIndex "3097"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "system" ; - nif:beginIndex "6720"^^xsd:nonNegativeInteger ; - nif:endIndex "6726"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "act" ; - nif:beginIndex "7994"^^xsd:nonNegativeInteger ; - nif:endIndex "7997"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "4241"^^xsd:nonNegativeInteger ; - nif:endIndex "4244"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "11350"^^xsd:nonNegativeInteger ; - nif:endIndex "11353"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Ivan" ; - nif:beginIndex "5284"^^xsd:nonNegativeInteger ; - nif:endIndex "5288"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Berry" ; - nif:beginIndex "4544"^^xsd:nonNegativeInteger ; - nif:endIndex "4549"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "State" ; - nif:beginIndex "1942"^^xsd:nonNegativeInteger ; - nif:endIndex "1947"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "2877"^^xsd:nonNegativeInteger ; - nif:endIndex "2880"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Georgia's" ; - nif:beginIndex "10312"^^xsd:nonNegativeInteger ; - nif:endIndex "10321"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "2694"^^xsd:nonNegativeInteger ; - nif:endIndex "2696"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Before" ; - nif:beginIndex "7310"^^xsd:nonNegativeInteger ; - nif:endIndex "7316"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "3527"^^xsd:nonNegativeInteger ; - nif:endIndex "3531"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "5258"^^xsd:nonNegativeInteger ; - nif:endIndex "5259"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7966"^^xsd:nonNegativeInteger ; - nif:endIndex "7968"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "6851"^^xsd:nonNegativeInteger ; - nif:endIndex "6853"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "House" ; - nif:beginIndex "9401"^^xsd:nonNegativeInteger ; - nif:endIndex "9406"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "warned" ; - nif:beginIndex "5930"^^xsd:nonNegativeInteger ; - nif:endIndex "5936"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "4071"^^xsd:nonNegativeInteger ; - nif:endIndex "4073"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "resolution" ; - nif:beginIndex "9331"^^xsd:nonNegativeInteger ; - nif:endIndex "9341"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The Hartsfield home is at 637 E. Pelham Rd. Aj." ; - nif:beginIndex "4932"^^xsd:nonNegativeInteger ; - nif:endIndex "4979"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "its" ; - nif:beginIndex "3234"^^xsd:nonNegativeInteger ; - nif:endIndex "3237"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "legislators" ; - nif:beginIndex "9684"^^xsd:nonNegativeInteger ; - nif:endIndex "9695"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "2691"^^xsd:nonNegativeInteger ; - nif:endIndex "2693"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "2666"^^xsd:nonNegativeInteger ; - nif:endIndex "2667"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "charged" ; - nif:beginIndex "4458"^^xsd:nonNegativeInteger ; - nif:endIndex "4465"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "8918"^^xsd:nonNegativeInteger ; - nif:endIndex "8920"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "8556"^^xsd:nonNegativeInteger ; - nif:endIndex "8558"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "--" ; - nif:beginIndex "7237"^^xsd:nonNegativeInteger ; - nif:endIndex "7239"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "John" ; - nif:beginIndex "5845"^^xsd:nonNegativeInteger ; - nif:endIndex "5849"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "of" ; - nif:beginIndex "774"^^xsd:nonNegativeInteger ; - nif:endIndex "776"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "9353"^^xsd:nonNegativeInteger ; - nif:endIndex "9355"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "suit" ; - nif:beginIndex "7961"^^xsd:nonNegativeInteger ; - nif:endIndex "7965"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "11845"^^xsd:nonNegativeInteger ; - nif:endIndex "11849"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "6751"^^xsd:nonNegativeInteger ; - nif:endIndex "6753"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "120"^^xsd:nonNegativeInteger ; - nif:endIndex "124"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Washington" ; - nif:beginIndex "10223"^^xsd:nonNegativeInteger ; - nif:endIndex "10233"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "might" ; - nif:beginIndex "2432"^^xsd:nonNegativeInteger ; - nif:endIndex "2437"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "work" ; - nif:beginIndex "3959"^^xsd:nonNegativeInteger ; - nif:endIndex "3963"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "The" ; - nif:beginIndex "7898"^^xsd:nonNegativeInteger ; - nif:endIndex "7901"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "investigation" ; - nif:beginIndex "44"^^xsd:nonNegativeInteger ; - nif:endIndex "57"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "7381"^^xsd:nonNegativeInteger ; - nif:endIndex "7382"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "influences" ; - nif:beginIndex "3509"^^xsd:nonNegativeInteger ; - nif:endIndex "3519"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Department" ; - nif:beginIndex "6796"^^xsd:nonNegativeInteger ; - nif:endIndex "6806"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The Republicans must hold a primary under the county unit system -- a system which the party opposes in its platform." ; - nif:beginIndex "6650"^^xsd:nonNegativeInteger ; - nif:endIndex "6767"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "899"^^xsd:nonNegativeInteger ; - nif:endIndex "902"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "8" ; - nif:beginIndex "5797"^^xsd:nonNegativeInteger ; - nif:endIndex "5798"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "It was marked by controversy, anonymous midnight phone calls and veiled threats of violence." ; - nif:beginIndex "11453"^^xsd:nonNegativeInteger ; - nif:endIndex "11545"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "roads" ; - nif:beginIndex "8637"^^xsd:nonNegativeInteger ; - nif:endIndex "8642"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "bonds" ; - nif:beginIndex "8404"^^xsd:nonNegativeInteger ; - nif:endIndex "8409"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "doctor" ; - nif:beginIndex "3847"^^xsd:nonNegativeInteger ; - nif:endIndex "3853"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "at" ; - nif:beginIndex "11273"^^xsd:nonNegativeInteger ; - nif:endIndex "11275"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "out" ; - nif:beginIndex "9310"^^xsd:nonNegativeInteger ; - nif:endIndex "9313"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "7945"^^xsd:nonNegativeInteger ; - nif:endIndex "7946"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Before adjournment Monday afternoon, the Senate is expected to approve a study of the number of legislators allotted to rural and urban areas to determine what adjustments should be made." ; - nif:beginIndex "7310"^^xsd:nonNegativeInteger ; - nif:endIndex "7497"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "1246"^^xsd:nonNegativeInteger ; - nif:endIndex "1251"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "policies" ; - nif:beginIndex "1587"^^xsd:nonNegativeInteger ; - nif:endIndex "1595"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "and" ; - nif:beginIndex "3902"^^xsd:nonNegativeInteger ; - nif:endIndex "3905"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "also" ; - nif:beginIndex "1714"^^xsd:nonNegativeInteger ; - nif:endIndex "1718"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "delegation" ; - nif:beginIndex "10209"^^xsd:nonNegativeInteger ; - nif:endIndex "10219"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "it" ; - nif:beginIndex "804"^^xsd:nonNegativeInteger ; - nif:endIndex "806"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "feel" ; - nif:beginIndex "2511"^^xsd:nonNegativeInteger ; - nif:endIndex "2515"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "these" ; - nif:beginIndex "975"^^xsd:nonNegativeInteger ; - nif:endIndex "980"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Williams" ; - nif:beginIndex "11102"^^xsd:nonNegativeInteger ; - nif:endIndex "11110"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "12193"^^xsd:nonNegativeInteger ; - nif:endIndex "12196"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "pay" ; - nif:beginIndex "10642"^^xsd:nonNegativeInteger ; - nif:endIndex "10645"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "rescind" ; - nif:beginIndex "8921"^^xsd:nonNegativeInteger ; - nif:endIndex "8928"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "1905"^^xsd:nonNegativeInteger ; - nif:endIndex "1906"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "9675"^^xsd:nonNegativeInteger ; - nif:endIndex "9677"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Friday" ; - nif:beginIndex "4437"^^xsd:nonNegativeInteger ; - nif:endIndex "4443"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Henry" ; - nif:beginIndex "4980"^^xsd:nonNegativeInteger ; - nif:endIndex "4985"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "through" ; - nif:beginIndex "2220"^^xsd:nonNegativeInteger ; - nif:endIndex "2227"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "12244"^^xsd:nonNegativeInteger ; - nif:endIndex "12245"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4268"^^xsd:nonNegativeInteger ; - nif:endIndex "4269"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "9868"^^xsd:nonNegativeInteger ; - nif:endIndex "9871"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "privilege" ; - nif:beginIndex "9370"^^xsd:nonNegativeInteger ; - nif:endIndex "9379"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Superior" ; - nif:beginIndex "4422"^^xsd:nonNegativeInteger ; - nif:endIndex "4430"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "6386"^^xsd:nonNegativeInteger ; - nif:endIndex "6391"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "be" ; - nif:beginIndex "5861"^^xsd:nonNegativeInteger ; - nif:endIndex "5863"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "3707"^^xsd:nonNegativeInteger ; - nif:endIndex "3708"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "operation" ; - nif:beginIndex "4146"^^xsd:nonNegativeInteger ; - nif:endIndex "4155"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "." ; - nif:beginIndex "9018"^^xsd:nonNegativeInteger ; - nif:endIndex "9019"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "619"^^xsd:nonNegativeInteger ; - nif:endIndex "620"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "first" ; - nif:beginIndex "6418"^^xsd:nonNegativeInteger ; - nif:endIndex "6423"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "a" ; - nif:beginIndex "6416"^^xsd:nonNegativeInteger ; - nif:endIndex "6417"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Purchasing" ; - nif:beginIndex "1472"^^xsd:nonNegativeInteger ; - nif:endIndex "1482"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "402" ; - nif:beginIndex "11082"^^xsd:nonNegativeInteger ; - nif:endIndex "11085"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "there" ; - nif:beginIndex "10163"^^xsd:nonNegativeInteger ; - nif:endIndex "10168"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "funds" ; - nif:beginIndex "1807"^^xsd:nonNegativeInteger ; - nif:endIndex "1812"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "court" ; - nif:beginIndex "2764"^^xsd:nonNegativeInteger ; - nif:endIndex "2769"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "state" ; - nif:beginIndex "5570"^^xsd:nonNegativeInteger ; - nif:endIndex "5575"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "on" ; - nif:beginIndex "9704"^^xsd:nonNegativeInteger ; - nif:endIndex "9706"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "gun" ; - nif:beginIndex "12051"^^xsd:nonNegativeInteger ; - nif:endIndex "12054"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "program" ; - nif:beginIndex "2120"^^xsd:nonNegativeInteger ; - nif:endIndex "2127"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "have" ; - nif:beginIndex "6358"^^xsd:nonNegativeInteger ; - nif:endIndex "6362"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "vote" ; - nif:beginIndex "6125"^^xsd:nonNegativeInteger ; - nif:endIndex "6129"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "mention" ; - nif:beginIndex "7625"^^xsd:nonNegativeInteger ; - nif:endIndex "7632"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "has" ; - nif:beginIndex "2776"^^xsd:nonNegativeInteger ; - nif:endIndex "2779"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "3394"^^xsd:nonNegativeInteger ; - nif:endIndex "3398"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "9110"^^xsd:nonNegativeInteger ; - nif:endIndex "9113"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "1747"^^xsd:nonNegativeInteger ; - nif:endIndex "1751"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "added" ; - nif:beginIndex "5725"^^xsd:nonNegativeInteger ; - nif:endIndex "5730"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "validity" ; - nif:beginIndex "7978"^^xsd:nonNegativeInteger ; - nif:endIndex "7986"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "said" ; - nif:beginIndex "8144"^^xsd:nonNegativeInteger ; - nif:endIndex "8148"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "The petition said that the couple has not lived together as man and wife for more than a year." ; - nif:beginIndex "4837"^^xsd:nonNegativeInteger ; - nif:endIndex "4931"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "county" ; - nif:beginIndex "11557"^^xsd:nonNegativeInteger ; - nif:endIndex "11563"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "was" ; - nif:beginIndex "11456"^^xsd:nonNegativeInteger ; - nif:endIndex "11459"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Georgia's" ; - nif:beginIndex "8074"^^xsd:nonNegativeInteger ; - nif:endIndex "8083"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Sentence ; - nif:anchorOf "Fulton legislators ``work with city officials to pass enabling legislation that will permit the establishment of a fair and equitable'' pension plan for city employes." ; - nif:beginIndex "3938"^^xsd:nonNegativeInteger ; - nif:endIndex "4105"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext ; - nif:word , , , , , , , , , , , , , , , , , , , , , , , , , , , . - - - a nif:Word ; - nif:anchorOf "jury" ; - nif:beginIndex "2940"^^xsd:nonNegativeInteger ; - nif:endIndex "2944"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "which" ; - nif:beginIndex "9391"^^xsd:nonNegativeInteger ; - nif:endIndex "9396"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "A" ; - nif:beginIndex "9751"^^xsd:nonNegativeInteger ; - nif:endIndex "9752"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "the" ; - nif:beginIndex "4289"^^xsd:nonNegativeInteger ; - nif:endIndex "4292"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Tabb" ; - nif:beginIndex "12103"^^xsd:nonNegativeInteger ; - nif:endIndex "12107"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "4511"^^xsd:nonNegativeInteger ; - nif:endIndex "4512"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "proposed" ; - nif:beginIndex "1309"^^xsd:nonNegativeInteger ; - nif:endIndex "1317"^^xsd:nonNegativeInteger ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "to" ; - nif:beginIndex "7828"^^xsd:nonNegativeInteger ; - nif:endIndex "7830"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "passed" ; - nif:beginIndex "10276"^^xsd:nonNegativeInteger ; - nif:endIndex "10282"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "," ; - nif:beginIndex "7003"^^xsd:nonNegativeInteger ; - nif:endIndex "7004"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "none" ; - nif:beginIndex "2337"^^xsd:nonNegativeInteger ; - nif:endIndex "2341"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Committee" ; - nif:beginIndex "227"^^xsd:nonNegativeInteger ; - nif:endIndex "236"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "this" ; - nif:beginIndex "1647"^^xsd:nonNegativeInteger ; - nif:endIndex "1651"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "city" ; - nif:beginIndex "4091"^^xsd:nonNegativeInteger ; - nif:endIndex "4095"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "in" ; - nif:beginIndex "1523"^^xsd:nonNegativeInteger ; - nif:endIndex "1525"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "''" ; - nif:beginIndex "117"^^xsd:nonNegativeInteger ; - nif:endIndex "119"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "counties" ; - nif:beginIndex "2480"^^xsd:nonNegativeInteger ; - nif:endIndex "2488"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "smooth" ; - nif:beginIndex "12183"^^xsd:nonNegativeInteger ; - nif:endIndex "12189"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "11957"^^xsd:nonNegativeInteger ; - nif:endIndex "11961"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "that" ; - nif:beginIndex "4013"^^xsd:nonNegativeInteger ; - nif:endIndex "4017"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:previousWord ; - nif:referenceContext ; - nif:sentence . - - - a nif:Word ; - nif:anchorOf "Merger" ; - nif:beginIndex "1302"^^xsd:nonNegativeInteger ; - nif:endIndex "1308"^^xsd:nonNegativeInteger ; - nif:nextWord ; - nif:referenceContext ; - nif:sentence . - -rdf:Bag a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdf:Bag , rdfs:Container . - -rdf:Seq a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdf:Seq , rdfs:Container . - -rdfs:Datatype a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Datatype , rdfs:Class , rdfs:Resource . - -rdf:Alt a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdf:Alt , rdfs:Container . - -rdfs:Container a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Container . - -rdfs:ContainerMembershipProperty - a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:ContainerMembershipProperty , rdf:Property , rdfs:Resource . - -rdfs:isDefinedBy a rdf:Property , rdfs:Resource ; - rdfs:subPropertyOf rdfs:isDefinedBy , rdfs:seeAlso . - -rdfs:seeAlso a rdf:Property , rdfs:Resource ; - rdfs:subPropertyOf rdfs:seeAlso . - -nif:Word a rdfs:Class , rdfs:Resource . - -nif:Sentence a rdfs:Class , rdfs:Resource . - -nif:Context a rdfs:Class , rdfs:Resource . diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/freme/README.txt b/dkpro-core-io-nif-asl/src/test/resources/nif/freme/README.txt new file mode 100644 index 0000000000..97668dee4e --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/nif/freme/README.txt @@ -0,0 +1,2 @@ +Resource: https://github.com/freme-project/e-Entity/issues/56 + diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme-cooked-ref.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme-cooked-ref.ttl new file mode 100644 index 0000000000..4970263834 --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme-cooked-ref.ttl @@ -0,0 +1,21 @@ +@prefix rdf: . +@prefix owl: . +@prefix xsd: . +@prefix itsrdf: . +@prefix nif: . +@prefix rdfs: . + + + a nif:OffsetBasedString , nif:Context ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "140"^^xsd:nonNegativeInteger ; + nif:isString "This meant the Aos Sí (pronounced ees shee), the 'spirits' or 'fairies', could more easily come into our world and were particularly active." . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Aos Sí" ; + nif:beginIndex "15"^^xsd:nonNegativeInteger ; + nif:endIndex "21"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taClassRef owl:Thing ; + itsrdf:taIdentRef . diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme-cooked.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme-cooked.ttl new file mode 100644 index 0000000000..7f48874632 --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme-cooked.ttl @@ -0,0 +1,32 @@ +@prefix dbc: . +@prefix dbpedia-fr: . +@prefix owl: . +@prefix dbpedia-es: . +@prefix xsd: . +@prefix itsrdf: . +@prefix nif: . +@prefix dbpedia: . +@prefix rdfs: . +@prefix dbpedia-de: . +@prefix dbpedia-ru: . +@prefix freme-onto: . +@prefix rdf: . +@prefix dbpedia-nl: . +@prefix dcterms: . +@prefix dbpedia-it: . + + + a nif:RFC5147String , nif:Context , nif:String ; + nif:beginIndex "0"^^xsd:int ; + nif:endIndex "140"^^xsd:int ; + nif:isString "This meant the Aos Sí (pronounced ees shee), the 'spirits' or 'fairies', could more easily come into our world and were particularly active." . + + + a nif:RFC5147String , nif:String , nif:Word , nif:Phrase ; + nif:anchorOf "Aos Sí" ; + nif:beginIndex "15"^^xsd:int ; + nif:endIndex "21"^^xsd:int ; + nif:referenceContext ; + itsrdf:taClassRef owl:Thing ; + itsrdf:taConfidence "0.4859081423223223"^^xsd:double ; + itsrdf:taIdentRef . diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme.ttl new file mode 100644 index 0000000000..1efc958f12 --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/nif/freme/freme.ttl @@ -0,0 +1,30 @@ +@prefix dbpedia-fr: . +@prefix dbc: . +@prefix dbpedia-es: . +@prefix xsd: . +@prefix itsrdf: . +@prefix dbpedia: . +@prefix rdfs: . +@prefix nif: . +@prefix dbpedia-de: . +@prefix dbpedia-ru: . +@prefix freme-onto: . +@prefix dbpedia-nl: . +@prefix dcterms: . +@prefix dbpedia-it: . + + + a nif:String , nif:Context , nif:RFC5147String ; + nif:beginIndex "0"^^xsd:int ; + nif:endIndex "140"^^xsd:int ; + nif:isString "This meant the Aos Sí (pronounced ees shee), the 'spirits' or 'fairies', could more easily come into our world and were particularly active."^^xsd:string . + + + a nif:RFC5147String , nif:String , nif:Word , nif:Phrase ; + nif:anchorOf "Aos Sí"^^xsd:string ; + nif:beginIndex "15"^^xsd:int ; + nif:endIndex "21"^^xsd:int ; + nif:referenceContext ; + itsrdf:taClassRef ; + itsrdf:taConfidence "0.4859081423223223"^^xsd:double ; + itsrdf:taIdentRef . diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/kore50/kore50-cooked-ref.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/kore50/kore50-cooked-ref.ttl new file mode 100644 index 0000000000..f3e84c8c75 --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/nif/kore50/kore50-cooked-ref.ttl @@ -0,0 +1,1612 @@ +@prefix rdf: . +@prefix owl: . +@prefix xsd: . +@prefix itsrdf: . +@prefix nif: . +@prefix rdfs: . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "American Recordings" ; + nif:beginIndex "1223"^^xsd:nonNegativeInteger ; + nif:endIndex "1242"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Beck" ; + nif:beginIndex "826"^^xsd:nonNegativeInteger ; + nif:endIndex "830"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Euro" ; + nif:beginIndex "3290"^^xsd:nonNegativeInteger ; + nif:endIndex "3294"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Apple" ; + nif:beginIndex "2323"^^xsd:nonNegativeInteger ; + nif:endIndex "2328"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Isle of Wight festival" ; + nif:beginIndex "1487"^^xsd:nonNegativeInteger ; + nif:endIndex "1509"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Mitchell" ; + nif:beginIndex "1603"^^xsd:nonNegativeInteger ; + nif:endIndex "1611"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Santana" ; + nif:beginIndex "1083"^^xsd:nonNegativeInteger ; + nif:endIndex "1090"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Paris" ; + nif:beginIndex "450"^^xsd:nonNegativeInteger ; + nif:endIndex "455"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Jones" ; + nif:beginIndex "1327"^^xsd:nonNegativeInteger ; + nif:endIndex "1332"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Context ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "3779"^^xsd:nonNegativeInteger ; + nif:isString "David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven.\nDavid and Victoria added spice to their marriage.\nTiger was lost in the woods when he got divorced from Elin.\nTiger lost the US Open.\nMadonna played Eva and was seen with Carlos.\nIn this musical, Madonna played the role of the First Lady.\nAngelina, her father Jon, and her partner Brad never played together in the same movie.\nHeidi and her husband Seal live in Vegas.\nParis and Kim are both wealthy It Girls who had sex tapes on the Internet.\nJustin, Stefani, and Kate are among the most popular people on both MTV and Twitter.\nDylan performed Hurricane about the black fighter Carter, from his album Desire.\nDesire contains a duet with Harris in the song Joey.\nThree of the greatest guitarists started their career in a single band : Clapton, Beck, and Page.\nAllen founded the EMP in Seattle, which featured exhibitions about Hendrix and Dylan, but also about various science fiction movies.\nDespite featuring some of the most promininent musicians of their decade --- like Sinatra, Dylan, Joel, and Santana --- Columbia was aquired by Sony in the 1980s.\nAfter unsuccessful years, aging country star Cash made a grandiose comeback with his American Recordings, recorded at his home with the help of Rubin.\nThe group formed by Homme, Grohl, and Jones was supposed to be named Caligula, but the name was already taken.\nJobs and Baez dated in the late 1970s, and she performed at his Stanford memorial.\nThe Isle of Wight festival in 1970 was the biggest at its time, surpassing Woodstock with acts like Davis, Chicago, and Mitchell.\nEric preferred to play Blues instead of Rock, so he joined Mayall 's band.\nAfter the death of Steve, the former CEO of Apple, his commencement speech at Stanford was watched thousands of times.\nIn 1980, Steve dropped out of Stanford to join Microsoft, the company behind the Windows operating system.\nCairo was the code name for a project at Microsoft from 1991 to 1996. Its charter was to build technologies for a next generation operating system that would fulfill the vision of Bill.\nSteve, Bill, Sergey, and Larry have drawn a great deal of admiration these days for their pioneering successes that changed the world we live in.\nKarl and Theo made their extreme fortunes selling low-price groceries.\nWhile Apple is an electronics company, Mango is a clothing one and Orange is a communication one.\nSam, the co-founder of Equity International, was given the nickname of \\\"the grave dancer\\\" because of his ability to buy businesses that others thought were dead.\nPixar produced Cars, and John directed it.\nMars, Galaxy, and Bounty are all chocolate.\nBosch and Sharp are both home appliances producing companies.\nCity won 3:2 against the Spurs.\nThe Gunners now play their home matches at the Emirates.\nAtletico has beaten its archrival Real.\nMüller scored a hattrick against England.\nThomas and Mario are strikers playing in Munich.\nHaug congratulated Red Bull.\nVöller will never forget the match against Oranje in San Siro.\nLandgraf and Meijer played at the Tivoli.\nYabo plays for Aachen.\nHertha won against Dortmund.\nNixon resigned after Watergate despite his success in the Ping-Pong Diplomacy with China.\nThe Sun and The Times reported that Greece will have to leave the Euro soon.\nThe Enola Gay bombed Hiroshima at the end of Second World War.\nThe RAF was a terrorist group led by Baader and Meinhof that killed Schleyer.\nOnassis married Kennedy on October 20, 1968.\nArmstrong was the first man on the Moon.\nErich was born in Neunkirchen.\nMacedonia is a province of Greece.\nObama welcomed Merkel upon her arrival at JFK.\nKennedy was also an active politician, yet he is most known for his writings, some of which he published under the name of Mark Littleton." . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Chicago" ; + nif:beginIndex "1590"^^xsd:nonNegativeInteger ; + nif:endIndex "1597"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Müller scored a hattrick against England." ; + nif:beginIndex "2857"^^xsd:nonNegativeInteger ; + nif:endIndex "2898"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "England" ; + nif:beginIndex "2890"^^xsd:nonNegativeInteger ; + nif:endIndex "2897"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Equity International" ; + nif:beginIndex "2438"^^xsd:nonNegativeInteger ; + nif:endIndex "2458"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "David" ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "5"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Yabo" ; + nif:beginIndex "3082"^^xsd:nonNegativeInteger ; + nif:endIndex "3086"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Grohl" ; + nif:beginIndex "1316"^^xsd:nonNegativeInteger ; + nif:endIndex "1321"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Bosch and Sharp are both home appliances producing companies." ; + nif:beginIndex "2666"^^xsd:nonNegativeInteger ; + nif:endIndex "2727"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Völler" ; + nif:beginIndex "2977"^^xsd:nonNegativeInteger ; + nif:endIndex "2983"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Atletico has beaten its archrival Real." ; + nif:beginIndex "2817"^^xsd:nonNegativeInteger ; + nif:endIndex "2856"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Ping-Pong Diplomacy" ; + nif:beginIndex "3192"^^xsd:nonNegativeInteger ; + nif:endIndex "3211"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Haug" ; + nif:beginIndex "2948"^^xsd:nonNegativeInteger ; + nif:endIndex "2952"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Three of the greatest guitarists started their career in a single band : Clapton, Beck, and Page." ; + nif:beginIndex "744"^^xsd:nonNegativeInteger ; + nif:endIndex "841"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Steve" ; + nif:beginIndex "2100"^^xsd:nonNegativeInteger ; + nif:endIndex "2105"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Joey" ; + nif:beginIndex "738"^^xsd:nonNegativeInteger ; + nif:endIndex "742"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Greece" ; + nif:beginIndex "3260"^^xsd:nonNegativeInteger ; + nif:endIndex "3266"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Gunners" ; + nif:beginIndex "2764"^^xsd:nonNegativeInteger ; + nif:endIndex "2771"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Real" ; + nif:beginIndex "2851"^^xsd:nonNegativeInteger ; + nif:endIndex "2855"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "San Siro" ; + nif:beginIndex "3030"^^xsd:nonNegativeInteger ; + nif:endIndex "3038"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Allen founded the EMP in Seattle, which featured exhibitions about Hendrix and Dylan, but also about various science fiction movies." ; + nif:beginIndex "842"^^xsd:nonNegativeInteger ; + nif:endIndex "974"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Erich" ; + nif:beginIndex "3528"^^xsd:nonNegativeInteger ; + nif:endIndex "3533"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Eric preferred to play Blues instead of Rock, so he joined Mayall 's band." ; + nif:beginIndex "1613"^^xsd:nonNegativeInteger ; + nif:endIndex "1687"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Isle of Wight festival in 1970 was the biggest at its time, surpassing Woodstock with acts like Davis, Chicago, and Mitchell." ; + nif:beginIndex "1483"^^xsd:nonNegativeInteger ; + nif:endIndex "1612"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Tiger" ; + nif:beginIndex "131"^^xsd:nonNegativeInteger ; + nif:endIndex "136"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Jobs" ; + nif:beginIndex "1400"^^xsd:nonNegativeInteger ; + nif:endIndex "1404"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Larry" ; + nif:beginIndex "2125"^^xsd:nonNegativeInteger ; + nif:endIndex "2130"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Schleyer" ; + nif:beginIndex "3432"^^xsd:nonNegativeInteger ; + nif:endIndex "3440"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Desire contains a duet with Harris in the song Joey." ; + nif:beginIndex "691"^^xsd:nonNegativeInteger ; + nif:endIndex "743"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Hiroshima" ; + nif:beginIndex "3322"^^xsd:nonNegativeInteger ; + nif:endIndex "3331"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Columbia" ; + nif:beginIndex "1095"^^xsd:nonNegativeInteger ; + nif:endIndex "1103"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Gunners now play their home matches at the Emirates." ; + nif:beginIndex "2760"^^xsd:nonNegativeInteger ; + nif:endIndex "2816"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Madonna" ; + nif:beginIndex "277"^^xsd:nonNegativeInteger ; + nif:endIndex "284"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Madonna" ; + nif:beginIndex "215"^^xsd:nonNegativeInteger ; + nif:endIndex "222"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Homme" ; + nif:beginIndex "1309"^^xsd:nonNegativeInteger ; + nif:endIndex "1314"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Pixar" ; + nif:beginIndex "2579"^^xsd:nonNegativeInteger ; + nif:endIndex "2584"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Carter" ; + nif:beginIndex "660"^^xsd:nonNegativeInteger ; + nif:endIndex "666"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Steve" ; + nif:beginIndex "1816"^^xsd:nonNegativeInteger ; + nif:endIndex "1821"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Macedonia is a province of Greece." ; + nif:beginIndex "3559"^^xsd:nonNegativeInteger ; + nif:endIndex "3593"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Haug congratulated Red Bull." ; + nif:beginIndex "2948"^^xsd:nonNegativeInteger ; + nif:endIndex "2976"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Landgraf and Meijer played at the Tivoli." ; + nif:beginIndex "3040"^^xsd:nonNegativeInteger ; + nif:endIndex "3081"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Neunkirchen" ; + nif:beginIndex "3546"^^xsd:nonNegativeInteger ; + nif:endIndex "3557"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "China" ; + nif:beginIndex "3217"^^xsd:nonNegativeInteger ; + nif:endIndex "3222"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Paris and Kim are both wealthy It Girls who had sex tapes on the Internet." ; + nif:beginIndex "450"^^xsd:nonNegativeInteger ; + nif:endIndex "524"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Pixar produced Cars, and John directed it." ; + nif:beginIndex "2579"^^xsd:nonNegativeInteger ; + nif:endIndex "2621"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Stanford" ; + nif:beginIndex "1766"^^xsd:nonNegativeInteger ; + nif:endIndex "1774"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Page" ; + nif:beginIndex "836"^^xsd:nonNegativeInteger ; + nif:endIndex "840"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Bounty" ; + nif:beginIndex "2640"^^xsd:nonNegativeInteger ; + nif:endIndex "2646"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Harris" ; + nif:beginIndex "719"^^xsd:nonNegativeInteger ; + nif:endIndex "725"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Aachen" ; + nif:beginIndex "3097"^^xsd:nonNegativeInteger ; + nif:endIndex "3103"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Madonna played Eva and was seen with Carlos." ; + nif:beginIndex "215"^^xsd:nonNegativeInteger ; + nif:endIndex "259"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Despite featuring some of the most promininent musicians of their decade --- like Sinatra, Dylan, Joel, and Santana --- Columbia was aquired by Sony in the 1980s." ; + nif:beginIndex "975"^^xsd:nonNegativeInteger ; + nif:endIndex "1137"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Dylan" ; + nif:beginIndex "610"^^xsd:nonNegativeInteger ; + nif:endIndex "615"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Atletico" ; + nif:beginIndex "2817"^^xsd:nonNegativeInteger ; + nif:endIndex "2825"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Nixon resigned after Watergate despite his success in the Ping-Pong Diplomacy with China." ; + nif:beginIndex "3134"^^xsd:nonNegativeInteger ; + nif:endIndex "3223"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Red Bull" ; + nif:beginIndex "2967"^^xsd:nonNegativeInteger ; + nif:endIndex "2975"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "In this musical, Madonna played the role of the First Lady." ; + nif:beginIndex "260"^^xsd:nonNegativeInteger ; + nif:endIndex "319"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Dylan" ; + nif:beginIndex "1066"^^xsd:nonNegativeInteger ; + nif:endIndex "1071"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Stanford" ; + nif:beginIndex "1837"^^xsd:nonNegativeInteger ; + nif:endIndex "1845"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Vegas" ; + nif:beginIndex "443"^^xsd:nonNegativeInteger ; + nif:endIndex "448"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Mango" ; + nif:beginIndex "2356"^^xsd:nonNegativeInteger ; + nif:endIndex "2361"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Tiger lost the US Open." ; + nif:beginIndex "191"^^xsd:nonNegativeInteger ; + nif:endIndex "214"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Joel" ; + nif:beginIndex "1073"^^xsd:nonNegativeInteger ; + nif:endIndex "1077"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Onassis" ; + nif:beginIndex "3442"^^xsd:nonNegativeInteger ; + nif:endIndex "3449"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Microsoft" ; + nif:beginIndex "1955"^^xsd:nonNegativeInteger ; + nif:endIndex "1964"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Hertha won against Dortmund." ; + nif:beginIndex "3105"^^xsd:nonNegativeInteger ; + nif:endIndex "3133"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "City" ; + nif:beginIndex "2728"^^xsd:nonNegativeInteger ; + nif:endIndex "2732"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Sharp" ; + nif:beginIndex "2676"^^xsd:nonNegativeInteger ; + nif:endIndex "2681"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Steve, Bill, Sergey, and Larry have drawn a great deal of admiration these days for their pioneering successes that changed the world we live in." ; + nif:beginIndex "2100"^^xsd:nonNegativeInteger ; + nif:endIndex "2245"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Clapton" ; + nif:beginIndex "817"^^xsd:nonNegativeInteger ; + nif:endIndex "824"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Hurricane" ; + nif:beginIndex "626"^^xsd:nonNegativeInteger ; + nif:endIndex "635"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Bill" ; + nif:beginIndex "2094"^^xsd:nonNegativeInteger ; + nif:endIndex "2098"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Nixon" ; + nif:beginIndex "3134"^^xsd:nonNegativeInteger ; + nif:endIndex "3139"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Dylan" ; + nif:beginIndex "921"^^xsd:nonNegativeInteger ; + nif:endIndex "926"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Enola Gay bombed Hiroshima at the end of Second World War." ; + nif:beginIndex "3301"^^xsd:nonNegativeInteger ; + nif:endIndex "3363"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Kennedy was also an active politician, yet he is most known for his writings, some of which he published under the name of Mark Littleton." ; + nif:beginIndex "3641"^^xsd:nonNegativeInteger ; + nif:endIndex "3779"^^xsd:nonNegativeInteger ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "The Sun" ; + nif:beginIndex "3224"^^xsd:nonNegativeInteger ; + nif:endIndex "3231"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "MTV" ; + nif:beginIndex "593"^^xsd:nonNegativeInteger ; + nif:endIndex "596"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Obama" ; + nif:beginIndex "3594"^^xsd:nonNegativeInteger ; + nif:endIndex "3599"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Onassis married Kennedy on October 20, 1968." ; + nif:beginIndex "3442"^^xsd:nonNegativeInteger ; + nif:endIndex "3486"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "John" ; + nif:beginIndex "2604"^^xsd:nonNegativeInteger ; + nif:endIndex "2608"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "US Open" ; + nif:beginIndex "206"^^xsd:nonNegativeInteger ; + nif:endIndex "213"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Angelina" ; + nif:beginIndex "320"^^xsd:nonNegativeInteger ; + nif:endIndex "328"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "The Times" ; + nif:beginIndex "3236"^^xsd:nonNegativeInteger ; + nif:endIndex "3245"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Armstrong was the first man on the Moon." ; + nif:beginIndex "3487"^^xsd:nonNegativeInteger ; + nif:endIndex "3527"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Angelina, her father Jon, and her partner Brad never played together in the same movie." ; + nif:beginIndex "320"^^xsd:nonNegativeInteger ; + nif:endIndex "407"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The RAF was a terrorist group led by Baader and Meinhof that killed Schleyer." ; + nif:beginIndex "3364"^^xsd:nonNegativeInteger ; + nif:endIndex "3441"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Desire" ; + nif:beginIndex "683"^^xsd:nonNegativeInteger ; + nif:endIndex "689"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "JFK" ; + nif:beginIndex "3636"^^xsd:nonNegativeInteger ; + nif:endIndex "3639"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Second World War" ; + nif:beginIndex "3346"^^xsd:nonNegativeInteger ; + nif:endIndex "3362"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Eric" ; + nif:beginIndex "1613"^^xsd:nonNegativeInteger ; + nif:endIndex "1617"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Carlos" ; + nif:beginIndex "252"^^xsd:nonNegativeInteger ; + nif:endIndex "258"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Stanford" ; + nif:beginIndex "1464"^^xsd:nonNegativeInteger ; + nif:endIndex "1472"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Heidi and her husband Seal live in Vegas." ; + nif:beginIndex "408"^^xsd:nonNegativeInteger ; + nif:endIndex "449"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Rubin" ; + nif:beginIndex "1282"^^xsd:nonNegativeInteger ; + nif:endIndex "1287"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "David and Victoria added spice to their marriage." ; + nif:beginIndex "81"^^xsd:nonNegativeInteger ; + nif:endIndex "130"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Elin" ; + nif:beginIndex "185"^^xsd:nonNegativeInteger ; + nif:endIndex "189"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Orange" ; + nif:beginIndex "2384"^^xsd:nonNegativeInteger ; + nif:endIndex "2390"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Seal" ; + nif:beginIndex "430"^^xsd:nonNegativeInteger ; + nif:endIndex "434"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Greece" ; + nif:beginIndex "3586"^^xsd:nonNegativeInteger ; + nif:endIndex "3592"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Windows" ; + nif:beginIndex "1888"^^xsd:nonNegativeInteger ; + nif:endIndex "1895"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Tiger" ; + nif:beginIndex "191"^^xsd:nonNegativeInteger ; + nif:endIndex "196"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Mars, Galaxy, and Bounty are all chocolate." ; + nif:beginIndex "2622"^^xsd:nonNegativeInteger ; + nif:endIndex "2665"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "David" ; + nif:beginIndex "81"^^xsd:nonNegativeInteger ; + nif:endIndex "86"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Stefani" ; + nif:beginIndex "533"^^xsd:nonNegativeInteger ; + nif:endIndex "540"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Tivoli" ; + nif:beginIndex "3074"^^xsd:nonNegativeInteger ; + nif:endIndex "3080"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Kim" ; + nif:beginIndex "460"^^xsd:nonNegativeInteger ; + nif:endIndex "463"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Thomas and Mario are strikers playing in Munich." ; + nif:beginIndex "2899"^^xsd:nonNegativeInteger ; + nif:endIndex "2947"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Karl" ; + nif:beginIndex "2246"^^xsd:nonNegativeInteger ; + nif:endIndex "2250"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Thomas" ; + nif:beginIndex "2899"^^xsd:nonNegativeInteger ; + nif:endIndex "2905"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "After the death of Steve, the former CEO of Apple, his commencement speech at Stanford was watched thousands of times." ; + nif:beginIndex "1688"^^xsd:nonNegativeInteger ; + nif:endIndex "1806"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Justin, Stefani, and Kate are among the most popular people on both MTV and Twitter." ; + nif:beginIndex "525"^^xsd:nonNegativeInteger ; + nif:endIndex "609"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Kennedy" ; + nif:beginIndex "3641"^^xsd:nonNegativeInteger ; + nif:endIndex "3648"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "EMP" ; + nif:beginIndex "860"^^xsd:nonNegativeInteger ; + nif:endIndex "863"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Meijer" ; + nif:beginIndex "3053"^^xsd:nonNegativeInteger ; + nif:endIndex "3059"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Eva" ; + nif:beginIndex "230"^^xsd:nonNegativeInteger ; + nif:endIndex "233"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "City won 3:2 against the Spurs." ; + nif:beginIndex "2728"^^xsd:nonNegativeInteger ; + nif:endIndex "2759"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The Sun and The Times reported that Greece will have to leave the Euro soon." ; + nif:beginIndex "3224"^^xsd:nonNegativeInteger ; + nif:endIndex "3300"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Karl and Theo made their extreme fortunes selling low-price groceries." ; + nif:beginIndex "2246"^^xsd:nonNegativeInteger ; + nif:endIndex "2316"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Victoria" ; + nif:beginIndex "91"^^xsd:nonNegativeInteger ; + nif:endIndex "99"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Blues" ; + nif:beginIndex "1636"^^xsd:nonNegativeInteger ; + nif:endIndex "1641"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Microsoft" ; + nif:beginIndex "1854"^^xsd:nonNegativeInteger ; + nif:endIndex "1863"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Erich was born in Neunkirchen." ; + nif:beginIndex "3528"^^xsd:nonNegativeInteger ; + nif:endIndex "3558"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Sam, the co-founder of Equity International, was given the nickname of \\\"the grave dancer\\\" because of his ability to buy businesses that others thought were dead." ; + nif:beginIndex "2415"^^xsd:nonNegativeInteger ; + nif:endIndex "2578"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Baez" ; + nif:beginIndex "1409"^^xsd:nonNegativeInteger ; + nif:endIndex "1413"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Mayall" ; + nif:beginIndex "1672"^^xsd:nonNegativeInteger ; + nif:endIndex "1678"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Merkel" ; + nif:beginIndex "3609"^^xsd:nonNegativeInteger ; + nif:endIndex "3615"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Rock" ; + nif:beginIndex "1653"^^xsd:nonNegativeInteger ; + nif:endIndex "1657"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "The group formed by Homme, Grohl, and Jones was supposed to be named Caligula, but the name was already taken." ; + nif:beginIndex "1289"^^xsd:nonNegativeInteger ; + nif:endIndex "1399"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Landgraf" ; + nif:beginIndex "3040"^^xsd:nonNegativeInteger ; + nif:endIndex "3048"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Mars" ; + nif:beginIndex "2622"^^xsd:nonNegativeInteger ; + nif:endIndex "2626"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Cairo was the code name for a project at Microsoft from 1991 to 1996. Its charter was to build technologies for a next generation operating system that would fulfill the vision of Bill." ; + nif:beginIndex "1914"^^xsd:nonNegativeInteger ; + nif:endIndex "2099"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Armstrong" ; + nif:beginIndex "3487"^^xsd:nonNegativeInteger ; + nif:endIndex "3496"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Meinhof" ; + nif:beginIndex "3412"^^xsd:nonNegativeInteger ; + nif:endIndex "3419"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Theo" ; + nif:beginIndex "2255"^^xsd:nonNegativeInteger ; + nif:endIndex "2259"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Steve" ; + nif:beginIndex "1707"^^xsd:nonNegativeInteger ; + nif:endIndex "1712"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Tiger was lost in the woods when he got divorced from Elin." ; + nif:beginIndex "131"^^xsd:nonNegativeInteger ; + nif:endIndex "190"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Macedonia" ; + nif:beginIndex "3559"^^xsd:nonNegativeInteger ; + nif:endIndex "3568"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Kennedy" ; + nif:beginIndex "3458"^^xsd:nonNegativeInteger ; + nif:endIndex "3465"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Woodstock" ; + nif:beginIndex "1558"^^xsd:nonNegativeInteger ; + nif:endIndex "1567"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Desire" ; + nif:beginIndex "691"^^xsd:nonNegativeInteger ; + nif:endIndex "697"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Sony" ; + nif:beginIndex "1119"^^xsd:nonNegativeInteger ; + nif:endIndex "1123"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Cash" ; + nif:beginIndex "1183"^^xsd:nonNegativeInteger ; + nif:endIndex "1187"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Müller" ; + nif:beginIndex "2857"^^xsd:nonNegativeInteger ; + nif:endIndex "2863"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Sergey" ; + nif:beginIndex "2113"^^xsd:nonNegativeInteger ; + nif:endIndex "2119"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven." ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "80"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Twitter" ; + nif:beginIndex "601"^^xsd:nonNegativeInteger ; + nif:endIndex "608"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Mario" ; + nif:beginIndex "2910"^^xsd:nonNegativeInteger ; + nif:endIndex "2915"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Jon" ; + nif:beginIndex "341"^^xsd:nonNegativeInteger ; + nif:endIndex "344"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Emirates" ; + nif:beginIndex "2807"^^xsd:nonNegativeInteger ; + nif:endIndex "2815"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Munich" ; + nif:beginIndex "2940"^^xsd:nonNegativeInteger ; + nif:endIndex "2946"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Cairo" ; + nif:beginIndex "1914"^^xsd:nonNegativeInteger ; + nif:endIndex "1919"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Enola Gay" ; + nif:beginIndex "3305"^^xsd:nonNegativeInteger ; + nif:endIndex "3314"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Brad" ; + nif:beginIndex "362"^^xsd:nonNegativeInteger ; + nif:endIndex "366"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Sam" ; + nif:beginIndex "2415"^^xsd:nonNegativeInteger ; + nif:endIndex "2418"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Heidi" ; + nif:beginIndex "408"^^xsd:nonNegativeInteger ; + nif:endIndex "413"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Völler will never forget the match against Oranje in San Siro." ; + nif:beginIndex "2977"^^xsd:nonNegativeInteger ; + nif:endIndex "3039"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Bosch" ; + nif:beginIndex "2666"^^xsd:nonNegativeInteger ; + nif:endIndex "2671"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Oranje" ; + nif:beginIndex "3020"^^xsd:nonNegativeInteger ; + nif:endIndex "3026"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Kate" ; + nif:beginIndex "546"^^xsd:nonNegativeInteger ; + nif:endIndex "550"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Galaxy" ; + nif:beginIndex "2628"^^xsd:nonNegativeInteger ; + nif:endIndex "2634"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Allen" ; + nif:beginIndex "842"^^xsd:nonNegativeInteger ; + nif:endIndex "847"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Jobs and Baez dated in the late 1970s, and she performed at his Stanford memorial." ; + nif:beginIndex "1400"^^xsd:nonNegativeInteger ; + nif:endIndex "1482"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Dylan performed Hurricane about the black fighter Carter, from his album Desire." ; + nif:beginIndex "610"^^xsd:nonNegativeInteger ; + nif:endIndex "690"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Watergate" ; + nif:beginIndex "3155"^^xsd:nonNegativeInteger ; + nif:endIndex "3164"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Davis" ; + nif:beginIndex "1583"^^xsd:nonNegativeInteger ; + nif:endIndex "1588"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Yabo plays for Aachen." ; + nif:beginIndex "3082"^^xsd:nonNegativeInteger ; + nif:endIndex "3104"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Victoria" ; + nif:beginIndex "10"^^xsd:nonNegativeInteger ; + nif:endIndex "18"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Hendrix" ; + nif:beginIndex "909"^^xsd:nonNegativeInteger ; + nif:endIndex "916"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "RAF" ; + nif:beginIndex "3368"^^xsd:nonNegativeInteger ; + nif:endIndex "3371"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Seattle" ; + nif:beginIndex "867"^^xsd:nonNegativeInteger ; + nif:endIndex "874"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Baader" ; + nif:beginIndex "3401"^^xsd:nonNegativeInteger ; + nif:endIndex "3407"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "After unsuccessful years, aging country star Cash made a grandiose comeback with his American Recordings, recorded at his home with the help of Rubin." ; + nif:beginIndex "1138"^^xsd:nonNegativeInteger ; + nif:endIndex "1288"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Apple" ; + nif:beginIndex "1732"^^xsd:nonNegativeInteger ; + nif:endIndex "1737"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Moon" ; + nif:beginIndex "3522"^^xsd:nonNegativeInteger ; + nif:endIndex "3526"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Justin" ; + nif:beginIndex "525"^^xsd:nonNegativeInteger ; + nif:endIndex "531"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "In 1980, Steve dropped out of Stanford to join Microsoft, the company behind the Windows operating system." ; + nif:beginIndex "1807"^^xsd:nonNegativeInteger ; + nif:endIndex "1913"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Sinatra" ; + nif:beginIndex "1057"^^xsd:nonNegativeInteger ; + nif:endIndex "1064"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "First Lady" ; + nif:beginIndex "308"^^xsd:nonNegativeInteger ; + nif:endIndex "318"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Spurs" ; + nif:beginIndex "2753"^^xsd:nonNegativeInteger ; + nif:endIndex "2758"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Cars" ; + nif:beginIndex "2594"^^xsd:nonNegativeInteger ; + nif:endIndex "2598"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Dortmund" ; + nif:beginIndex "3124"^^xsd:nonNegativeInteger ; + nif:endIndex "3132"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Bill" ; + nif:beginIndex "2107"^^xsd:nonNegativeInteger ; + nif:endIndex "2111"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Hertha" ; + nif:beginIndex "3105"^^xsd:nonNegativeInteger ; + nif:endIndex "3111"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "Obama welcomed Merkel upon her arrival at JFK." ; + nif:beginIndex "3594"^^xsd:nonNegativeInteger ; + nif:endIndex "3640"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . + + + a nif:OffsetBasedString , nif:Sentence ; + nif:anchorOf "While Apple is an electronics company, Mango is a clothing one and Orange is a communication one." ; + nif:beginIndex "2317"^^xsd:nonNegativeInteger ; + nif:endIndex "2414"^^xsd:nonNegativeInteger ; + nif:nextSentence ; + nif:previousSentence ; + nif:referenceContext . diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/kore50/ref.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/kore50/ref.ttl deleted file mode 100644 index af542f24f1..0000000000 --- a/dkpro-core-io-nif-asl/src/test/resources/nif/kore50/ref.ttl +++ /dev/null @@ -1,557 +0,0 @@ -@prefix rdf: . -@prefix owl: . -@prefix xsd: . -@prefix itsrdf: . -@prefix nif: . -@prefix rdfs: . - -rdf:rest a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:List ; - rdfs:range rdf:List ; - rdfs:subPropertyOf rdf:rest . - -rdf:List a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdf:List . - -rdf:predicate a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Statement ; - rdfs:subPropertyOf rdf:predicate . - -rdf:Property a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdf:Property . - -rdfs:label a rdf:Property , rdfs:Resource ; - rdfs:range rdfs:Literal . - -rdf:Statement a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdf:Statement . - -rdfs:Class a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdfs:Class . - -rdf:type a rdf:Property , rdfs:Resource ; - rdfs:range rdfs:Class . - -rdfs:Resource a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource . - -rdf:subject a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Statement ; - rdfs:subPropertyOf rdf:subject . - -rdf:XMLLiteral a rdfs:Datatype , rdfs:Resource , rdfs:Class . - -rdfs:comment a rdf:Property , rdfs:Resource ; - rdfs:range rdfs:Literal . - -rdfs:range a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Property ; - rdfs:range rdfs:Class . - -rdfs:subPropertyOf a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Property ; - rdfs:range rdf:Property . - -rdf:object a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Statement ; - rdfs:subPropertyOf rdf:object . - -rdf:nil a rdf:List , rdfs:Resource . - -rdfs:domain a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:Property ; - rdfs:range rdfs:Class . - -rdfs:Literal a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Resource , rdfs:Literal . - -rdf:first a rdf:Property , rdfs:Resource ; - rdfs:domain rdf:List ; - rdfs:subPropertyOf rdf:first . - -rdfs:subClassOf a rdf:Property , rdfs:Resource ; - rdfs:domain rdfs:Class ; - rdfs:range rdfs:Class . - - - a nif:Sentence ; - nif:anchorOf "Desire contains a duet with Harris in the song Joey." ; - nif:beginIndex "691"^^xsd:nonNegativeInteger ; - nif:endIndex "743"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Cairo was the code name for a project at Microsoft from 1991 to 1996. Its charter was to build technologies for a next generation operating system that would fulfill the vision of Bill." ; - nif:beginIndex "1914"^^xsd:nonNegativeInteger ; - nif:endIndex "2099"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Steve, Bill, Sergey, and Larry have drawn a great deal of admiration these days for their pioneering successes that changed the world we live in." ; - nif:beginIndex "2100"^^xsd:nonNegativeInteger ; - nif:endIndex "2245"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Context ; - nif:beginIndex "0"^^xsd:nonNegativeInteger ; - nif:endIndex "3779"^^xsd:nonNegativeInteger ; - nif:isString "David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven.\nDavid and Victoria added spice to their marriage.\nTiger was lost in the woods when he got divorced from Elin.\nTiger lost the US Open.\nMadonna played Eva and was seen with Carlos.\nIn this musical, Madonna played the role of the First Lady.\nAngelina, her father Jon, and her partner Brad never played together in the same movie.\nHeidi and her husband Seal live in Vegas.\nParis and Kim are both wealthy It Girls who had sex tapes on the Internet.\nJustin, Stefani, and Kate are among the most popular people on both MTV and Twitter.\nDylan performed Hurricane about the black fighter Carter, from his album Desire.\nDesire contains a duet with Harris in the song Joey.\nThree of the greatest guitarists started their career in a single band : Clapton, Beck, and Page.\nAllen founded the EMP in Seattle, which featured exhibitions about Hendrix and Dylan, but also about various science fiction movies.\nDespite featuring some of the most promininent musicians of their decade --- like Sinatra, Dylan, Joel, and Santana --- Columbia was aquired by Sony in the 1980s.\nAfter unsuccessful years, aging country star Cash made a grandiose comeback with his American Recordings, recorded at his home with the help of Rubin.\nThe group formed by Homme, Grohl, and Jones was supposed to be named Caligula, but the name was already taken.\nJobs and Baez dated in the late 1970s, and she performed at his Stanford memorial.\nThe Isle of Wight festival in 1970 was the biggest at its time, surpassing Woodstock with acts like Davis, Chicago, and Mitchell.\nEric preferred to play Blues instead of Rock, so he joined Mayall 's band.\nAfter the death of Steve, the former CEO of Apple, his commencement speech at Stanford was watched thousands of times.\nIn 1980, Steve dropped out of Stanford to join Microsoft, the company behind the Windows operating system.\nCairo was the code name for a project at Microsoft from 1991 to 1996. Its charter was to build technologies for a next generation operating system that would fulfill the vision of Bill.\nSteve, Bill, Sergey, and Larry have drawn a great deal of admiration these days for their pioneering successes that changed the world we live in.\nKarl and Theo made their extreme fortunes selling low-price groceries.\nWhile Apple is an electronics company, Mango is a clothing one and Orange is a communication one.\nSam, the co-founder of Equity International, was given the nickname of \\\"the grave dancer\\\" because of his ability to buy businesses that others thought were dead.\nPixar produced Cars, and John directed it.\nMars, Galaxy, and Bounty are all chocolate.\nBosch and Sharp are both home appliances producing companies.\nCity won 3:2 against the Spurs.\nThe Gunners now play their home matches at the Emirates.\nAtletico has beaten its archrival Real.\nMüller scored a hattrick against England.\nThomas and Mario are strikers playing in Munich.\nHaug congratulated Red Bull.\nVöller will never forget the match against Oranje in San Siro.\nLandgraf and Meijer played at the Tivoli.\nYabo plays for Aachen.\nHertha won against Dortmund.\nNixon resigned after Watergate despite his success in the Ping-Pong Diplomacy with China.\nThe Sun and The Times reported that Greece will have to leave the Euro soon.\nThe Enola Gay bombed Hiroshima at the end of Second World War.\nThe RAF was a terrorist group led by Baader and Meinhof that killed Schleyer.\nOnassis married Kennedy on October 20, 1968.\nArmstrong was the first man on the Moon.\nErich was born in Neunkirchen.\nMacedonia is a province of Greece.\nObama welcomed Merkel upon her arrival at JFK.\nKennedy was also an active politician, yet he is most known for his writings, some of which he published under the name of Mark Littleton." . - - - a nif:Sentence ; - nif:anchorOf "Nixon resigned after Watergate despite his success in the Ping-Pong Diplomacy with China." ; - nif:beginIndex "3134"^^xsd:nonNegativeInteger ; - nif:endIndex "3223"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Eric preferred to play Blues instead of Rock, so he joined Mayall 's band." ; - nif:beginIndex "1613"^^xsd:nonNegativeInteger ; - nif:endIndex "1687"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "The Isle of Wight festival in 1970 was the biggest at its time, surpassing Woodstock with acts like Davis, Chicago, and Mitchell." ; - nif:beginIndex "1483"^^xsd:nonNegativeInteger ; - nif:endIndex "1612"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Kennedy was also an active politician, yet he is most known for his writings, some of which he published under the name of Mark Littleton." ; - nif:beginIndex "3641"^^xsd:nonNegativeInteger ; - nif:endIndex "3779"^^xsd:nonNegativeInteger ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven." ; - nif:beginIndex "0"^^xsd:nonNegativeInteger ; - nif:endIndex "80"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Thomas and Mario are strikers playing in Munich." ; - nif:beginIndex "2899"^^xsd:nonNegativeInteger ; - nif:endIndex "2947"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "The group formed by Homme, Grohl, and Jones was supposed to be named Caligula, but the name was already taken." ; - nif:beginIndex "1289"^^xsd:nonNegativeInteger ; - nif:endIndex "1399"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "The RAF was a terrorist group led by Baader and Meinhof that killed Schleyer." ; - nif:beginIndex "3364"^^xsd:nonNegativeInteger ; - nif:endIndex "3441"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Mars, Galaxy, and Bounty are all chocolate." ; - nif:beginIndex "2622"^^xsd:nonNegativeInteger ; - nif:endIndex "2665"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Karl and Theo made their extreme fortunes selling low-price groceries." ; - nif:beginIndex "2246"^^xsd:nonNegativeInteger ; - nif:endIndex "2316"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Armstrong was the first man on the Moon." ; - nif:beginIndex "3487"^^xsd:nonNegativeInteger ; - nif:endIndex "3527"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Haug congratulated Red Bull." ; - nif:beginIndex "2948"^^xsd:nonNegativeInteger ; - nif:endIndex "2976"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Tiger was lost in the woods when he got divorced from Elin." ; - nif:beginIndex "131"^^xsd:nonNegativeInteger ; - nif:endIndex "190"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "In 1980, Steve dropped out of Stanford to join Microsoft, the company behind the Windows operating system." ; - nif:beginIndex "1807"^^xsd:nonNegativeInteger ; - nif:endIndex "1913"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Bosch and Sharp are both home appliances producing companies." ; - nif:beginIndex "2666"^^xsd:nonNegativeInteger ; - nif:endIndex "2727"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "The Sun and The Times reported that Greece will have to leave the Euro soon." ; - nif:beginIndex "3224"^^xsd:nonNegativeInteger ; - nif:endIndex "3300"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Three of the greatest guitarists started their career in a single band : Clapton, Beck, and Page." ; - nif:beginIndex "744"^^xsd:nonNegativeInteger ; - nif:endIndex "841"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Yabo plays for Aachen." ; - nif:beginIndex "3082"^^xsd:nonNegativeInteger ; - nif:endIndex "3104"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "The Enola Gay bombed Hiroshima at the end of Second World War." ; - nif:beginIndex "3301"^^xsd:nonNegativeInteger ; - nif:endIndex "3363"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "After the death of Steve, the former CEO of Apple, his commencement speech at Stanford was watched thousands of times." ; - nif:beginIndex "1688"^^xsd:nonNegativeInteger ; - nif:endIndex "1806"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Sam, the co-founder of Equity International, was given the nickname of \\\"the grave dancer\\\" because of his ability to buy businesses that others thought were dead." ; - nif:beginIndex "2415"^^xsd:nonNegativeInteger ; - nif:endIndex "2578"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Despite featuring some of the most promininent musicians of their decade --- like Sinatra, Dylan, Joel, and Santana --- Columbia was aquired by Sony in the 1980s." ; - nif:beginIndex "975"^^xsd:nonNegativeInteger ; - nif:endIndex "1137"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Heidi and her husband Seal live in Vegas." ; - nif:beginIndex "408"^^xsd:nonNegativeInteger ; - nif:endIndex "449"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Dylan performed Hurricane about the black fighter Carter, from his album Desire." ; - nif:beginIndex "610"^^xsd:nonNegativeInteger ; - nif:endIndex "690"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Landgraf and Meijer played at the Tivoli." ; - nif:beginIndex "3040"^^xsd:nonNegativeInteger ; - nif:endIndex "3081"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "City won 3:2 against the Spurs." ; - nif:beginIndex "2728"^^xsd:nonNegativeInteger ; - nif:endIndex "2759"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Tiger lost the US Open." ; - nif:beginIndex "191"^^xsd:nonNegativeInteger ; - nif:endIndex "214"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Pixar produced Cars, and John directed it." ; - nif:beginIndex "2579"^^xsd:nonNegativeInteger ; - nif:endIndex "2621"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "The Gunners now play their home matches at the Emirates." ; - nif:beginIndex "2760"^^xsd:nonNegativeInteger ; - nif:endIndex "2816"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Allen founded the EMP in Seattle, which featured exhibitions about Hendrix and Dylan, but also about various science fiction movies." ; - nif:beginIndex "842"^^xsd:nonNegativeInteger ; - nif:endIndex "974"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Angelina, her father Jon, and her partner Brad never played together in the same movie." ; - nif:beginIndex "320"^^xsd:nonNegativeInteger ; - nif:endIndex "407"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Paris and Kim are both wealthy It Girls who had sex tapes on the Internet." ; - nif:beginIndex "450"^^xsd:nonNegativeInteger ; - nif:endIndex "524"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Madonna played Eva and was seen with Carlos." ; - nif:beginIndex "215"^^xsd:nonNegativeInteger ; - nif:endIndex "259"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Atletico has beaten its archrival Real." ; - nif:beginIndex "2817"^^xsd:nonNegativeInteger ; - nif:endIndex "2856"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Obama welcomed Merkel upon her arrival at JFK." ; - nif:beginIndex "3594"^^xsd:nonNegativeInteger ; - nif:endIndex "3640"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "David and Victoria added spice to their marriage." ; - nif:beginIndex "81"^^xsd:nonNegativeInteger ; - nif:endIndex "130"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Erich was born in Neunkirchen." ; - nif:beginIndex "3528"^^xsd:nonNegativeInteger ; - nif:endIndex "3558"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Hertha won against Dortmund." ; - nif:beginIndex "3105"^^xsd:nonNegativeInteger ; - nif:endIndex "3133"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Justin, Stefani, and Kate are among the most popular people on both MTV and Twitter." ; - nif:beginIndex "525"^^xsd:nonNegativeInteger ; - nif:endIndex "609"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Völler will never forget the match against Oranje in San Siro." ; - nif:beginIndex "2977"^^xsd:nonNegativeInteger ; - nif:endIndex "3039"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Onassis married Kennedy on October 20, 1968." ; - nif:beginIndex "3442"^^xsd:nonNegativeInteger ; - nif:endIndex "3486"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "In this musical, Madonna played the role of the First Lady." ; - nif:beginIndex "260"^^xsd:nonNegativeInteger ; - nif:endIndex "319"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Macedonia is a province of Greece." ; - nif:beginIndex "3559"^^xsd:nonNegativeInteger ; - nif:endIndex "3593"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Jobs and Baez dated in the late 1970s, and she performed at his Stanford memorial." ; - nif:beginIndex "1400"^^xsd:nonNegativeInteger ; - nif:endIndex "1482"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "After unsuccessful years, aging country star Cash made a grandiose comeback with his American Recordings, recorded at his home with the help of Rubin." ; - nif:beginIndex "1138"^^xsd:nonNegativeInteger ; - nif:endIndex "1288"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "While Apple is an electronics company, Mango is a clothing one and Orange is a communication one." ; - nif:beginIndex "2317"^^xsd:nonNegativeInteger ; - nif:endIndex "2414"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - - - a nif:Sentence ; - nif:anchorOf "Müller scored a hattrick against England." ; - nif:beginIndex "2857"^^xsd:nonNegativeInteger ; - nif:endIndex "2898"^^xsd:nonNegativeInteger ; - nif:nextSentence ; - nif:previousSentence ; - nif:referenceContext . - -rdf:Bag a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdf:Bag , rdfs:Container . - -rdf:Seq a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdf:Seq , rdfs:Container . - -rdfs:Datatype a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Datatype , rdfs:Class , rdfs:Resource . - -rdf:Alt a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdf:Alt , rdfs:Container . - -rdfs:Container a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:Container . - -rdfs:ContainerMembershipProperty - a rdfs:Class , rdfs:Resource ; - rdfs:subClassOf rdfs:ContainerMembershipProperty , rdf:Property , rdfs:Resource . - -rdfs:isDefinedBy a rdf:Property , rdfs:Resource ; - rdfs:subPropertyOf rdfs:isDefinedBy , rdfs:seeAlso . - -rdfs:seeAlso a rdf:Property , rdfs:Resource ; - rdfs:subPropertyOf rdfs:seeAlso . - -nif:Sentence a rdfs:Class , rdfs:Resource . - -nif:Context a rdfs:Class , rdfs:Resource . diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/pynif/pynif-example-ref.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/pynif/pynif-example-ref.ttl new file mode 100644 index 0000000000..f93c0f3b76 --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/nif/pynif/pynif-example-ref.ttl @@ -0,0 +1,29 @@ +@prefix rdf: . +@prefix owl: . +@prefix xsd: . +@prefix itsrdf: . +@prefix nif: . +@prefix rdfs: . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Diego Maradona" ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "14"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taClassRef ; + itsrdf:taIdentRef . + + + a nif:OffsetBasedString , nif:EntityOccurrence ; + nif:anchorOf "Argentina" ; + nif:beginIndex "23"^^xsd:nonNegativeInteger ; + nif:endIndex "32"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + itsrdf:taClassRef . + + + a nif:OffsetBasedString , nif:Context ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "33"^^xsd:nonNegativeInteger ; + nif:isString "Diego Maradona is from Argentina." . diff --git a/dkpro-core-io-nif-asl/src/test/resources/nif/pynif/pynif-example.ttl b/dkpro-core-io-nif-asl/src/test/resources/nif/pynif/pynif-example.ttl new file mode 100644 index 0000000000..df03e27bbb --- /dev/null +++ b/dkpro-core-io-nif-asl/src/test/resources/nif/pynif/pynif-example.ttl @@ -0,0 +1,42 @@ +@prefix xsd: . +@prefix nif: . +@prefix itsrdf: . +@prefix dcterms: + + a nif:ContextCollection ; + nif:hasContext ; + dcterms:conformsTo . + + a nif:Context, + nif:OffsetBasedString ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "33"^^xsd:nonNegativeInteger ; + nif:isString "Diego Maradona is from Argentina." . + + a nif:OffsetBasedString, + nif:Phrase ; + nif:anchorOf "Diego Maradona" ; + nif:beginIndex "0"^^xsd:nonNegativeInteger ; + nif:endIndex "14"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + nif:taMsClassRef ; + itsrdf:taAnnotatorsRef ; + itsrdf:taClassRef , + , + ; + itsrdf:taConfidence 9.869993e-01 ; + itsrdf:taIdentRef . + + a nif:OffsetBasedString, + nif:Phrase ; + nif:anchorOf "Argentina" ; + nif:beginIndex "23"^^xsd:nonNegativeInteger ; + nif:endIndex "32"^^xsd:nonNegativeInteger ; + nif:referenceContext ; + nif:taMsClassRef ; + itsrdf:taAnnotatorsRef ; + itsrdf:taClassRef , + , + ; + itsrdf:taConfidence 9.804964e-01 . + \ No newline at end of file diff --git a/dkpro-core-io-nyt-asl/.activate-run-jcasgen b/dkpro-core-io-nitf-asl/.activate-run-jcasgen similarity index 100% rename from dkpro-core-io-nyt-asl/.activate-run-jcasgen rename to dkpro-core-io-nitf-asl/.activate-run-jcasgen diff --git a/dkpro-core-lancaster-asl/LICENSE.txt b/dkpro-core-io-nitf-asl/LICENSE.txt similarity index 100% rename from dkpro-core-lancaster-asl/LICENSE.txt rename to dkpro-core-io-nitf-asl/LICENSE.txt diff --git a/dkpro-core-io-nitf-asl/pom.xml b/dkpro-core-io-nitf-asl/pom.xml new file mode 100644 index 0000000000..08af2c4317 --- /dev/null +++ b/dkpro-core-io-nitf-asl/pom.xml @@ -0,0 +1,76 @@ + + + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-io-nitf-asl + jar + DKPro Core ASL - IO - New York Times Corpus + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.dkpro.core + dkpro-core-api-io-asl + + + edu.jhu.hlt + annotated-nyt + 1.1.4 + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + junit + junit + test + + + + + + false + src/main/resources + + desc/type/**/* + + + + true + src/main/resources + + desc/type/**/* + + + + + diff --git a/dkpro-core-io-nyt-asl/src/filter/filter.properties b/dkpro-core-io-nitf-asl/src/filter/filter.properties similarity index 100% rename from dkpro-core-io-nyt-asl/src/filter/filter.properties rename to dkpro-core-io-nitf-asl/src/filter/filter.properties diff --git a/dkpro-core-io-nitf-asl/src/main/java/org/dkpro/core/io/nitf/NitfReader.java b/dkpro-core-io-nitf-asl/src/main/java/org/dkpro/core/io/nitf/NitfReader.java new file mode 100644 index 0000000000..d754ba288a --- /dev/null +++ b/dkpro-core-io-nitf-asl/src/main/java/org/dkpro/core/io/nitf/NitfReader.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.nitf; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.List; + +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.nift.metadata.ArticleMetaData; + +import com.nytlabs.corpus.NYTCorpusDocument; +import com.nytlabs.corpus.NYTCorpusDocumentParser; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reader for the News Industry Text Format (NITF). Was developed primarily to work with the + * New York Times Annotated Corpus. + * + * @see NITF + * @see The New York Times Annotated Corpus + */ +@ResourceMetaData(name = "NITF Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_X_NITF_XML}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "org.dkpro.core.io.nift.metadata.ArticleMetaData" }) +public class NitfReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * A number of documents which will be skipped at the beginning. + */ + public static final String PARAM_OFFSET = "offset"; + @ConfigurationParameter(name = PARAM_OFFSET, mandatory = false) + private int offset = 0; + + /** + * Counting variable to keep track of the already skipped documents. + */ + private int skipped = 0; + + private NYTCorpusDocumentParser parser = new NYTCorpusDocumentParser(); + + @Override + public void getNext(JCas aJCas) throws IOException, CollectionException + { + while (isBelowOffset()) { + nextFile(); + skipped++; + } + + Resource xmlFile = nextFile(); + initCas(aJCas, xmlFile); + + try (InputStream is = xmlFile.getInputStream()) { + NYTCorpusDocument nytDocument = parser + .parseNYTCorpusDocumentFromFile(xmlFile.getInputStream(), false); + + setDocumenText(aJCas, nytDocument.getBody()); + + ArticleMetaData articleMetaData = createNYTArticleMetaData(aJCas, nytDocument); + articleMetaData.addToIndexes(); + } + } + + private void setDocumenText(JCas aJCas, String documentBody) + { + if (documentBody != null) { + aJCas.setDocumentText(documentBody); + } + else { + aJCas.setDocumentText(""); + } + } + + private boolean isBelowOffset() + { + return skipped < offset && getResourceIterator().hasNext(); + } + + private static StringArray toStringArray(List stringList, JCas aJCas) + { + if (!stringList.isEmpty()) { + String[] strings = stringList.toArray(new String[0]); + int length = strings.length; + StringArray stringArray = new StringArray(aJCas, length); + stringArray.copyFromArray(strings, 0, 0, length); + return stringArray; + } + else { + return new StringArray(aJCas, 0); + } + } + + private ArticleMetaData createNYTArticleMetaData(JCas aJCas, NYTCorpusDocument doc) + { + ArticleMetaData articleMetaData = new ArticleMetaData(aJCas); + articleMetaData.setGuid(doc.getGuid()); + + URL alternateUrl = doc.getAlternateURL(); + if (alternateUrl != null) { + articleMetaData.setAlternateUrl(alternateUrl.toString()); + } + + URL url = doc.getUrl(); + if (url != null) { + articleMetaData.setAlternateUrl(url.toString()); + } + + articleMetaData.setAuthor(doc.getNormalizedByline()); + articleMetaData.setColumnName(doc.getColumnName()); + articleMetaData.setDescriptors(toStringArray(doc.getDescriptors(), aJCas)); + articleMetaData.setHeadline(doc.getHeadline()); + articleMetaData.setOnlineDescriptors(toStringArray(doc.getOnlineDescriptors(), aJCas)); + articleMetaData.setOnlineHeadline(doc.getOnlineHeadline()); + articleMetaData.setOnlineSection(doc.getOnlineSection()); + articleMetaData.setPublicationDate(doc.getPublicationDate().toString()); + articleMetaData.setSection(doc.getSection()); + articleMetaData + .setTaxonomicClassifiers(toStringArray(doc.getTaxonomicClassifiers(), aJCas)); + articleMetaData.setTypesOfMaterial(toStringArray(doc.getTypesOfMaterial(), aJCas)); + return articleMetaData; + } +} diff --git a/dkpro-core-io-nitf-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt b/dkpro-core-io-nitf-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt new file mode 100644 index 0000000000..e3e15d22a9 --- /dev/null +++ b/dkpro-core-io-nitf-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt @@ -0,0 +1 @@ +classpath*:desc/type/ArticleMetaData.xml diff --git a/dkpro-core-io-nitf-asl/src/main/resources/desc/type/ArticleMetaData.xml b/dkpro-core-io-nitf-asl/src/main/resources/desc/type/ArticleMetaData.xml new file mode 100644 index 0000000000..3b9237545a --- /dev/null +++ b/dkpro-core-io-nitf-asl/src/main/resources/desc/type/ArticleMetaData.xml @@ -0,0 +1,211 @@ + + + NYTArticleMetaData + + ${version} + Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt + + + org.dkpro.core.io.nift.metadata.ArticleMetaData + A document annotation that describes the metadata of a + newspaper article. + uima.cas.AnnotationBase + + + guid + The GUID field specifies a (4-byte) integer that is + guaranteed + to be unique for every document + in the corpus. + uima.cas.Integer + + + alternateUrl + This field specifies the location on nytimes.com of + the article. When present, this URL is preferred to the URL field + on articles published on or after April 02, + 2006, as the linked + page will have richer content. + uima.cas.String + + + url + This field specifies the location on nytimes.com of + the article. The 'Alternative Url' + field is preferred to this field + on articles published on or after + April 02, 2006, as the + linked page + will have richer content. + uima.cas.String + + + publicationDate + This field specifies the date of the article's + publication. This field is specified in the + format + YYYYMMDD'T'HHMMSS where: + 1. YYYY is the four-digit year. + 2. MM is + the two-digit month [01-12]. + 3. DD is the two-digit day [01-31]. + 4. + T is a constant value. + 5. HH is the two-digit hour [00-23]. + 6. MM is + the two-digit minute-past-the hour [00-59] + 7. SS is the two-digit + seconds-past-the-minute [00-59]. + Please note that values for HH,MM, + and SS are not defined for this + corpus, that is to day + HH,MM, and SS + are always defined to be '00'. + uima.cas.String + + + typesOfMaterial + This field specifies a normalized list of terms + describing the general editorial category of the article. + These + tags are algorithmically assigned and + manually verified by + nytimes.com production staff. + Examples Include: + * REVIEW + * OBITUARY + * ANALYSIS + uima.cas.StringArray + + + headline + This field specifies the headline of the article as it + appeared in the + print edition of the New York + Times. + uima.cas.String + + + onlineHeadline + This field specifies the headline displayed with the + article on + nytimes.com. Often + this differs from the headline used in + print. + uima.cas.String + + + columnName + If the article is part of a regular column, this field + specifies the + name of that column. + Sample Column Names: + 1. World News + Briefs + 2. WEDDINGS + 3. The Accessories Channel + uima.cas.String + + + author + This field is based on the normalized byline in the + original corpus data: "The Normalized Byline field is the byline + normalized to the form (last name, first + name)". + uima.cas.String + + + descriptors + The 'descriptors' field specifies a list of + descriptive terms drawn + from a normalized controlled + vocabulary + corresponding to subjects mentioned in the article. These tags + are + hand-assigned by + a team of library scientists working in the New + York Times Indexing + service. + Examples Include: + * ECONOMIC CONDITIONS + AND TRENDS + * AIRPLANES + * VIOLINS + uima.cas.StringArray + + + onlineDescriptors + This field specifies a list of descriptors from a + normalized + controlled + vocabulary that + correspond to topics mentioned + in the article. These + tags are + algorithmically + assigned and manually + verified by + nytimes.com production staff. + Examples Include: + * Marriages + * Parks and Other Recreation Areas + * Cooking and Cookbooks + uima.cas.StringArray + + + generalOnlineDescriptors + The 'general online descriptors' field specifies a + list of descriptors that are at a higher level of + generality than + the other tags associated with the article. These tags are + algorithmically + assigned and manually verified by nytimes.com + production staff. + Examples Include: + * Surfing + * Venice Biennale + * Ranches + uima.cas.String + + + onlineSection + This field specifies the section(s) on nytimes.com in + which the + article is placed. If + the article is placed in multiple + sections, this field will be + specified as a ';' delineated + list. + uima.cas.String + + + section + This field specifies the section of the paper in which + the article + appears. This is not + the name of the section, but rather + a letter or number that indicates + the section. + uima.cas.String + + + taxonomicClassifiers + This field specifies a list of taxonomic classifiers + that place this + article into a + hierarchy of articles. The individual + terms of each taxonomic classifier + are separated with the '/' character. + These tags are algorithmically assigned and manually + verified + by nytimes.com production staff. + Examples Include: + * Top/Features/Travel/Guides/Destinations/North America/United States/Arizona + * Top/News/U.S./Rockies + * Top/Opinion + uima.cas.StringArray + + + + + diff --git a/dkpro-core-io-nitf-asl/src/test/java/org/dkpro/core/io/nitf/NitfReaderTest.java b/dkpro-core-io-nitf-asl/src/test/java/org/dkpro/core/io/nitf/NitfReaderTest.java new file mode 100644 index 0000000000..774f8a1380 --- /dev/null +++ b/dkpro-core-io-nitf-asl/src/test/java/org/dkpro/core/io/nitf/NitfReaderTest.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.nitf; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.component.CasDumpWriter; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.nitf.NitfReader; +import org.junit.Test; + +public class NitfReaderTest +{ + @Test + public void test() throws Exception + { + final String DATA_PATH = "src/test/resources/data/"; + + CollectionReader articleReader = CollectionReaderFactory.createReader( + NitfReader.class, + NitfReader.PARAM_SOURCE_LOCATION, DATA_PATH, + NitfReader.PARAM_PATTERNS, "[+]/**/*.xml", + NitfReader.PARAM_LANGUAGE, "en", + NitfReader.PARAM_OFFSET, 0); + + AnalysisEngine extractor = AnalysisEngineFactory.createEngine(CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "-"); + + SimplePipeline.runPipeline(articleReader, extractor); + } +} diff --git a/dkpro-core-io-nyt-asl/src/test/resources/data/1987/01/01/0000000.xml b/dkpro-core-io-nitf-asl/src/test/resources/data/1987/01/01/0000000.xml similarity index 100% rename from dkpro-core-io-nyt-asl/src/test/resources/data/1987/01/01/0000000.xml rename to dkpro-core-io-nitf-asl/src/test/resources/data/1987/01/01/0000000.xml diff --git a/dkpro-core-io-nyt-asl/src/test/resources/data/1987/02/01/0000001.xml b/dkpro-core-io-nitf-asl/src/test/resources/data/1987/02/01/0000001.xml similarity index 100% rename from dkpro-core-io-nyt-asl/src/test/resources/data/1987/02/01/0000001.xml rename to dkpro-core-io-nitf-asl/src/test/resources/data/1987/02/01/0000001.xml diff --git a/dkpro-core-io-nyt-asl/src/test/resources/data/1987/02/01/0000002.xml b/dkpro-core-io-nitf-asl/src/test/resources/data/1987/02/01/0000002.xml similarity index 100% rename from dkpro-core-io-nyt-asl/src/test/resources/data/1987/02/01/0000002.xml rename to dkpro-core-io-nitf-asl/src/test/resources/data/1987/02/01/0000002.xml diff --git a/dkpro-core-io-nyt-asl/src/test/resources/data/1988/01/01/0000003.xml b/dkpro-core-io-nitf-asl/src/test/resources/data/1988/01/01/0000003.xml similarity index 100% rename from dkpro-core-io-nyt-asl/src/test/resources/data/1988/01/01/0000003.xml rename to dkpro-core-io-nitf-asl/src/test/resources/data/1988/01/01/0000003.xml diff --git a/dkpro-core-io-nitf-asl/suppressions.xml b/dkpro-core-io-nitf-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-io-nitf-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-io-nyt-asl/pom.xml b/dkpro-core-io-nyt-asl/pom.xml deleted file mode 100644 index 951a9dc0f0..0000000000 --- a/dkpro-core-io-nyt-asl/pom.xml +++ /dev/null @@ -1,71 +0,0 @@ - - - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - dkpro-core-io-nyt-asl - jar - DKPro Core ASL - IO - New York Times Corpus - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimafit-core - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl - - - edu.jhu.hlt - annotated-nyt - 1.1.4 - - - junit - junit - test - - - - - - false - src/main/resources - - desc/type/**/* - - - - true - src/main/resources - - desc/type/**/* - - - - - diff --git a/dkpro-core-io-nyt-asl/src/main/java/org/dkpro/core/io/nyt/NYTCollectionReader.java b/dkpro-core-io-nyt-asl/src/main/java/org/dkpro/core/io/nyt/NYTCollectionReader.java deleted file mode 100644 index 21651f179d..0000000000 --- a/dkpro-core-io-nyt-asl/src/main/java/org/dkpro/core/io/nyt/NYTCollectionReader.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Technische Universität Darmstadt under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.io.nyt; - -import java.io.IOException; -import java.net.URL; -import java.util.List; - -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.StringArray; -import org.dkpro.core.io.nyt.metadata.NYTArticleMetaData; - -import com.nytlabs.corpus.NYTCorpusDocument; -import com.nytlabs.corpus.NYTCorpusDocumentParser; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; - -public class NYTCollectionReader - extends JCasResourceCollectionReader_ImplBase -{ - - /** - * A number of documents which will be skipped at the beginning. - */ - public static final String PARAM_OFFSET = "offset"; - @ConfigurationParameter(name = PARAM_OFFSET, mandatory = false) - private int offset = 0; - - /** - * Counting variable to keep track of the already skipped documents. - */ - private int skipped = 0; - - private NYTCorpusDocumentParser nytParser = new NYTCorpusDocumentParser(); - - private void setDocumenText(JCas aJCas, String documentBody) - { - if (documentBody != null) { - aJCas.setDocumentText(documentBody); - } - else { - aJCas.setDocumentText(""); - } - } - - @Override - public void getNext(JCas aJCas) throws IOException, CollectionException - { - - while (isBelowOffset()) { - nextFile(); - skipped++; - } - - Resource xmlFile = nextFile(); - initCas(aJCas, xmlFile); - NYTCorpusDocument nytDocument = nytParser - .parseNYTCorpusDocumentFromFile(xmlFile.getInputStream(), false); - setDocumenText(aJCas, nytDocument.getBody()); - NYTArticleMetaData articleMetaData = createNYTArticleMetaData(aJCas, nytDocument); - articleMetaData.addToIndexes(); - } - - private boolean isBelowOffset() - { - return skipped < offset && getResourceIterator().hasNext(); - } - - private static StringArray toStringArray(List stringList, JCas aJCas) - { - if (!stringList.isEmpty()) { - String[] strings = stringList.toArray(new String[0]); - int length = strings.length; - StringArray stringArray = new StringArray(aJCas, length); - stringArray.copyFromArray(strings, 0, 0, length); - return stringArray; - } - else { - return new StringArray(aJCas, 0); - } - } - - private NYTArticleMetaData createNYTArticleMetaData(JCas aJCas, NYTCorpusDocument doc) - { - NYTArticleMetaData articleMetaData = new NYTArticleMetaData(aJCas); - articleMetaData.setGuid(doc.getGuid()); - - URL alternateUrl = doc.getAlternateURL(); - if (alternateUrl != null) { - articleMetaData.setAlternateUrl(alternateUrl.toString()); - } - - URL url = doc.getUrl(); - if (url != null) { - articleMetaData.setAlternateUrl(url.toString()); - } - - articleMetaData.setAuthor(doc.getNormalizedByline()); - articleMetaData.setColumnName(doc.getColumnName()); - articleMetaData.setDescriptors(toStringArray(doc.getDescriptors(), aJCas)); - articleMetaData.setHeadline(doc.getHeadline()); - articleMetaData.setOnlineDescriptors(toStringArray(doc.getOnlineDescriptors(), aJCas)); - articleMetaData.setOnlineHeadline(doc.getOnlineHeadline()); - articleMetaData.setOnlineSection(doc.getOnlineSection()); - articleMetaData.setPublicationDate(doc.getPublicationDate().toString()); - articleMetaData.setSection(doc.getSection()); - articleMetaData - .setTaxonomicClassifiers(toStringArray(doc.getTaxonomicClassifiers(), aJCas)); - articleMetaData.setTypesOfMaterial(toStringArray(doc.getTypesOfMaterial(), aJCas)); - return articleMetaData; - } -} diff --git a/dkpro-core-io-nyt-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt b/dkpro-core-io-nyt-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt deleted file mode 100644 index 14c897252c..0000000000 --- a/dkpro-core-io-nyt-asl/src/main/resources/META-INF/org.apache.uima.fit/types.txt +++ /dev/null @@ -1 +0,0 @@ -classpath*:desc/type/NYTArticleMetaData.xml diff --git a/dkpro-core-io-nyt-asl/src/main/resources/desc/type/NYTArticleMetaData.xml b/dkpro-core-io-nyt-asl/src/main/resources/desc/type/NYTArticleMetaData.xml deleted file mode 100644 index 701fdd12f9..0000000000 --- a/dkpro-core-io-nyt-asl/src/main/resources/desc/type/NYTArticleMetaData.xml +++ /dev/null @@ -1,211 +0,0 @@ - - - NYTArticleMetaData - - ${version} - Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt - - - org.dkpro.core.io.nyt.metadata.NYTArticleMetaData - A document annotation that describes the metadata of a - newspaper article. - uima.cas.AnnotationBase - - - Guid - The GUID field specifies a (4-byte) integer that is - guaranteed - to be unique for every document - in the corpus. - uima.cas.Integer - - - alternateUrl - This field specifies the location on nytimes.com of - the article. When present, this URL is preferred to the URL field - on articles published on or after April 02, - 2006, as the linked - page will have richer content. - uima.cas.String - - - url - This field specifies the location on nytimes.com of - the article. The 'Alternative Url' - field is preferred to this field - on articles published on or after - April 02, 2006, as the - linked page - will have richer content. - uima.cas.String - - - publicationDate - This field specifies the date of the article's - publication. This field is specified in the - format - YYYYMMDD'T'HHMMSS where: - 1. YYYY is the four-digit year. - 2. MM is - the two-digit month [01-12]. - 3. DD is the two-digit day [01-31]. - 4. - T is a constant value. - 5. HH is the two-digit hour [00-23]. - 6. MM is - the two-digit minute-past-the hour [00-59] - 7. SS is the two-digit - seconds-past-the-minute [00-59]. - Please note that values for HH,MM, - and SS are not defined for this - corpus, that is to day - HH,MM, and SS - are always defined to be '00'. - uima.cas.String - - - typesOfMaterial - This field specifies a normalized list of terms - describing the general editorial category of the article. - These - tags are algorithmically assigned and - manually verified by - nytimes.com production staff. - Examples Include: - * REVIEW - * OBITUARY - * ANALYSIS - uima.cas.StringArray - - - headline - This field specifies the headline of the article as it - appeared in the - print edition of the New York - Times. - uima.cas.String - - - onlineHeadline - This field specifies the headline displayed with the - article on - nytimes.com. Often - this differs from the headline used in - print. - uima.cas.String - - - columnName - If the article is part of a regular column, this field - specifies the - name of that column. - Sample Column Names: - 1. World News - Briefs - 2. WEDDINGS - 3. The Accessories Channel - uima.cas.String - - - author - This field is based on the normalized byline in the - original corpus data: "The Normalized Byline field is the byline - normalized to the form (last name, first - name)". - uima.cas.String - - - descriptors - The 'descriptors' field specifies a list of - descriptive terms drawn - from a normalized controlled - vocabulary - corresponding to subjects mentioned in the article. These tags - are - hand-assigned by - a team of library scientists working in the New - York Times Indexing - service. - Examples Include: - * ECONOMIC CONDITIONS - AND TRENDS - * AIRPLANES - * VIOLINS - uima.cas.StringArray - - - onlineDescriptors - This field specifies a list of descriptors from a - normalized - controlled - vocabulary that - correspond to topics mentioned - in the article. These - tags are - algorithmically - assigned and manually - verified by - nytimes.com production staff. - Examples Include: - * Marriages - * Parks and Other Recreation Areas - * Cooking and Cookbooks - uima.cas.StringArray - - - generalOnlineDescriptors - The 'general online descriptors' field specifies a - list of descriptors that are at a higher level of - generality than - the other tags associated with the article. These tags are - algorithmically - assigned and manually verified by nytimes.com - production staff. - Examples Include: - * Surfing - * Venice Biennale - * Ranches - uima.cas.String - - - onlineSection - This field specifies the section(s) on nytimes.com in - which the - article is placed. If - the article is placed in multiple - sections, this field will be - specified as a ';' delineated - list. - uima.cas.String - - - section - This field specifies the section of the paper in which - the article - appears. This is not - the name of the section, but rather - a letter or number that indicates - the section. - uima.cas.String - - - taxonomicClassifiers - This field specifies a list of taxonomic classifiers - that place this - article into a - hierarchy of articles. The individual - terms of each taxonomic classifier - are separated with the '/' character. - These tags are algorithmically assigned and manually - verified - by nytimes.com production staff. - Examples Include: - * Top/Features/Travel/Guides/Destinations/North America/United States/Arizona - * Top/News/U.S./Rockies - * Top/Opinion - uima.cas.StringArray - - - - - diff --git a/dkpro-core-io-nyt-asl/src/test/java/org/dkpro/core/io/nyt/NYTCollectionReaderTest.java b/dkpro-core-io-nyt-asl/src/test/java/org/dkpro/core/io/nyt/NYTCollectionReaderTest.java deleted file mode 100644 index a00a662629..0000000000 --- a/dkpro-core-io-nyt-asl/src/test/java/org/dkpro/core/io/nyt/NYTCollectionReaderTest.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Technische Universität Darmstadt under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.io.nyt; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.component.CasDumpWriter; -import org.apache.uima.fit.factory.AnalysisEngineFactory; -import org.apache.uima.fit.factory.CollectionReaderFactory; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.dkpro.core.io.nyt.NYTCollectionReader; -import org.junit.Test; - -public class NYTCollectionReaderTest -{ - @Test - public void test() throws Exception - { - final String DATA_PATH = "src/test/resources/data/"; - - CollectionReader articleReader = CollectionReaderFactory.createReader( - NYTCollectionReader.class, - NYTCollectionReader.PARAM_SOURCE_LOCATION, DATA_PATH, - NYTCollectionReader.PARAM_PATTERNS, "[+]/**/*.xml", - NYTCollectionReader.PARAM_LANGUAGE, "en", - NYTCollectionReader.PARAM_OFFSET, 0); - - AnalysisEngine extractor = AnalysisEngineFactory.createEngine(CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "-"); - - SimplePipeline.runPipeline(articleReader, extractor); - } -} \ No newline at end of file diff --git a/dkpro-core-io-pdf-asl/NOTICE.txt b/dkpro-core-io-pdf-asl/NOTICE.txt index f426c10d2a..ef97028289 100644 --- a/dkpro-core-io-pdf-asl/NOTICE.txt +++ b/dkpro-core-io-pdf-asl/NOTICE.txt @@ -3,6 +3,11 @@ AnnoLab project by Richard Eckart de Castilho. They were originally licensed und the LGPL and have been relicenced under the Apache License 2.0 by the original author. +LegacyPDFStreamEngine.java: + +This class was copied from PDFbox 2.0.9 because it could not be extended as it was package-private +in PDFBox. + PDFLayoutEventStripper.java: diff --git a/dkpro-core-io-pdf-asl/pom.xml b/dkpro-core-io-pdf-asl/pom.xml index 425ae4e1a9..4ae48d717a 100644 --- a/dkpro-core-io-pdf-asl/pom.xml +++ b/dkpro-core-io-pdf-asl/pom.xml @@ -1,74 +1,87 @@ - 4.0.0 - - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT - ../dkpro-core-asl - - de.tudarmstadt.ukp.dkpro.core.io.pdf-asl - jar - DKPro Core ASL - IO - PDF - - - org.apache.uima - uimafit-core - - - org.apache.uima - uimaj-core - - - commons-io - commons-io - - - commons-logging - commons-logging-api - - - org.apache.pdfbox - pdfbox - 1.7.0 - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl - - - junit - junit - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl - test - - + 4.0.0 + + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-io-pdf-asl + jar + DKPro Core ASL - IO - PDF (pdfbox ${pdfbox.version}) + https://dkpro.github.io/dkpro-core/ + + 2.0.24 + + + + org.apache.uima + uimafit-core + + + org.apache.uima + uimaj-core + + + commons-io + commons-io + + + commons-logging + commons-logging-api + + + org.apache.pdfbox + pdfbox + ${pdfbox.version} + + + org.apache.pdfbox + fontbox + ${pdfbox.version} + + + org.dkpro.core + dkpro-core-api-io-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + org.dkpro.core + dkpro-core-api-resources-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + junit + junit + test + + + org.dkpro.core + dkpro-core-testing-asl + test + + \ No newline at end of file diff --git a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Buckets.java b/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Buckets.java deleted file mode 100644 index 2726362c32..0000000000 --- a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Buckets.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.pdf; - -import java.util.LinkedList; - -/** - * Cluster values into buckets. A new bucket is opened if a new value is added that differs from the - * average value of each of the existing buckets by more than a certain threshold. - * - */ -public class Buckets -{ - private final LinkedList buckets = new LinkedList(); - private final double tolerance; - private boolean modified = true; - private Bucket cachedBest = null; - - Buckets(final double aTolerance) - { - tolerance = aTolerance; - } - - public void put(final double aValue) - { - modified = true; - - // Empty so far - if (buckets.size() == 0) { - newBucket(aValue); - return; - } - - Bucket best = buckets.getFirst(); - double best_diff = Math.abs(best.getValue() - aValue); - for (final Bucket b : buckets) { - final double cur_diff = Math.abs(b.getValue() - aValue); - - // Bail out on exact match - if (cur_diff == 0.0) { - b.add(aValue); - return; - } - - // Found better match? - if (cur_diff < best_diff) { - best = b; - best_diff = cur_diff; - } - } - - // Add to existing bucket if within tolerance, otherwise create new one - if (best_diff < tolerance) { - best.add(aValue); - } - else { - newBucket(aValue); - } - } - - private void newBucket(final double aValue) - { - buckets.add(new Bucket(aValue)); - } - - public Bucket getBest() - { - if (modified == false) { - return cachedBest; - } - - Bucket best = buckets.getFirst(); - for (final Bucket b : buckets) { - if (best.size() < b.size()) { - best = b; - } - } - cachedBest = best; - - return best; - } - - @Override - public String toString() - { - return buckets.toString(); - } -} - -class Bucket -{ - private final LinkedList values = new LinkedList(); - private double cached_avg = 0.0; - private boolean modified = true; - - double getValue() - { - if (!modified) { - return cached_avg; - } - - modified = false; - - double avg = 0.0; - for (final Double v : values) { - avg += v.doubleValue(); - } - cached_avg = avg / values.size(); - return cached_avg; - } - - int size() - { - return values.size(); - } - - Bucket(final double aValue) - { - values.add(aValue); - } - - void add(final double aValue) - { - modified = true; - values.add(aValue); - } - - @Override - public String toString() - { - return "[" + getValue() + " : " + values.size() + "]"; - } -} diff --git a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/SubstitutionTrieParser.java b/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/SubstitutionTrieParser.java deleted file mode 100644 index 3bb078b92e..0000000000 --- a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/SubstitutionTrieParser.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2009, Richard Eckart de Castilho - * Copyright 2012, Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.pdf; - -import java.io.IOException; -import java.io.InputStream; - -import org.xml.sax.Attributes; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; -import org.xml.sax.helpers.XMLReaderFactory; - -public -class SubstitutionTrieParser -extends DefaultHandler -{ - private final Trie _trie; - - private - SubstitutionTrieParser( - final Trie trie) - { - _trie = trie; - } - - @Override - public - void startElement( - final String uri, - final String localName, - final String qName, - final Attributes attributes) - throws SAXException - { - if (localName.equals("substitution")) { - _trie.put( - attributes.getValue("orig"), - attributes.getValue("subst")); - } - } - - public static - Trie parse( - final InputStream is) - throws IOException - { - final Trie trie = new Trie(); - parse(is, trie); - return trie; - } - - public static - void parse( - final InputStream is, - final Trie trie) - throws IOException - { - try { - final XMLReader xr = XMLReaderFactory.createXMLReader(); - final SubstitutionTrieParser sp = new SubstitutionTrieParser(trie); - xr.setContentHandler(sp); - xr.parse(new InputSource(is)); - } - catch (final SAXException e) { - throw new IOException(e); - } - } -}; diff --git a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/package-info.java b/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/package-info.java deleted file mode 100644 index bc4a864206..0000000000 --- a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for PDF files (read-only). - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.pdf; diff --git a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfReader.java b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/PdfReader.java similarity index 89% rename from dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfReader.java rename to dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/PdfReader.java index 73abcfe4d0..2ea295569a 100644 --- a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfReader.java +++ b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/PdfReader.java @@ -1,158 +1,165 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.pdf; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; - -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; - -/** - * Collection reader for PDF files. Uses simple heuristics to detect headings and paragraphs. - */ -@ResourceMetaData(name="PDFBox PDF Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_PDF}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph"}) -public class PdfReader - extends ResourceCollectionReaderBase -{ - public static final String BUILT_IN = ""; - private static final String NOT_RESTRICTED = "-1"; - - /** - * The location of the substitution table use to post-process the text extracted form the PDF, - * e.g. to convert ligatures to separate characters. - */ - public static final String PARAM_SUBSTITUTION_TABLE_LOCATION = "substitutionTableLocation"; - @ConfigurationParameter(name = PARAM_SUBSTITUTION_TABLE_LOCATION, mandatory = false, defaultValue = BUILT_IN) - private String substitutionTableLocation; - - /** - * The type used to annotate headings. - */ - public static final String PARAM_HEADING_TYPE = "headingType"; - @ConfigurationParameter(name = PARAM_HEADING_TYPE, mandatory = false, defaultValue = BUILT_IN) - private String headingType; - - /** - * The type used to annotate paragraphs. - */ - public static final String PARAM_PARAGRAPH_TYPE = "paragraphType"; - @ConfigurationParameter(name = PARAM_PARAGRAPH_TYPE, mandatory = false, defaultValue = BUILT_IN) - private String paragraphType; - - /** - * The first page to be extracted from the PDF. - */ - public static final String PARAM_START_PAGE = "startPage"; - @ConfigurationParameter(name = PARAM_START_PAGE, mandatory = false, defaultValue = NOT_RESTRICTED) - private int startPage; - - /** - * The last page to be extracted from the PDF. - */ - public static final String PARAM_END_PAGE = "endPage"; - @ConfigurationParameter(name = PARAM_END_PAGE, mandatory = false, defaultValue = NOT_RESTRICTED) - private int endPage; - - private Trie substitutionTable; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - if (BUILT_IN.equals(headingType)) { - headingType = Heading.class.getName(); - } - - if (BUILT_IN.equals(paragraphType)) { - paragraphType = Paragraph.class.getName(); - } - - if (substitutionTableLocation != null) { - if (BUILT_IN.equals(substitutionTableLocation)) { - substitutionTableLocation = "classpath:/de/tudarmstadt/ukp/dkpro/core/io/pdf/substitutionTable.xml"; - } - - InputStream is = null; - try { - URL url = ResourceUtils.resolveLocation(substitutionTableLocation, this, aContext); - is = url.openStream(); - substitutionTable = SubstitutionTrieParser.parse(is); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - finally { - closeQuietly(is); - } - } - else { - substitutionTable = null; - } - } - - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - Resource resource = nextFile(); - initCas(aCAS, resource, null); - - InputStream is = null; - try { - is = resource.getInputStream(); - final Pdf2CasConverter converter = new Pdf2CasConverter(); - converter.setSubstitutionTable(substitutionTable); - converter.setHeadingType(headingType); - converter.setParagraphType(paragraphType); - if (startPage != Integer.parseInt(NOT_RESTRICTED)) { - converter.setStartPage(startPage); - } - if (endPage != Integer.parseInt(NOT_RESTRICTED)) { - converter.setEndPage(endPage); - } - converter.writeText(aCAS, is); - } - finally { - closeQuietly(is); - } - } -} +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pdf; + +import static org.apache.commons.io.IOUtils.closeQuietly; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.io.pdf.internal.Pdf2CasConverter; +import org.dkpro.core.io.pdf.internal.SubstitutionTrieParser; +import org.dkpro.core.io.pdf.internal.Trie; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Collection reader for PDF files. Uses simple heuristics to detect headings and paragraphs. + */ +@ResourceMetaData(name = "PDFBox PDF Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_PDF}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph"}) +public class PdfReader + extends ResourceCollectionReaderBase +{ + public static final String BUILT_IN = ""; + private static final String NOT_RESTRICTED = "-1"; + + /** + * The location of the substitution table use to post-process the text extracted form the PDF, + * e.g. to convert ligatures to separate characters. + */ + public static final String PARAM_SUBSTITUTION_TABLE_LOCATION = "substitutionTableLocation"; + @ConfigurationParameter(name = PARAM_SUBSTITUTION_TABLE_LOCATION, mandatory = false, + defaultValue = BUILT_IN) + private String substitutionTableLocation; + + /** + * The type used to annotate headings. + */ + public static final String PARAM_HEADING_TYPE = "headingType"; + @ConfigurationParameter(name = PARAM_HEADING_TYPE, mandatory = false, defaultValue = BUILT_IN) + private String headingType; + + /** + * The type used to annotate paragraphs. + */ + public static final String PARAM_PARAGRAPH_TYPE = "paragraphType"; + @ConfigurationParameter(name = PARAM_PARAGRAPH_TYPE, mandatory = false, defaultValue = BUILT_IN) + private String paragraphType; + + /** + * The first page to be extracted from the PDF. + */ + public static final String PARAM_START_PAGE = "startPage"; + @ConfigurationParameter(name = PARAM_START_PAGE, mandatory = false, + defaultValue = NOT_RESTRICTED) + private int startPage; + + /** + * The last page to be extracted from the PDF. + */ + public static final String PARAM_END_PAGE = "endPage"; + @ConfigurationParameter(name = PARAM_END_PAGE, mandatory = false, defaultValue = NOT_RESTRICTED) + private int endPage; + + private Trie substitutionTable; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + if (BUILT_IN.equals(headingType)) { + headingType = Heading.class.getName(); + } + + if (BUILT_IN.equals(paragraphType)) { + paragraphType = Paragraph.class.getName(); + } + + if (substitutionTableLocation != null) { + if (BUILT_IN.equals(substitutionTableLocation)) { + substitutionTableLocation = "classpath:/de/tudarmstadt/ukp/dkpro/core/io/pdf/substitutionTable.xml"; + } + + InputStream is = null; + try { + URL url = ResourceUtils.resolveLocation(substitutionTableLocation, this, aContext); + is = url.openStream(); + substitutionTable = SubstitutionTrieParser.parse(is); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + finally { + closeQuietly(is); + } + } + else { + substitutionTable = null; + } + } + + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + Resource resource = nextFile(); + initCas(aCAS, resource, null); + + InputStream is = null; + try { + is = resource.getInputStream(); + final Pdf2CasConverter converter = new Pdf2CasConverter(); + converter.setSubstitutionTable(substitutionTable); + converter.setHeadingType(headingType); + converter.setParagraphType(paragraphType); + if (startPage != Integer.parseInt(NOT_RESTRICTED)) { + converter.setStartPage(startPage); + } + if (endPage != Integer.parseInt(NOT_RESTRICTED)) { + converter.setEndPage(endPage); + } + converter.writeText(aCAS, is); + } + finally { + closeQuietly(is); + } + } +} diff --git a/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Buckets.java b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Buckets.java new file mode 100644 index 0000000000..28563cba16 --- /dev/null +++ b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Buckets.java @@ -0,0 +1,148 @@ +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pdf.internal; + +import java.util.LinkedList; + +/** + * Cluster values into buckets. A new bucket is opened if a new value is added that differs from the + * average value of each of the existing buckets by more than a certain threshold. + * + */ +public class Buckets +{ + private final LinkedList buckets = new LinkedList(); + private final double tolerance; + private boolean modified = true; + private Bucket cachedBest = null; + + Buckets(final double aTolerance) + { + tolerance = aTolerance; + } + + public void put(final double aValue) + { + modified = true; + + // Empty so far + if (buckets.size() == 0) { + newBucket(aValue); + return; + } + + Bucket best = buckets.getFirst(); + double best_diff = Math.abs(best.getValue() - aValue); + for (final Bucket b : buckets) { + final double cur_diff = Math.abs(b.getValue() - aValue); + + // Bail out on exact match + if (cur_diff == 0.0) { + b.add(aValue); + return; + } + + // Found better match? + if (cur_diff < best_diff) { + best = b; + best_diff = cur_diff; + } + } + + // Add to existing bucket if within tolerance, otherwise create new one + if (best_diff < tolerance) { + best.add(aValue); + } + else { + newBucket(aValue); + } + } + + private void newBucket(final double aValue) + { + buckets.add(new Bucket(aValue)); + } + + public Bucket getBest() + { + if (modified == false) { + return cachedBest; + } + + Bucket best = buckets.getFirst(); + for (final Bucket b : buckets) { + if (best.size() < b.size()) { + best = b; + } + } + cachedBest = best; + + return best; + } + + @Override + public String toString() + { + return buckets.toString(); + } + + static class Bucket + { + private final LinkedList values = new LinkedList(); + private double cached_avg = 0.0; + private boolean modified = true; + + double getValue() + { + if (!modified) { + return cached_avg; + } + + modified = false; + + double avg = 0.0; + for (final Double v : values) { + avg += v.doubleValue(); + } + cached_avg = avg / values.size(); + return cached_avg; + } + + int size() + { + return values.size(); + } + + Bucket(final double aValue) + { + values.add(aValue); + } + + void add(final double aValue) + { + modified = true; + values.add(aValue); + } + + @Override + public String toString() + { + return "[" + getValue() + " : " + values.size() + "]"; + } + } +} diff --git a/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/LegacyPDFStreamEngine.java b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/LegacyPDFStreamEngine.java new file mode 100644 index 0000000000..a39b71ef28 --- /dev/null +++ b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/LegacyPDFStreamEngine.java @@ -0,0 +1,340 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pdf.internal; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.fontbox.ttf.TrueTypeFont; +import org.apache.fontbox.util.BoundingBox; +import org.apache.pdfbox.contentstream.PDFStreamEngine; +import org.apache.pdfbox.contentstream.operator.DrawObject; +import org.apache.pdfbox.contentstream.operator.state.Concatenate; +import org.apache.pdfbox.contentstream.operator.state.Restore; +import org.apache.pdfbox.contentstream.operator.state.Save; +import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters; +import org.apache.pdfbox.contentstream.operator.state.SetMatrix; +import org.apache.pdfbox.contentstream.operator.text.BeginText; +import org.apache.pdfbox.contentstream.operator.text.EndText; +import org.apache.pdfbox.contentstream.operator.text.MoveText; +import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading; +import org.apache.pdfbox.contentstream.operator.text.NextLine; +import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing; +import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; +import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling; +import org.apache.pdfbox.contentstream.operator.text.SetTextLeading; +import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode; +import org.apache.pdfbox.contentstream.operator.text.SetTextRise; +import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing; +import org.apache.pdfbox.contentstream.operator.text.ShowText; +import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted; +import org.apache.pdfbox.contentstream.operator.text.ShowTextLine; +import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDCIDFont; +import org.apache.pdfbox.pdmodel.font.PDCIDFontType2; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; +import org.apache.pdfbox.pdmodel.font.PDSimpleFont; +import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; +import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.apache.pdfbox.pdmodel.font.PDType3Font; +import org.apache.pdfbox.pdmodel.font.encoding.GlyphList; +import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.util.Matrix; +import org.apache.pdfbox.util.Vector; + +/** + * LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper. + * + * This class exists only so that we don't break the code of users who have their own subclasses + * of PDFTextStripper. It replaces the good implementation of showGlyph in PDFStreamEngine, with + * a bad implementation which is backwards compatible. + * + * DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. + * THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD. + */ +class LegacyPDFStreamEngine extends PDFStreamEngine +{ + private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class); + + private int pageRotation; + private PDRectangle pageSize; + private Matrix translateMatrix; + private final GlyphList glyphList; + + /** + * Constructor. + */ + LegacyPDFStreamEngine() throws IOException + { + addOperator(new BeginText()); + addOperator(new Concatenate()); + addOperator(new DrawObject()); // special text version + addOperator(new EndText()); + addOperator(new SetGraphicsStateParameters()); + addOperator(new Save()); + addOperator(new Restore()); + addOperator(new NextLine()); + addOperator(new SetCharSpacing()); + addOperator(new MoveText()); + addOperator(new MoveTextSetLeading()); + addOperator(new SetFontAndSize()); + addOperator(new ShowText()); + addOperator(new ShowTextAdjusted()); + addOperator(new SetTextLeading()); + addOperator(new SetMatrix()); + addOperator(new SetTextRenderingMode()); + addOperator(new SetTextRise()); + addOperator(new SetWordSpacing()); + addOperator(new SetTextHorizontalScaling()); + addOperator(new ShowTextLine()); + addOperator(new ShowTextLineAndSpace()); + + // load additional glyph list for Unicode mapping + String path = "org/apache/pdfbox/resources/glyphlist/additional.txt"; + InputStream input = GlyphList.class.getClassLoader().getResourceAsStream(path); + glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input); + } + + /** + * This will initialise and process the contents of the stream. + * + * @param page the page to process + * @throws java.io.IOException if there is an error accessing the stream. + */ + @Override + public void processPage(PDPage page) throws IOException + { + this.pageRotation = page.getRotation(); + this.pageSize = page.getCropBox(); + + if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) + { + translateMatrix = null; + } + else + { + // translation matrix for cropbox + translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), + -pageSize.getLowerLeftY()); + } + super.processPage(page); + } + + /** + * This method was originally written by Ben Litchfield for PDFStreamEngine. + */ + @Override + protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, + Vector displacement) throws IOException + { + // + // legacy calculations which were previously in PDFStreamEngine + // + // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. + // THIS CODE IS DELIBERATELY INCORRECT + // + + PDGraphicsState state = getGraphicsState(); + Matrix ctm = state.getCurrentTransformationMatrix(); + float fontSize = state.getTextState().getFontSize(); + float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f; + Matrix textMatrix = getTextMatrix(); + + BoundingBox bbox = font.getBoundingBox(); + if (bbox.getLowerLeftY() < Short.MIN_VALUE) + { + // PDFBOX-2158 and PDFBOX-3130 + // files by Salmat eSolutions / ClibPDF Library + bbox.setLowerLeftY(- (bbox.getLowerLeftY() + 65536)); + } + // 1/2 the bbox is used as the height todo: why? + float glyphHeight = bbox.getHeight() / 2; + + // sometimes the bbox has very high values, but CapHeight is OK + PDFontDescriptor fontDescriptor = font.getFontDescriptor(); + if (fontDescriptor != null) + { + float capHeight = fontDescriptor.getCapHeight(); + if (capHeight != 0 && (capHeight < glyphHeight || glyphHeight == 0)) + { + glyphHeight = capHeight; + } + } + + // transformPoint from glyph space -> text space + float height; + if (font instanceof PDType3Font) + { + height = font.getFontMatrix().transformPoint(0, glyphHeight).y; + } + else + { + height = glyphHeight / 1000; + } + + float displacementX = displacement.getX(); + // the sorting algorithm is based on the width of the character. As the displacement + // for vertical characters doesn't provide any suitable value for it, we have to + // calculate our own + if (font.isVertical()) + { + displacementX = font.getWidth(code) / 1000; + // there may be an additional scaling factor for true type fonts + TrueTypeFont ttf = null; + if (font instanceof PDTrueTypeFont) + { + ttf = ((PDTrueTypeFont)font).getTrueTypeFont(); + } + else if (font instanceof PDType0Font) + { + PDCIDFont cidFont = ((PDType0Font)font).getDescendantFont(); + if (cidFont instanceof PDCIDFontType2) + { + ttf = ((PDCIDFontType2)cidFont).getTrueTypeFont(); + } + } + if (ttf != null && ttf.getUnitsPerEm() != 1000) + { + displacementX *= 1000f / ttf.getUnitsPerEm(); + } + } + + // + // legacy calculations which were previously in PDFStreamEngine + // + // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. + // THIS CODE IS DELIBERATELY INCORRECT + // + + // (modified) combined displacement, this is calculated *without* taking the character + // spacing and word spacing into account, due to legacy code in TextStripper + float tx = displacementX * fontSize * horizontalScaling; + float ty = displacement.getY() * fontSize; + + // (modified) combined displacement matrix + Matrix td = Matrix.getTranslateInstance(tx, ty); + + // (modified) text rendering matrix + // text space -> device space + Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); + float nextX = nextTextRenderingMatrix.getTranslateX(); + float nextY = nextTextRenderingMatrix.getTranslateY(); + + // (modified) width and height calculations + float dxDisplay = nextX - textRenderingMatrix.getTranslateX(); + float dyDisplay = height * textRenderingMatrix.getScalingFactorY(); + + // + // start of the original method + // + + // Note on variable names. There are three different units being used in this code. + // Character sizes are given in glyph units, text locations are initially given in text + // units, and we want to save the data in display units. The variable names should end with + // Text or Disp to represent if the values are in text or disp units (no glyph units are + // saved). + + float glyphSpaceToTextSpaceFactor = 1 / 1000f; + if (font instanceof PDType3Font) + { + glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX(); + } + + float spaceWidthText = 0; + try + { + // to avoid crash as described in PDFBOX-614, see what the space displacement should be + spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor; + } + catch (Throwable exception) + { + LOG.warn(exception, exception); + } + + if (spaceWidthText == 0) + { + spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor; + // the average space width appears to be higher than necessary so make it smaller + spaceWidthText *= .80f; + } + if (spaceWidthText == 0) + { + spaceWidthText = 1.0f; // if could not find font, use a generic value + } + + // the space width has to be transformed into display units + float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX(); + + // use our additional glyph list for Unicode mapping + unicode = font.toUnicode(code, glyphList); + + // when there is no Unicode mapping available, Acrobat simply coerces the character code + // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want + // this, which is why we leave it until this point in PDFTextStreamEngine. + if (unicode == null) + { + if (font instanceof PDSimpleFont) + { + char c = (char) code; + unicode = new String(new char[] { c }); + } + else + { + // Acrobat doesn't seem to coerce composite font's character codes, instead it + // skips them. See the "allah2.pdf" TestTextStripper file. + return; + } + } + + // adjust for cropbox if needed + Matrix translatedTextRenderingMatrix; + if (translateMatrix == null) + { + translatedTextRenderingMatrix = textRenderingMatrix; + } + else + { + translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, + textRenderingMatrix); + nextX -= pageSize.getLowerLeftX(); + nextY -= pageSize.getLowerLeftY(); + } + + processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(), + pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY, + Math.abs(dyDisplay), dxDisplay, + Math.abs(spaceWidthDisplay), unicode, new int[] { code } , font, fontSize, + (int)(fontSize * textMatrix.getScalingFactorX()))); + } + + /** + * A method provided as an event interface to allow a subclass to perform some specific + * functionality when text needs to be processed. + * + * @param text The text to be processed. + */ + protected void processTextPosition(TextPosition text) + { + // subclasses can override to provide specific functionality + } +} diff --git a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Pdf2CasConverter.java b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Pdf2CasConverter.java similarity index 93% rename from dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Pdf2CasConverter.java rename to dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Pdf2CasConverter.java index 07413eaf1c..3e29c18464 100644 --- a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Pdf2CasConverter.java +++ b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Pdf2CasConverter.java @@ -1,339 +1,345 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.pdf; - -import java.io.IOException; -import java.io.InputStream; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.util.TextPosition; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.text.AnnotationFS; - -/** - * Converts a PDF to a CAS. Uses a substitution table. - * - */ -public class Pdf2CasConverter - extends PdfLayoutEventStripper -{ - private final Log log = LogFactory.getLog(getClass()); - - private Trie substitutionTable; - private CAS cas; - private StringBuilder text; - private Style regionStyle; - private StringBuilder regionText; - private String paragraphType; - private String headingType; - - public Pdf2CasConverter() - throws IOException - { - super(); - } - - public void writeText(final CAS aCas, final InputStream aIs) - throws IOException - { - final PDDocument doc = PDDocument.load(aIs); - - try { - if (doc.isEncrypted()) { - throw new IOException("Encrypted documents currently not supported"); - } - - cas = aCas; - text = new StringBuilder(); - - writeText(doc); - } - finally { - doc.close(); - } - } - - @Override - protected void startDocument(final PDDocument aPdf) - throws IOException - { - if (log.isTraceEnabled()) { - log.trace(""); - } - } - - @Override - protected void endDocument(final PDDocument aPdf) - throws IOException - { - cas.setDocumentText(text.toString()); - - if (log.isTraceEnabled()) { - log.trace(""); - } - } - - @Override - protected void processLineSeparator() - throws IOException - { - if (log.isTraceEnabled()) { - log.trace("
"); - } - - if (regionText == null) { - throw new IllegalStateException("No region started"); - } - - regionText.append("\n"); - } - - @Override - protected void processWordSeparator() - throws IOException - { - if (log.isTraceEnabled()) { - log.trace("< >"); - } - - if (regionText == null) { - throw new IllegalStateException("No region started"); - } - - regionText.append(" "); - } - - @Override - protected void startPage(final int aFirstPage, final int aLastPage, final int aCurrentPage, - final PDPage page) - throws IOException - { - if (log.isTraceEnabled()) { - log.trace(""); - } - - if (log.isDebugEnabled()) { - log.debug("Decoding page " + aCurrentPage + " of " + (aLastPage - aFirstPage + 1)); - } - } - - @Override - protected void endPage(final int aStartPage, final int aEndPage, final int aCurrentPage, - final PDPage page) - throws IOException - { - if (log.isTraceEnabled()) { - log.trace(""); - } - } - - @Override - protected void startRegion(final Style aStyle) - throws IOException - { - if (log.isTraceEnabled()) { - log.trace("<" + aStyle + ">"); - } - - regionStyle = aStyle; - regionText = new StringBuilder(); - } - - @Override - protected void endRegion(final Style aStyle) - throws IOException - { - if (log.isTraceEnabled()) { - log.trace(""); - } - - if (regionText == null) { - throw new IllegalStateException("No region started"); - } - - if (regionStyle != aStyle) { - throw new IllegalStateException("Current region has style " + regionStyle - + ", but closing region has style " + aStyle); - } - - // Append text - int begin = text.length(); - sanitize(regionText); - text.append(regionText.toString()); - int end = text.length(); - text.append('\n'); - - // Add annotation - switch (aStyle) { - case HEADING: - if (headingType != null) { - Type t = cas.getTypeSystem().getType(headingType); - AnnotationFS a = cas.createAnnotation(t, begin, end); - cas.addFsToIndexes(a); - } - break; - case PARAGRAPH: - if (paragraphType != null) { - Type t = cas.getTypeSystem().getType(paragraphType); - AnnotationFS a = cas.createAnnotation(t, begin, end); - cas.addFsToIndexes(a); - } - break; - default: - throw new IllegalStateException("Unknown region style: " + aStyle); - } - - regionStyle = null; - regionText = null; - } - - @Override - protected void writeCharacters(final TextPosition aText) - throws IOException - { - if (log.isTraceEnabled()) { - log.trace("[" + aText.getCharacter() + "]"); - } - - if (regionText == null) { - throw new IllegalStateException("No region started"); - } - - regionText.append(aText.getCharacter()); - } - - private static boolean isValidXMLChar(final int aCodePoint) - { - return (aCodePoint == 0x0009) || (aCodePoint == 0x000A) || (aCodePoint == 0x000D) - || ((0x0020 <= aCodePoint) && (aCodePoint <= 0xD7FF)) - || ((0xE000 <= aCodePoint) && (aCodePoint <= 0xFFFD)); - } - - private StringBuilder sanitize(final StringBuilder aContent) - { - int i = 0; - int lastBreak = 0; - while (i < aContent.length()) { - // Check valid unicode char - if (!isValidXMLChar(aContent.codePointAt(i))) { - aContent.setCharAt(i, ' '); - i++; - continue; - } - - // Set up how many characters we want to skip - int seek = i + 1; - - // Do we maybe have an entity? - if (aContent.charAt(i) == '&') { - // REC 2006-10-21 Some PDFs seem to have entities and others - // don't - // so we may encounter &'s that do not introduce an entity and - // just ignore them. - final int end = aContent.indexOf(";", i); - if (end != -1) { - final String cand = aContent.substring(i, end + 1); - String r = null; - try { - if (cand.startsWith("&#x")) { - final int cp = Integer.parseInt(cand.substring(2, cand.length() - 1), - 16); - r = isValidXMLChar(cp) ? String.valueOf(Character.toChars(cp)) : " "; - } - else if (cand.startsWith("&#")) { - final int cp = Integer.parseInt(cand.substring(2, cand.length() - 1)); - r = isValidXMLChar(cp) ? String.valueOf(Character.toChars(cp)) : " "; - } - else { - // RE 2006-10-22 The chance that there is a & and a - // ; - // together in a string is quite big. Let's be - // tolerant. - } - } - catch (final NumberFormatException e) { - log.warn("Invalid numeric entity in fragment [" + cand + "] - Dropping it."); - } - - // Expand the entity and set proper skip (if found) - if (r != null) { - aContent.replace(i, i + cand.length(), r); - seek = i + r.length(); - } - } - } - - // Match against the Trie after numeric entity expansion is over - if (substitutionTable != null) { - final Trie.Node match = substitutionTable.getNode(aContent, i); - if (match != null) { - aContent.replace(i, i + match.level, match.value); - seek = i + match.value.length(); - } - } - - // Check line breaks - while (i < seek) { - if (aContent.charAt(i) == '\n') { - lastBreak = i; - } - else if (Character.isWhitespace(aContent.codePointAt(i)) && (i > (lastBreak + 79))) { - lastBreak = i; - aContent.replace(i, i + 1, "\n"); - } - i++; - } - } - - return aContent; - } - - public void setSubstitutionTable(Trie aSubstitutionTable) - { - substitutionTable = aSubstitutionTable; - } - - public Trie getSubstitutionTable() - { - return substitutionTable; - } - - public String getParagraphType() - { - return paragraphType; - } - - public void setParagraphType(String aParagraphType) - { - paragraphType = aParagraphType; - } - - public String getHeadingType() - { - return headingType; - } - - public void setHeadingType(String aHeadingType) - { - headingType = aHeadingType; - } -} +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pdf.internal; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.TextPosition; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils; + +/** + * Converts a PDF to a CAS. Uses a substitution table. + */ +public class Pdf2CasConverter + extends PdfLayoutEventStripper +{ + private final Log log = LogFactory.getLog(getClass()); + + private Trie substitutionTable; + private CAS cas; + private StringBuilder text; + private Style regionStyle; + private StringBuilder regionText; + private String paragraphType; + private String headingType; + + public Pdf2CasConverter() + throws IOException + { + super(); + } + + public void writeText(final CAS aCas, final InputStream aIs) + throws IOException + { + final PDDocument doc = PDDocument.load(aIs); + + try { + if (doc.isEncrypted()) { + throw new IOException("Encrypted documents currently not supported"); + } + + cas = aCas; + text = new StringBuilder(); + + writeText(doc); + } + finally { + doc.close(); + } + } + + @Override + protected void startDocument(final PDDocument aPdf) + throws IOException + { + if (log.isTraceEnabled()) { + log.trace(""); + } + } + + @Override + protected void endDocument(final PDDocument aPdf) + throws IOException + { + cas.setDocumentText(text.toString()); + + if (log.isTraceEnabled()) { + log.trace(""); + } + } + + @Override + protected void processLineSeparator() + throws IOException + { + if (log.isTraceEnabled()) { + log.trace("
"); + } + + if (regionText == null) { + throw new IllegalStateException("No region started"); + } + + regionText.append("\n"); + } + + @Override + protected void processWordSeparator() + throws IOException + { + if (log.isTraceEnabled()) { + log.trace("< >"); + } + + if (regionText == null) { + throw new IllegalStateException("No region started"); + } + + regionText.append(" "); + } + + @Override + protected void startPage(final int aFirstPage, final int aLastPage, final int aCurrentPage, + final PDPage page) + throws IOException + { + if (log.isTraceEnabled()) { + log.trace(""); + } + + if (log.isDebugEnabled()) { + log.debug("Decoding page " + aCurrentPage + " of " + (aLastPage - aFirstPage + 1)); + } + } + + @Override + protected void endPage(final int aStartPage, final int aEndPage, final int aCurrentPage, + final PDPage page) + throws IOException + { + if (log.isTraceEnabled()) { + log.trace(""); + } + } + + @Override + protected void startRegion(final Style aStyle) + throws IOException + { + if (log.isTraceEnabled()) { + log.trace("<" + aStyle + ">"); + } + + regionStyle = aStyle; + regionText = new StringBuilder(); + } + + @Override + protected void endRegion(final Style aStyle) + throws IOException + { + if (log.isTraceEnabled()) { + log.trace(""); + } + + if (regionText == null) { + throw new IllegalStateException("No region started"); + } + + if (regionStyle != aStyle) { + throw new IllegalStateException("Current region has style " + regionStyle + + ", but closing region has style " + aStyle); + } + + // Append text + int begin = text.length(); + sanitize(regionText); + text.append(regionText.toString()); + int end = text.length(); + text.append('\n'); + + // Trim leading/trailing whitespace + int[] offsets = {begin, end}; + TrimUtils.trim(text, offsets); + + // Add annotation + switch (aStyle) { + case HEADING: + if (headingType != null) { + Type t = cas.getTypeSystem().getType(headingType); + AnnotationFS a = cas.createAnnotation(t, offsets[0], offsets[1]); + cas.addFsToIndexes(a); + } + break; + case PARAGRAPH: + if (paragraphType != null) { + Type t = cas.getTypeSystem().getType(paragraphType); + AnnotationFS a = cas.createAnnotation(t, offsets[0], offsets[1]); + cas.addFsToIndexes(a); + } + break; + default: + throw new IllegalStateException("Unknown region style: " + aStyle); + } + + regionStyle = null; + regionText = null; + } + + @Override + protected void writeCharacters(final TextPosition aText) + throws IOException + { + if (log.isTraceEnabled()) { + log.trace("[" + aText.getUnicode() + "]"); + } + + if (regionText == null) { + throw new IllegalStateException("No region started"); + } + + regionText.append(aText.getUnicode()); + } + + private static boolean isValidXMLChar(final int aCodePoint) + { + return (aCodePoint == 0x0009) || (aCodePoint == 0x000A) || (aCodePoint == 0x000D) + || ((0x0020 <= aCodePoint) && (aCodePoint <= 0xD7FF)) + || ((0xE000 <= aCodePoint) && (aCodePoint <= 0xFFFD)); + } + + private StringBuilder sanitize(final StringBuilder aContent) + { + int i = 0; + int lastBreak = 0; + while (i < aContent.length()) { + // Check valid unicode char + if (!isValidXMLChar(aContent.codePointAt(i))) { + aContent.setCharAt(i, ' '); + i++; + continue; + } + + // Set up how many characters we want to skip + int seek = i + 1; + + // Do we maybe have an entity? + if (aContent.charAt(i) == '&') { + // REC 2006-10-21 Some PDFs seem to have entities and others + // don't + // so we may encounter &'s that do not introduce an entity and + // just ignore them. + final int end = aContent.indexOf(";", i); + if (end != -1) { + final String cand = aContent.substring(i, end + 1); + String r = null; + try { + if (cand.startsWith("&#x")) { + final int cp = Integer.parseInt(cand.substring(2, cand.length() - 1), + 16); + r = isValidXMLChar(cp) ? String.valueOf(Character.toChars(cp)) : " "; + } + else if (cand.startsWith("&#")) { + final int cp = Integer.parseInt(cand.substring(2, cand.length() - 1)); + r = isValidXMLChar(cp) ? String.valueOf(Character.toChars(cp)) : " "; + } + else { + // RE 2006-10-22 The chance that there is a & and a + // ; + // together in a string is quite big. Let's be + // tolerant. + } + } + catch (final NumberFormatException e) { + log.warn("Invalid numeric entity in fragment [" + cand + "] - Dropping it."); + } + + // Expand the entity and set proper skip (if found) + if (r != null) { + aContent.replace(i, i + cand.length(), r); + seek = i + r.length(); + } + } + } + + // Match against the Trie after numeric entity expansion is over + if (substitutionTable != null) { + final Trie.Node match = substitutionTable.getNode(aContent, i); + if (match != null) { + aContent.replace(i, i + match.level, match.value); + seek = i + match.value.length(); + } + } + + // Check line breaks + while (i < seek) { + if (aContent.charAt(i) == '\n') { + lastBreak = i; + } + else if (Character.isWhitespace(aContent.codePointAt(i)) + && (i > (lastBreak + 79))) { + lastBreak = i; + aContent.replace(i, i + 1, "\n"); + } + i++; + } + } + + return aContent; + } + + public void setSubstitutionTable(Trie aSubstitutionTable) + { + substitutionTable = aSubstitutionTable; + } + + public Trie getSubstitutionTable() + { + return substitutionTable; + } + + public String getParagraphType() + { + return paragraphType; + } + + public void setParagraphType(String aParagraphType) + { + paragraphType = aParagraphType; + } + + public String getHeadingType() + { + return headingType; + } + + public void setHeadingType(String aHeadingType) + { + headingType = aHeadingType; + } +} diff --git a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfLayoutEventStripper.java b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/PdfLayoutEventStripper.java similarity index 86% rename from dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfLayoutEventStripper.java rename to dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/PdfLayoutEventStripper.java index 798ca2b612..089eb1084f 100644 --- a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfLayoutEventStripper.java +++ b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/PdfLayoutEventStripper.java @@ -1,1197 +1,1150 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * This code is based on the PDFTextStripper written by Ben Litchfield from - * the PDFbox 0.7.x project and licensed under the BSD license. In accordance - * with the terms of this license, the following copyright statement is retained: - * - * Copyright (c) 2003-2007, www.pdfbox.org - * All rights reserved. - * - * Furthermore the modified code is re-licensed under the Apache License, - * Version 2.0 as stated above. - */ -package de.tudarmstadt.ukp.dkpro.core.io.pdf; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Vector; - -import org.apache.pdfbox.cos.COSStream; -import org.apache.pdfbox.exceptions.CryptographyException; -import org.apache.pdfbox.exceptions.InvalidPasswordException; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.common.PDStream; -import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; -import org.apache.pdfbox.util.PDFStreamEngine; -import org.apache.pdfbox.util.ResourceLoader; -import org.apache.pdfbox.util.TextPosition; - -/** - * This class will take a PDF document and strip out all of the text and ignore the formatting and - * such. Please note; it is up to clients of this class to verify that a specific user has the - * correct permissions to extract text from the PDF document. - *

- * This class is based on the pdfbox 1.7.0 PDFTextStripper class and was substantially modified and - * enhanced for basic paragraph and heading detection. Unfortunately it was not possible to add - * these enhancements through sub-classing, thus the code was copied and adapted. - */ -public abstract class PdfLayoutEventStripper - extends PDFStreamEngine -{ - public static enum Values - { - LEFT, RIGHT, TOP, BOTTOM, LINESPACING, LINEHEIGHT - } - - public static enum Style - { - PAGE, PARAGRAPH, HEADING - } - - private PDDocument document; - - private int currentPageNo = 0; - private int startPage = 1; - private int maxPage = 0; - private int endPage = Integer.MAX_VALUE; - private boolean suppressDuplicateOverlappingText = true; - private boolean shouldSeparateByBeads = true; - - private List pageArticles = null; - /** - * The charactersByArticle is used to extract text by article divisions. For example a PDF that - * has two columns like a newspaper, we want to extract the first column and then the second - * column. In this example the PDF would have 2 beads(or articles), one for each column. The - * size of the charactersByArticle would be 5, because not all text on the screen will fall into - * one of the articles. The five divisions are shown below - * - * Text before first article first article text text between first article and second article - * second article text text after second article - * - * Most PDFs won't have any beads, so charactersByArticle will contain a single entry. - */ - protected Vector> charactersByArticle = new Vector>(); - - private final Map> characterListMapping = new HashMap>(); - - /** - * Instantiate a new PDFTextStripper object. This object will load properties from - * Resources/PDFTextStripper.properties. - * - * @throws IOException - * If there is an error loading the properties. - */ - public PdfLayoutEventStripper() - throws IOException - { - super(ResourceLoader.loadProperties( - "org/apache/pdfbox/resources/PDFTextStripper.properties", true)); - } - - /** - * Instantiate a new PDFTextStripper object. Loading all of the operator mappings from the - * properties object that is passed in. - * - * @param props - * The properties containing the mapping of operators to PDFOperator classes. - * - * @throws IOException - * If there is an error reading the properties. - */ - public PdfLayoutEventStripper(final Properties props) - throws IOException - { - super(props); - } - - /** - * This will take a PDDocument and write the text of that document to the print writer. - * - * @param doc - * The document to get the data from. - * - * @throws IOException - * If the doc is in an invalid state. - */ - public void writeText(final PDDocument doc) - throws IOException - { - resetEngine(); - - currentPageNo = 0; - document = doc; - startDocument(document); - - if (document.isEncrypted()) { - // We are expecting non-encrypted documents here, but it is common - // for users to pass in a document that is encrypted with an empty - // password (such a document appears to not be encrypted by - // someone viewing the document, thus the confusion). We will - // attempt to decrypt with the empty password to handle this case. - // - try { - document.decrypt(""); - } - catch (CryptographyException e) { - throw new IOException("Error decrypting document, details: ", e); - } - catch (InvalidPasswordException e) { - throw new IOException("Error: document is encrypted", e); - } - } - - processPages(document.getDocumentCatalog().getAllPages()); - endDocument(document); - } - - /** - * This will process all of the pages and the text that is in them. - * - * @param pages - * The pages object in the document. - * - * @throws IOException - * If there is an error parsing the text. - */ - protected void processPages(List pages) - throws IOException - { - maxPage = pages.size(); - - for (final PDPage page : pages) { - currentPageNo++; - final PDStream contentStream = page.getContents(); - if (contentStream != null) { - final COSStream contents = contentStream.getStream(); - processPage(page, contents); - } - } - } - - /** - * This will process the contents of a page. - * - * @param page - * The page to process. - * @param content - * The contents of the page. - * - * @throws IOException - * If there is an error processing the page. - */ - protected void processPage(final PDPage page, final COSStream content) - throws IOException - { - if ((currentPageNo >= startPage) && (currentPageNo <= endPage)) { - startPage(startPage, Math.min(maxPage, endPage), currentPageNo, page); - pageArticles = page.getThreadBeads(); - int numberOfArticleSections = 1 + pageArticles.size() * 2; - if (!shouldSeparateByBeads) { - numberOfArticleSections = 1; - } - final int originalSize = charactersByArticle.size(); - charactersByArticle.setSize(numberOfArticleSections); - for (int i = 0; i < numberOfArticleSections; i++) { - if (numberOfArticleSections < originalSize) { - charactersByArticle.get(i).clear(); - } - else { - charactersByArticle.set(i, new ArrayList()); - } - } - - characterListMapping.clear(); - - // processStream will call showCharacter were we will simply - // collect all the TextPositions for the page - processStream(page, page.findResources(), content); - - // Now we do the real processing - for (int i = 0; i < charactersByArticle.size(); i++) { - processArticle(charactersByArticle.get(i)); - } - - endPage(startPage, endPage, currentPageNo, page); - } - } - - /** - * This method tries do detect headings and paragraphs and line boundaries. - * - * @param textList - * the text. - * @throws IOException - * if there is an error writing to the stream. - */ - protected void processArticle(final List textList) - throws IOException - { - // Nothing to do in this article? - if (textList.size() == 0) { - return; - } - - // System.out.println("XScale: "+textList.get(0).getXScale()); - // System.out.println("YScale: "+textList.get(0).getYScale()); - - final int prediction_depth = 10; - Prediction pred = null; - final Block block = new Block(textList, 0); - Line currentLine = null; - - boolean newRegion = false; - Style currentStyle = null; - Style prevStyle = null; - int cur = 0; - while (cur < textList.size()) { - // Initialize the line (if not already done) - if (currentLine == null) { - currentLine = new Line(textList, cur); - - // Get the style for the line (base on style for current - // element) - prevStyle = currentStyle; - currentStyle = getStyle(textList.get(cur)); - - // Test for a style change - if ((newRegion) || (prevStyle != currentStyle)) { - if (newRegion) { - newRegion = false; - } - // On a style change issue the proper events - if (prevStyle != null) { - endRegion(prevStyle); - } - startRegion(currentStyle); - pred = predictGeneralStructure(textList, cur, prediction_depth); - } - } - - // Check if we left the line - if (!currentLine.withinLine(textList.get(cur)) - && !currentLine.isSuperscript(textList.get(cur)) - && !currentLine.isSubscript(textList.get(cur))) { - // We left the line - currentLine = null; - - // Check if we left the region - final boolean columnSwitch = isColumnSwitch(textList.get(cur), block); - final boolean leftIndented = isLeftIndented(textList.get(cur), pred); - final boolean leftOutdented = isLeftOutdented(textList.get(cur), pred); - // boolean fontSwitch = (fontSize[cur] != fontSize[cur-1]); - final boolean vAdjacent = isVerticallyAdjacent(textList.get(cur).getY(), textList - .get(cur - 1).getY(), block.linespacing); - - if (!columnSwitch && !leftIndented && !leftOutdented && - /* !fontSwitch && */vAdjacent) { - // Same region. Issue a line separator and restart - processLineSeparator(); - } - else { - // New region - newRegion = true; - block.reset(cur); - - if ((pred == null) || !vAdjacent) { - pred = predictGeneralStructure(textList, cur, prediction_depth); - } - else if (vAdjacent) { - // If the block is directly adjacent, we may be better - // of - // with the old prediction... let's see if we can get a - // comparatively good new one. - final Prediction new_pred = predictGeneralStructure(textList, cur, - prediction_depth); - final boolean badPred = isSignifiantlyWorse(new_pred.quality, pred.quality, - 0.4); - if (!badPred) { - pred = new_pred; - } - } - } - - continue; // Start again to create a new currentLine - } - - // Ok, we are in the same line still. - - // Let's check if the block is adjacent or needs a space - // if (!isRightAdjacent(textList, cur, cur-1, cur-2)) { - if ((cur > 0) && !isNextChar(textList.get(cur), textList.get(cur - 1))) { - processWordSeparator(); - } - - // Grow the current block to calculate better spacings. - block.grow(cur); - - // Write of the characters and advance. - writeCharacters(textList.get(cur)); - cur++; - } - - // Close region - if (currentStyle != null) { - endRegion(currentStyle); - } - } - - /** - * This will show add a character to the list of characters to be printed to the text file. - * - * @param text - * The description of the character to display. - */ - @Override - protected void processTextPosition(final TextPosition text) - { - boolean showCharacter = true; - if (suppressDuplicateOverlappingText) { - showCharacter = false; - final String textCharacter = text.getCharacter(); - final float textX = text.getX(); - final float textY = text.getY(); - List sameTextCharacters = characterListMapping.get(textCharacter); - if (sameTextCharacters == null) { - sameTextCharacters = new ArrayList(); - characterListMapping.put(textCharacter, sameTextCharacters); - } - - // RDD - Here we compute the value that represents the end of the - // rendered - // text. This value is used to determine whether subsequent text - // rendered - // on the same line overwrites the current text. - // - // We subtract any positive padding to handle cases where extreme - // amounts - // of padding are applied, then backed off (not sure why this is - // done, but there - // are cases where the padding is on the order of 10x the character - // width, and - // the TJ just backs up to compensate after each character). Also, - // we subtract - // an amount to allow for kerning (a percentage of the width of the - // last - // character). - // - boolean suppressCharacter = false; - final float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f; - for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) { - final TextPosition character = sameTextCharacters.get(i); - final String charCharacter = character.getCharacter(); - final float charX = character.getX(); - final float charY = character.getY(); - // only want to suppress - - if (charCharacter != null && - // charCharacter.equals( textCharacter ) && - within(charX, textX, tolerance) && within(charY, textY, tolerance)) { - suppressCharacter = true; - } - } - if (!suppressCharacter && (text.getCharacter() != null) - && (text.getCharacter().length() > 0)) { - sameTextCharacters.add(text); - showCharacter = true; - } - } - - if (showCharacter) { - // if we are showing the character then we need to determine which - // article it belongs to. - int foundArticleDivisionIndex = -1; - int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; - int notFoundButFirstLeftArticleDivisionIndex = -1; - int notFoundButFirstAboveArticleDivisionIndex = -1; - final float x = text.getX(); - final float y = text.getY(); - if (shouldSeparateByBeads) { - for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) { - final PDThreadBead bead = pageArticles.get(i); - if (bead != null) { - final PDRectangle rect = bead.getRectangle(); - if (rect.contains(x, y)) { - foundArticleDivisionIndex = i * 2 + 1; - } - else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) - && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { - notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; - } - else if (x < rect.getLowerLeftX() - && notFoundButFirstLeftArticleDivisionIndex == -1) { - notFoundButFirstLeftArticleDivisionIndex = i * 2; - } - else if (y < rect.getUpperRightY() - && notFoundButFirstAboveArticleDivisionIndex == -1) { - notFoundButFirstAboveArticleDivisionIndex = i * 2; - } - } - else { - foundArticleDivisionIndex = 0; - } - } - } - else { - foundArticleDivisionIndex = 0; - } - int articleDivisionIndex = -1; - if (foundArticleDivisionIndex != -1) { - articleDivisionIndex = foundArticleDivisionIndex; - } - else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { - articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; - } - else if (notFoundButFirstLeftArticleDivisionIndex != -1) { - articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; - } - else if (notFoundButFirstAboveArticleDivisionIndex != -1) { - articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; - } - else { - articleDivisionIndex = charactersByArticle.size() - 1; - } - final List textList = charactersByArticle.get(articleDivisionIndex); - textList.add(text); - } - } - - /** - * This will determine of two floating point numbers are within a specified variance. - * - * @param first - * The first number to compare to. - * @param second - * The second number to compare to. - * @param variance - * The allowed variance. - * @return if the number is within the specified variance. - */ - private static boolean within(final float first, final float second, final float variance) - { - return second > first - variance && second < first + variance; - } - - private static float getWordSpacing(final TextPosition position) - { - if (position == null) { - return 0; - } - - float wordSpacing = 0; - - if (wordSpacing == 0) { - // try to get width of a space character - wordSpacing = position.getWidthOfSpace(); - // if still zero fall back to getting the width of the current - // character - if (wordSpacing == 0) { - wordSpacing = position.getWidth(); - } - } - - return wordSpacing; - } - - private static boolean validPosition(final List textList, final int pos) - { - return (pos >= 0) && (pos < textList.size()); - } - - /** - * Detects whether text in two positions is on the same line. This method is a bit fuzzy so we - * also get potential superscripts and subscripts. - * - * @param cur current position. - * @param prev previous position. - * @return if both are in the same line. - */ - private static boolean isSameLine(final TextPosition cur, final TextPosition prev) - { - if (cur.getY() == prev.getY()) { - return true; - } - else { - final float prevCenter = prev.getY() + prev.getHeight() / 2.0f; - final float prevHeight = prev.getHeight(); - final float curCenter = cur.getY() + cur.getHeight() / 2.0f; - - final boolean result = Math.abs(curCenter - prevCenter) < (prevHeight * 0.25f); - - // if (!result) { - // _log.debug("sameLine ["+result+"]"+ - // "[px:"+f_y1[prev]+"-"+f_y2[prev]+":"+contents[prev]+"]"+ - // "[cx:"+f_y1[cur]+"-"+f_y2[cur]+":"+contents[cur]+"]"); - // } - - return result; - } - } - - /** - * Tests if two objects are vertically adjacent or if they are so far away from each other that - * they have to be considered different blocks. - * - * @param cur_top - * current top. - * @param prev_top - * previous top. - * @param spacing - * spacing. - * @return if the two objects are verticalla adjacent. - */ - private static boolean isVerticallyAdjacent(final float cur_top, final float prev_top, - final float spacing) - { - /* set vertical error margin */ - final float verterr = (float) (spacing * 1.27); - - final boolean aboveThreshold = (cur_top < (prev_top + verterr)); - final boolean belowprev = (cur_top > prev_top); - - return aboveThreshold && belowprev; - } - - private static boolean isLeftIndented(final TextPosition cur, final Prediction pred) - { - return cur.getX() > (pred.left + (pred.linespacing * 0.2)); - } - - private static boolean isLeftOutdented(final TextPosition cur, final Prediction pred) - { - return cur.getX() < (pred.left - (pred.linespacing * 0.2)); - } - - /** - * Check if the current fragment is in a new column. - * - * @param cur - * current text position. - * @param block - * current block. - * @return if the fragment is in a new column. - */ - private static boolean isColumnSwitch(final TextPosition cur, final Block block) - { - return (cur.getY() < block.top); // && (f_x1[cur] > block.right); - } - - private static boolean isSignifiantlyWorse(final double qnew, final double qold, - final double limit) - { - final double deviation = Math.abs(((qnew - qold) / (qnew + qold))); - final boolean result = (deviation > limit) && (qnew < qold); - // if (_log.isTraceEnabled()) { - // _log.trace("Deviation: "+deviation+ " - "+(result?"BAD":"OK")); - // } - return result; - } - - /** - * Determine whether we need to insert a word separator between the two positions or not. - * - * Adapted from PDFBox PDFTextStripper.flushText() - * - * @param cur - * current position. - * @param prev - * previous position. - * @return if the two positions are immediately adjacent. - */ - private static boolean isNextChar(final TextPosition cur, final TextPosition prev) - { - float lastWordSpacing = getWordSpacing(prev); - final float wordSpacing = getWordSpacing(cur); - float startOfNextWordX; - final float endOfLastTextX = prev.getX() + prev.getWidth(); - - // RDD - We add a conservative approximation for space determination. - // basically if there is a blank area between two characters that is - // equal to some percentage of the word spacing then that will be the - // start of the next word - if (lastWordSpacing <= 0) { - startOfNextWordX = endOfLastTextX + (wordSpacing * 0.50f); - } - else { - startOfNextWordX = endOfLastTextX + (((wordSpacing + lastWordSpacing) / 2f) * 0.50f); - } - - lastWordSpacing = wordSpacing; - - // if (startOfNextWordX > cur.getX()) { - // System.out.print("{O:"+(startOfNextWordX - cur.getX())+"}"); - // } - - if (startOfNextWordX != -1 && startOfNextWordX < cur.getX() && prev != null && - // only bother adding a space if the last character was not a - // space - prev.getCharacter() != null && !prev.getCharacter().endsWith(" ")) { - return false; - } - else { - return true; - } - } - - private List collectLines(final List textList, final int blk_start, - final int depth) - { - final ArrayList lines = new ArrayList(depth); - Line l = new Line(textList, blk_start); - lines.add(l); - for (int i = 1; i < depth && l.hasNextLine(); i++) { - l = l.getNextLine(); - - // Bail out if we have a potential column switch - if (l.top < lines.get(lines.size() - 1).bottom) { - break; - } - lines.add(l); - } - return lines; - } - - /** - * Return a block with the probable linespacing, lineheight and left and right borders. - * - * @param textList - * text. - * @param blk_start - * block start. - * @param depth - * depth. - * @return structure prediction. - */ - private Prediction predictGeneralStructure(final List textList, - final int blk_start, final int depth) - { - // Try to fetch the next lines up to depth - final List lines = collectLines(textList, blk_start, depth); - - // Calculate the line block parameters - LineBlock lb = new LineBlock(lines); - - // Iterate once more over the lines because we may have a big spacing - // indicating a new block. - - final List lines2 = new ArrayList(depth); - final Line l = lines.get(0); - lines2.add(l); - for (int i = 1; i < lines.size(); i++) { - // Bail out if we have too much distance - if (!isVerticallyAdjacent(lines.get(i).top, lines.get(i - 1).top, lb.linespacing)) { - break; - } - lines2.add(lines.get(i)); - } - - // Get the bounds in buckets - final Buckets left_buckets = new Buckets(lb.linespacing * 0.1); - final Buckets right_buckets = new Buckets(lb.linespacing * 0.1); - for (final Line ln : lines2) { - left_buckets.put(ln.left); - right_buckets.put(ln.right); - } - - // if (_log.isTraceEnabled()) { - // _log.trace("Left: size:"+left_buckets.getBest().size()+" - lines:"+lines2.size()+" - depth:"+depth); - // } - - lb = new LineBlock(lines2); - - // Return values - final Prediction result = new Prediction(); - result.linespacing = lb.linespacing; - result.lineheight = lb.avglineheight; - result.left = (float) left_buckets.getBest().getValue(); - result.right = (float) right_buckets.getBest().getValue(); - result.quality = (float) left_buckets.getBest().size() / (float) depth; - - return result; - } - - protected Style getStyle(final TextPosition pos) - { - if ((pos.getFontSize() * pos.getYScale()) > 14) { - return Style.HEADING; - } - else { - return Style.PARAGRAPH; - } - } - - /** - * This method is available for subclasses of this class. It will be called before processing of - * the document start. - * - * @param pdf - * The PDF document that is being processed. - * @throws IOException - * If an IO error occurs. - */ - protected abstract void startDocument(PDDocument pdf) - throws IOException; - - /** - * This method is available for subclasses of this class. It will be called after processing of - * the document finishes. - * - * @param pdf - * The PDF document that is being processed. - * @throws IOException - * If an IO error occurs. - */ - protected abstract void endDocument(PDDocument pdf) - throws IOException; - - /** - * Start a new region. - * - * @param style - * the style. - * @throws IOException - * If there is any error writing to the stream. - */ - protected abstract void startRegion(Style style) - throws IOException; - - /** - * End a region. - * - * @param style - * the style. - * @throws IOException - * If there is any error writing to the stream. - */ - protected abstract void endRegion(Style style) - throws IOException; - - /** - * Start a new page. - * - * @param firstPage - * first page. - * @param lastPage - * last page. - * @param currentPage - * current page. - * @param page - * The page we are about to process. - * - * @throws IOException - * If there is any error writing to the stream. - */ - protected abstract void startPage(int firstPage, int lastPage, int currentPage, PDPage page) - throws IOException; - - /** - * End a page. - * - * @param firstPage - * first page. - * @param lastPage - * last page. - * @param currentPage - * current page. - * @param page - * The page we are about to process. - * - * @throws IOException - * If there is any error writing to the stream. - */ - protected abstract void endPage(int firstPage, int lastPage, int currentPage, PDPage page) - throws IOException; - - protected abstract void processLineSeparator() - throws IOException; - - protected abstract void processWordSeparator() - throws IOException; - - /** - * Write the string to the output stream. - * - * @param text - * The text to write to the stream. - * @throws IOException - * If there is an error when writing the text. - */ - protected abstract void writeCharacters(TextPosition text) - throws IOException; - - /** - * This is the page that the text extraction will start on. The pages start at page 1. For - * example in a 5 page PDF document, if the start page is 1 then all pages will be extracted. If - * the start page is 4 then pages 4 and 5 will be extracted. The default value is 1. - * - * @return Value of property startPage. - */ - public int getStartPage() - { - return startPage; - } - - /** - * This will set the first page to be extracted by this class. - * - * @param startPageValue - * New value of property startPage. - */ - public void setStartPage(final int startPageValue) - { - startPage = startPageValue; - } - - /** - * This will get the last page that will be extracted. This is inclusive, for example if a 5 - * page PDF an endPage value of 5 would extract the entire document, an end page of 2 would - * extract pages 1 and 2. This defaults to Integer.MAX_VALUE such that all pages of the pdf will - * be extracted. - * - * @return Value of property endPage. - */ - public int getEndPage() - { - return endPage; - } - - /** - * This will set the last page to be extracted by this class. - * - * @param endPageValue - * New value of property endPage. - */ - public void setEndPage(final int endPageValue) - { - endPage = endPageValue; - } - - /** - * @return Returns the suppressDuplicateOverlappingText. - */ - public boolean shouldSuppressDuplicateOverlappingText() - { - return suppressDuplicateOverlappingText; - } - - /** - * Get the current page number that is being processed. - * - * @return A 1 based number representing the current page. - */ - protected int getCurrentPageNo() - { - return currentPageNo; - } - - /** - * Character strings are grouped by articles. It is quite common that there will only be a - * single article. This returns a List that contains List objects, the inner lists will contain - * TextPosition objects. - * - * @return A double List of TextPositions for all text strings on the page. - */ - protected List> getCharactersByArticle() - { - return charactersByArticle; - } - - /** - * By default the text stripper will attempt to remove text that overlapps each other. Word - * paints the same character several times in order to make it look bold. By setting this to - * false all text will be extracted, which means that certain sections will be duplicated, but - * better performance will be noticed. - * - * @param suppressDuplicateOverlappingTextValue - * The suppressDuplicateOverlappingText to set. - */ - public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) - { - this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; - } - - /** - * This will tell if the text stripper should separate by beads. - * - * @return If the text will be grouped by beads. - */ - public boolean shouldSeparateByBeads() - { - return shouldSeparateByBeads; - } - - /** - * Set if the text stripper should group the text output by a list of beads. The default value - * is true! - * - * @param aShouldSeparateByBeads - * The new grouping of beads. - */ - public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) - { - this.shouldSeparateByBeads = aShouldSeparateByBeads; - } - - static class LineBlock - { - final List lines; - final float linespacing; - final float avglineheight; - - LineBlock(final List ls) - { - lines = ls; - linespacing = calcLinespacing(); - avglineheight = calcAvgLineheight(); - } - - float calcLinespacing() - { - if (lines.size() == 1) { - return Math.abs(lines.get(0).top - lines.get(0).bottom); - } - - float avgls = 0.0f; - for (int i = 0; i < (lines.size() - 1); i++) { - avgls += Math.abs(lines.get(i).top - lines.get(i + 1).top); - } - return avgls / (lines.size() - 1); - } - - private float calcAvgLineheight() - { - float avglh = 0.0f; - for (final Line l : lines) { - avglh += l.lineheight; - } - return avglh / lines.size(); - } - } - - static class Prediction - { - float lineheight; - float linespacing; - float left; - float right; - float quality; - } - - static class Line - extends BasicBlock - { - final int start; - final int end; - final float lineheight; - - Line(final List tl, final int pos) - { - super(tl); - start = pos; - end = findEnd(); - lineheight = growAndCalcLineheight(); - } - - private float growAndCalcLineheight() - { - float h = textList.get(start).getHeight(); - reset(start); - for (int i = start + 1; i < end; i++) { - h = Math.max(h, textList.get(i).getHeight()); - grow(i); - } - return h; - } - - private int findEnd() - { - int cur = start; - while (validPosition(textList, cur) - && isSameLine(textList.get(cur), textList.get(start))) { - cur++; - } - return cur; - } - - boolean hasNextLine() - { - return validPosition(textList, end); - } - - Line getNextLine() - { - if (hasNextLine()) { - return new Line(textList, end); - } - else { - return null; - } - } - - /** - * Return true if the text position is within the line height boundaries. Left and right - * boundaries are not checked. - * - * @param pos - * text position. - * @return if the position is within the line. - */ - boolean withinLine(final TextPosition pos) - { - final boolean underTop = top <= pos.getY(); - final boolean overBottom = (pos.getY() + pos.getHeight()) <= bottom; - return underTop && overBottom; - } - - boolean isSuperscript(final TextPosition pos) - { - final boolean underTop = (top - lineheight * 0.6f) <= pos.getY(); - final boolean overBottom = (pos.getY() + pos.getHeight()) <= bottom; - return underTop && overBottom; - } - - boolean isSubscript(final TextPosition pos) - { - final boolean underTop = (top <= pos.getY()); - final boolean overBottom = (pos.getY() + pos.getHeight() + lineheight * 0.6f) <= bottom; - return underTop && overBottom; - } - - @Override - public String toString() - { - return "[t:" + top + " b:" + bottom + "|" + content + "]"; - } - } - - static class BasicBlock - { - float left; - float top; - float right; - float bottom; - int lines; - int last_pos; - final List textList; - - // This is for debugging purposes only. - final StringBuilder content = new StringBuilder(); - - public BasicBlock(final List tl) - { - textList = tl; - } - - float getValue(final Values v) - { - switch (v) { - case BOTTOM: - return bottom; - case TOP: - return top; - case RIGHT: - return right; - case LEFT: - return left; - default: - throw new IllegalArgumentException("Unsupported value"); - } - } - - void normalize() - { - if (top < bottom) { - final float b = top; - top = bottom; - bottom = b; - } - - if (left > right) { - final float l = left; - left = right; - right = l; - } - } - - void reset(final int pos) - { - final TextPosition p = textList.get(pos); - - last_pos = pos; - lines = 0; - left = p.getX(); - right = p.getX() + p.getWidth(); - top = p.getY(); - bottom = p.getY() + p.getHeight(); - - content.setLength(0); - content.append(p.getCharacter()); - } - - void grow(final int pos) - { - final TextPosition p = textList.get(pos); - - if (!isSameLine(p, textList.get(last_pos))) { - lines++; - } - - last_pos = pos; - left = Math.min(p.getX(), left); - right = Math.max(p.getX() + p.getWidth(), right); - top = Math.min(p.getY(), top); - bottom = Math.max(p.getY() + p.getHeight(), bottom); - - content.append(" "); - content.append(p.getCharacter()); - } - } - - class Block - extends BasicBlock - { - float linespacing; - float lineheight; - - Block(final List textList, final int pos) - { - super(textList); - reset(pos); - } - - @Override - void reset(final int pos) - { - super.reset(pos); - linespacing = new LineBlock(collectLines(textList, pos, 3)).linespacing; - lineheight = Math.abs(bottom - top); - } - - @Override - void grow(final int pos) - { - super.grow(pos); - lineheight = Math.max(lineheight, textList.get(pos).getHeight()); - } - } -} +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * This code is based on the PDFTextStripper written by Ben Litchfield from + * the PDFbox 0.7.x project and licensed under the BSD license. In accordance + * with the terms of this license, the following copyright statement is retained: + * + * Copyright (c) 2003-2007, www.pdfbox.org + * All rights reserved. + * + * Furthermore the modified code is re-licensed under the Apache License, + * Version 2.0 as stated above. + */ +package org.dkpro.core.io.pdf.internal; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Vector; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; +import org.apache.pdfbox.text.TextPosition; + +/** + * This class will take a PDF document and strip out all of the text and ignore the formatting and + * such. Please note; it is up to clients of this class to verify that a specific user has the + * correct permissions to extract text from the PDF document. + *

+ * This class is based on the pdfbox 1.7.0 PDFTextStripper class and was substantially modified and + * enhanced for basic paragraph and heading detection. Unfortunately it was not possible to add + * these enhancements through sub-classing, thus the code was copied and adapted. + */ +public abstract class PdfLayoutEventStripper + extends LegacyPDFStreamEngine +{ + public static enum Values + { + LEFT, RIGHT, TOP, BOTTOM, LINESPACING, LINEHEIGHT + } + + public static enum Style + { + PAGE, PARAGRAPH, HEADING + } + + private PDDocument document; + + private int currentPageNo = 0; + private int startPage = 1; + private int maxPage = 0; + private int endPage = Integer.MAX_VALUE; + private boolean suppressDuplicateOverlappingText = true; + private boolean shouldSeparateByBeads = true; + + private List pageArticles = null; + /** + * The charactersByArticle is used to extract text by article divisions. For example a PDF that + * has two columns like a newspaper, we want to extract the first column and then the second + * column. In this example the PDF would have 2 beads(or articles), one for each column. The + * size of the charactersByArticle would be 5, because not all text on the screen will fall into + * one of the articles. The five divisions are shown below + * + * Text before first article first article text text between first article and second article + * second article text text after second article + * + * Most PDFs won't have any beads, so charactersByArticle will contain a single entry. + */ + protected Vector> charactersByArticle = new Vector<>(); + + private final Map> characterListMapping = new HashMap<>(); + + public PdfLayoutEventStripper() throws IOException + { + super(); + } + + /** + * This will take a PDDocument and write the text of that document to the print writer. + * + * @param doc + * The document to get the data from. + * + * @throws IOException + * If the doc is in an invalid state. + */ + public void writeText(final PDDocument doc) throws IOException + { + resetEngine(); + + currentPageNo = 0; + document = doc; + startDocument(document); + + processPages(document.getPages()); + endDocument(document); + } + + private void resetEngine() + { + currentPageNo = 0; + document = null; + if (charactersByArticle != null) + { + charactersByArticle.clear(); + } + if (characterListMapping != null) + { + characterListMapping.clear(); + } + } + + /** + * This will process all of the pages and the text that is in them. + * + * @param pages + * The pages object in the document. + * + * @throws IOException + * If there is an error parsing the text. + */ + protected void processPages(PDPageTree pages) throws IOException + { + maxPage = pages.getCount(); + + for (PDPage page : pages) + { + currentPageNo++; + if (page.hasContents()) + { + processPage(page); + } + } + } + + /** + * This will process the contents of a page. + * + * @param page + * The page to process. + * @throws IOException + * If there is an error processing the page. + */ + @Override + public void processPage(final PDPage page) throws IOException + { + if ((currentPageNo >= startPage) && (currentPageNo <= endPage)) { + startPage(startPage, Math.min(maxPage, endPage), currentPageNo, page); + pageArticles = page.getThreadBeads(); + int numberOfArticleSections = 1 + pageArticles.size() * 2; + if (!shouldSeparateByBeads) { + numberOfArticleSections = 1; + } + final int originalSize = charactersByArticle.size(); + charactersByArticle.setSize(numberOfArticleSections); + for (int i = 0; i < numberOfArticleSections; i++) { + if (numberOfArticleSections < originalSize) { + charactersByArticle.get(i).clear(); + } + else { + charactersByArticle.set(i, new ArrayList()); + } + } + + characterListMapping.clear(); + + super.processPage(page); + + // Now we do the real processing + for (int i = 0; i < charactersByArticle.size(); i++) { + processArticle(charactersByArticle.get(i)); + } + + endPage(startPage, endPage, currentPageNo, page); + } + } + + /** + * This method tries do detect headings and paragraphs and line boundaries. + * + * @param textList + * the text. + * @throws IOException + * if there is an error writing to the stream. + */ + protected void processArticle(final List textList) throws IOException + { + // Nothing to do in this article? + if (textList.size() == 0) { + return; + } + + // System.out.println("XScale: "+textList.get(0).getXScale()); + // System.out.println("YScale: "+textList.get(0).getYScale()); + + final int prediction_depth = 10; + Prediction pred = null; + final Block block = new Block(textList, 0); + Line currentLine = null; + + boolean newRegion = false; + Style currentStyle = null; + Style prevStyle = null; + int cur = 0; + while (cur < textList.size()) { + // Initialize the line (if not already done) + if (currentLine == null) { + currentLine = new Line(textList, cur); + + // Get the style for the line (base on style for current + // element) + prevStyle = currentStyle; + currentStyle = getStyle(textList.get(cur)); + + // Test for a style change + if ((newRegion) || (prevStyle != currentStyle)) { + if (newRegion) { + newRegion = false; + } + // On a style change issue the proper events + if (prevStyle != null) { + endRegion(prevStyle); + } + startRegion(currentStyle); + pred = predictGeneralStructure(textList, cur, prediction_depth); + } + } + + // Check if we left the line + if (!currentLine.withinLine(textList.get(cur)) + && !currentLine.isSuperscript(textList.get(cur)) + && !currentLine.isSubscript(textList.get(cur))) { + // We left the line + currentLine = null; + + // Check if we left the region + final boolean columnSwitch = isColumnSwitch(textList.get(cur), block); + final boolean leftIndented = isLeftIndented(textList.get(cur), pred); + final boolean leftOutdented = isLeftOutdented(textList.get(cur), pred); + // boolean fontSwitch = (fontSize[cur] != fontSize[cur-1]); + final boolean vAdjacent = isVerticallyAdjacent(textList.get(cur).getY(), + textList.get(cur - 1).getY(), block.linespacing); + + if (!columnSwitch && !leftIndented && !leftOutdented && + /* !fontSwitch && */vAdjacent) { + // Same region. Issue a line separator and restart + processLineSeparator(); + } + else { + // New region + newRegion = true; + block.reset(cur); + + if ((pred == null) || !vAdjacent) { + pred = predictGeneralStructure(textList, cur, prediction_depth); + } + else if (vAdjacent) { + // If the block is directly adjacent, we may be better + // of + // with the old prediction... let's see if we can get a + // comparatively good new one. + final Prediction new_pred = predictGeneralStructure(textList, cur, + prediction_depth); + final boolean badPred = isSignifiantlyWorse(new_pred.quality, pred.quality, + 0.4); + if (!badPred) { + pred = new_pred; + } + } + } + + continue; // Start again to create a new currentLine + } + + // Ok, we are in the same line still. + + // Let's check if the block is adjacent or needs a space + // if (!isRightAdjacent(textList, cur, cur-1, cur-2)) { + if ((cur > 0) && !isNextChar(textList.get(cur), textList.get(cur - 1))) { + processWordSeparator(); + } + + // Grow the current block to calculate better spacings. + block.grow(cur); + + // Write of the characters and advance. + writeCharacters(textList.get(cur)); + cur++; + } + + // Close region + if (currentStyle != null) { + endRegion(currentStyle); + } + } + + /** + * This will show add a character to the list of characters to be printed to the text file. + * + * @param text + * The description of the character to display. + */ + @Override + protected void processTextPosition(final TextPosition text) + { + boolean showCharacter = true; + if (suppressDuplicateOverlappingText) { + showCharacter = false; + final String textCharacter = text.getUnicode(); + final float textX = text.getX(); + final float textY = text.getY(); + List sameTextCharacters = characterListMapping.get(textCharacter); + if (sameTextCharacters == null) { + sameTextCharacters = new ArrayList(); + characterListMapping.put(textCharacter, sameTextCharacters); + } + + // RDD - Here we compute the value that represents the end of the + // rendered + // text. This value is used to determine whether subsequent text + // rendered + // on the same line overwrites the current text. + // + // We subtract any positive padding to handle cases where extreme + // amounts + // of padding are applied, then backed off (not sure why this is + // done, but there + // are cases where the padding is on the order of 10x the character + // width, and + // the TJ just backs up to compensate after each character). Also, + // we subtract + // an amount to allow for kerning (a percentage of the width of the + // last + // character). + // + boolean suppressCharacter = false; + final float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f; + for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) { + final TextPosition character = sameTextCharacters.get(i); + final String charCharacter = character.getUnicode(); + final float charX = character.getX(); + final float charY = character.getY(); + // only want to suppress + + if (charCharacter != null && + // charCharacter.equals( textCharacter ) && + within(charX, textX, tolerance) && within(charY, textY, tolerance)) { + suppressCharacter = true; + } + } + if (!suppressCharacter && (text.getUnicode() != null) + && (text.getUnicode().length() > 0)) { + sameTextCharacters.add(text); + showCharacter = true; + } + } + + if (showCharacter) { + // if we are showing the character then we need to determine which + // article it belongs to. + int foundArticleDivisionIndex = -1; + int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; + int notFoundButFirstLeftArticleDivisionIndex = -1; + int notFoundButFirstAboveArticleDivisionIndex = -1; + final float x = text.getX(); + final float y = text.getY(); + if (shouldSeparateByBeads) { + for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) { + final PDThreadBead bead = pageArticles.get(i); + if (bead != null) { + final PDRectangle rect = bead.getRectangle(); + if (rect.contains(x, y)) { + foundArticleDivisionIndex = i * 2 + 1; + } + else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) + && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { + notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; + } + else if (x < rect.getLowerLeftX() + && notFoundButFirstLeftArticleDivisionIndex == -1) { + notFoundButFirstLeftArticleDivisionIndex = i * 2; + } + else if (y < rect.getUpperRightY() + && notFoundButFirstAboveArticleDivisionIndex == -1) { + notFoundButFirstAboveArticleDivisionIndex = i * 2; + } + } + else { + foundArticleDivisionIndex = 0; + } + } + } + else { + foundArticleDivisionIndex = 0; + } + int articleDivisionIndex = -1; + if (foundArticleDivisionIndex != -1) { + articleDivisionIndex = foundArticleDivisionIndex; + } + else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { + articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; + } + else if (notFoundButFirstLeftArticleDivisionIndex != -1) { + articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; + } + else if (notFoundButFirstAboveArticleDivisionIndex != -1) { + articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; + } + else { + articleDivisionIndex = charactersByArticle.size() - 1; + } + final List textList = charactersByArticle.get(articleDivisionIndex); + textList.add(text); + } + } + + /** + * This will determine of two floating point numbers are within a specified variance. + * + * @param first + * The first number to compare to. + * @param second + * The second number to compare to. + * @param variance + * The allowed variance. + * @return if the number is within the specified variance. + */ + private static boolean within(final float first, final float second, final float variance) + { + return second > first - variance && second < first + variance; + } + + private static float getWordSpacing(final TextPosition position) + { + if (position == null) { + return 0; + } + + float wordSpacing = 0; + + if (wordSpacing == 0) { + // try to get width of a space character + wordSpacing = position.getWidthOfSpace(); + // if still zero fall back to getting the width of the current + // character + if (wordSpacing == 0) { + wordSpacing = position.getWidth(); + } + } + + return wordSpacing; + } + + private static boolean validPosition(final List textList, final int pos) + { + return (pos >= 0) && (pos < textList.size()); + } + + /** + * Detects whether text in two positions is on the same line. This method is a bit fuzzy so we + * also get potential superscripts and subscripts. + * + * @param cur + * current position. + * @param prev + * previous position. + * @return if both are in the same line. + */ + private static boolean isSameLine(final TextPosition cur, final TextPosition prev) + { + if (cur.getY() == prev.getY()) { + return true; + } + else { + final float prevCenter = prev.getY() + prev.getHeight() / 2.0f; + final float prevHeight = prev.getHeight(); + final float curCenter = cur.getY() + cur.getHeight() / 2.0f; + + final boolean result = Math.abs(curCenter - prevCenter) < (prevHeight * 0.25f); + + // if (!result) { + // _log.debug("sameLine ["+result+"]"+ + // "[px:"+f_y1[prev]+"-"+f_y2[prev]+":"+contents[prev]+"]"+ + // "[cx:"+f_y1[cur]+"-"+f_y2[cur]+":"+contents[cur]+"]"); + // } + + return result; + } + } + + /** + * Tests if two objects are vertically adjacent or if they are so far away from each other that + * they have to be considered different blocks. + * + * @param cur_top + * current top. + * @param prev_top + * previous top. + * @param spacing + * spacing. + * @return if the two objects are verticalla adjacent. + */ + private static boolean isVerticallyAdjacent(final float cur_top, final float prev_top, + final float spacing) + { + /* set vertical error margin */ + final float verterr = (float) (spacing * 1.27); + + final boolean aboveThreshold = (cur_top < (prev_top + verterr)); + final boolean belowprev = (cur_top > prev_top); + + return aboveThreshold && belowprev; + } + + private static boolean isLeftIndented(final TextPosition cur, final Prediction pred) + { + return cur.getX() > (pred.left + (pred.linespacing * 0.2)); + } + + private static boolean isLeftOutdented(final TextPosition cur, final Prediction pred) + { + return cur.getX() < (pred.left - (pred.linespacing * 0.2)); + } + + /** + * Check if the current fragment is in a new column. + * + * @param cur + * current text position. + * @param block + * current block. + * @return if the fragment is in a new column. + */ + private static boolean isColumnSwitch(final TextPosition cur, final Block block) + { + return (cur.getY() < block.top); // && (f_x1[cur] > block.right); + } + + private static boolean isSignifiantlyWorse(final double qnew, final double qold, + final double limit) + { + final double deviation = Math.abs(((qnew - qold) / (qnew + qold))); + final boolean result = (deviation > limit) && (qnew < qold); + // if (_log.isTraceEnabled()) { + // _log.trace("Deviation: "+deviation+ " - "+(result?"BAD":"OK")); + // } + return result; + } + + /** + * Determine whether we need to insert a word separator between the two positions or not. + * + * Adapted from PDFBox PDFTextStripper.flushText() + * + * @param cur + * current position. + * @param prev + * previous position. + * @return if the two positions are immediately adjacent. + */ + private static boolean isNextChar(final TextPosition cur, final TextPosition prev) + { + float lastWordSpacing = getWordSpacing(prev); + final float wordSpacing = getWordSpacing(cur); + float startOfNextWordX; + final float endOfLastTextX = prev.getX() + prev.getWidth(); + + // RDD - We add a conservative approximation for space determination. + // basically if there is a blank area between two characters that is + // equal to some percentage of the word spacing then that will be the + // start of the next word + if (lastWordSpacing <= 0) { + startOfNextWordX = endOfLastTextX + (wordSpacing * 0.50f); + } + else { + startOfNextWordX = endOfLastTextX + (((wordSpacing + lastWordSpacing) / 2f) * 0.50f); + } + + lastWordSpacing = wordSpacing; + + // if (startOfNextWordX > cur.getX()) { + // System.out.print("{O:"+(startOfNextWordX - cur.getX())+"}"); + // } + + if (startOfNextWordX != -1 && startOfNextWordX < cur.getX() && prev != null && + // only bother adding a space if the last character was not a + // space + prev.getUnicode() != null && !prev.getUnicode().endsWith(" ")) { + return false; + } + else { + return true; + } + } + + private List collectLines(final List textList, final int blk_start, + final int depth) + { + final ArrayList lines = new ArrayList(depth); + Line l = new Line(textList, blk_start); + lines.add(l); + for (int i = 1; i < depth && l.hasNextLine(); i++) { + l = l.getNextLine(); + + // Bail out if we have a potential column switch + if (l.top < lines.get(lines.size() - 1).bottom) { + break; + } + lines.add(l); + } + return lines; + } + + /** + * Return a block with the probable linespacing, lineheight and left and right borders. + * + * @param textList + * text. + * @param blk_start + * block start. + * @param depth + * depth. + * @return structure prediction. + */ + private Prediction predictGeneralStructure(final List textList, + final int blk_start, final int depth) + { + // Try to fetch the next lines up to depth + final List lines = collectLines(textList, blk_start, depth); + + // Calculate the line block parameters + LineBlock lb = new LineBlock(lines); + + // Iterate once more over the lines because we may have a big spacing + // indicating a new block. + + final List lines2 = new ArrayList(depth); + final Line l = lines.get(0); + lines2.add(l); + for (int i = 1; i < lines.size(); i++) { + // Bail out if we have too much distance + if (!isVerticallyAdjacent(lines.get(i).top, lines.get(i - 1).top, lb.linespacing)) { + break; + } + lines2.add(lines.get(i)); + } + + // Get the bounds in buckets + final Buckets left_buckets = new Buckets(lb.linespacing * 0.1); + final Buckets right_buckets = new Buckets(lb.linespacing * 0.1); + for (final Line ln : lines2) { + left_buckets.put(ln.left); + right_buckets.put(ln.right); + } + + // if (_log.isTraceEnabled()) { + // _log.trace("Left: size:"+left_buckets.getBest().size()+" - lines:"+lines2.size()+" - + // depth:"+depth); + // } + + lb = new LineBlock(lines2); + + // Return values + final Prediction result = new Prediction(); + result.linespacing = lb.linespacing; + result.lineheight = lb.avglineheight; + result.left = (float) left_buckets.getBest().getValue(); + result.right = (float) right_buckets.getBest().getValue(); + result.quality = (float) left_buckets.getBest().size() / (float) depth; + + return result; + } + + protected Style getStyle(final TextPosition pos) + { + if ((pos.getFontSize() * pos.getYScale()) > 14) { + return Style.HEADING; + } + else { + return Style.PARAGRAPH; + } + } + + /** + * This method is available for subclasses of this class. It will be called before processing of + * the document start. + * + * @param pdf + * The PDF document that is being processed. + * @throws IOException + * If an IO error occurs. + */ + protected abstract void startDocument(PDDocument pdf) throws IOException; + + /** + * This method is available for subclasses of this class. It will be called after processing of + * the document finishes. + * + * @param pdf + * The PDF document that is being processed. + * @throws IOException + * If an IO error occurs. + */ + protected abstract void endDocument(PDDocument pdf) throws IOException; + + /** + * Start a new region. + * + * @param style + * the style. + * @throws IOException + * If there is any error writing to the stream. + */ + protected abstract void startRegion(Style style) throws IOException; + + /** + * End a region. + * + * @param style + * the style. + * @throws IOException + * If there is any error writing to the stream. + */ + protected abstract void endRegion(Style style) throws IOException; + + /** + * Start a new page. + * + * @param firstPage + * first page. + * @param lastPage + * last page. + * @param currentPage + * current page. + * @param page + * The page we are about to process. + * + * @throws IOException + * If there is any error writing to the stream. + */ + protected abstract void startPage(int firstPage, int lastPage, int currentPage, PDPage page) + throws IOException; + + /** + * End a page. + * + * @param firstPage + * first page. + * @param lastPage + * last page. + * @param currentPage + * current page. + * @param page + * The page we are about to process. + * + * @throws IOException + * If there is any error writing to the stream. + */ + protected abstract void endPage(int firstPage, int lastPage, int currentPage, PDPage page) + throws IOException; + + protected abstract void processLineSeparator() throws IOException; + + protected abstract void processWordSeparator() throws IOException; + + /** + * Write the string to the output stream. + * + * @param text + * The text to write to the stream. + * @throws IOException + * If there is an error when writing the text. + */ + protected abstract void writeCharacters(TextPosition text) throws IOException; + + /** + * This is the page that the text extraction will start on. The pages start at page 1. For + * example in a 5 page PDF document, if the start page is 1 then all pages will be extracted. If + * the start page is 4 then pages 4 and 5 will be extracted. The default value is 1. + * + * @return Value of property startPage. + */ + public int getStartPage() + { + return startPage; + } + + /** + * This will set the first page to be extracted by this class. + * + * @param startPageValue + * New value of property startPage. + */ + public void setStartPage(final int startPageValue) + { + startPage = startPageValue; + } + + /** + * This will get the last page that will be extracted. This is inclusive, for example if a 5 + * page PDF an endPage value of 5 would extract the entire document, an end page of 2 would + * extract pages 1 and 2. This defaults to Integer.MAX_VALUE such that all pages of the pdf will + * be extracted. + * + * @return Value of property endPage. + */ + public int getEndPage() + { + return endPage; + } + + /** + * This will set the last page to be extracted by this class. + * + * @param endPageValue + * New value of property endPage. + */ + public void setEndPage(final int endPageValue) + { + endPage = endPageValue; + } + + /** + * @return Returns the suppressDuplicateOverlappingText. + */ + public boolean shouldSuppressDuplicateOverlappingText() + { + return suppressDuplicateOverlappingText; + } + + /** + * Get the current page number that is being processed. + * + * @return A 1 based number representing the current page. + */ + protected int getCurrentPageNo() + { + return currentPageNo; + } + + /** + * Character strings are grouped by articles. It is quite common that there will only be a + * single article. This returns a List that contains List objects, the inner lists will contain + * TextPosition objects. + * + * @return A double List of TextPositions for all text strings on the page. + */ + protected List> getCharactersByArticle() + { + return charactersByArticle; + } + + /** + * By default the text stripper will attempt to remove text that overlapps each other. Word + * paints the same character several times in order to make it look bold. By setting this to + * false all text will be extracted, which means that certain sections will be duplicated, but + * better performance will be noticed. + * + * @param suppressDuplicateOverlappingTextValue + * The suppressDuplicateOverlappingText to set. + */ + public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) + { + this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; + } + + /** + * This will tell if the text stripper should separate by beads. + * + * @return If the text will be grouped by beads. + */ + public boolean shouldSeparateByBeads() + { + return shouldSeparateByBeads; + } + + /** + * Set if the text stripper should group the text output by a list of beads. The default value + * is true! + * + * @param aShouldSeparateByBeads + * The new grouping of beads. + */ + public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) + { + this.shouldSeparateByBeads = aShouldSeparateByBeads; + } + + static class LineBlock + { + final List lines; + final float linespacing; + final float avglineheight; + + LineBlock(final List ls) + { + lines = ls; + linespacing = calcLinespacing(); + avglineheight = calcAvgLineheight(); + } + + float calcLinespacing() + { + if (lines.size() == 1) { + return Math.abs(lines.get(0).top - lines.get(0).bottom); + } + + float avgls = 0.0f; + for (int i = 0; i < (lines.size() - 1); i++) { + avgls += Math.abs(lines.get(i).top - lines.get(i + 1).top); + } + return avgls / (lines.size() - 1); + } + + private float calcAvgLineheight() + { + float avglh = 0.0f; + for (final Line l : lines) { + avglh += l.lineheight; + } + return avglh / lines.size(); + } + } + + static class Prediction + { + float lineheight; + float linespacing; + float left; + float right; + float quality; + } + + static class Line + extends BasicBlock + { + final int start; + final int end; + final float lineheight; + + Line(final List tl, final int pos) + { + super(tl); + start = pos; + end = findEnd(); + lineheight = growAndCalcLineheight(); + } + + private float growAndCalcLineheight() + { + float h = textList.get(start).getHeight(); + reset(start); + for (int i = start + 1; i < end; i++) { + h = Math.max(h, textList.get(i).getHeight()); + grow(i); + } + return h; + } + + private int findEnd() + { + int cur = start; + while (validPosition(textList, cur) + && isSameLine(textList.get(cur), textList.get(start))) { + cur++; + } + return cur; + } + + boolean hasNextLine() + { + return validPosition(textList, end); + } + + Line getNextLine() + { + if (hasNextLine()) { + return new Line(textList, end); + } + else { + return null; + } + } + + /** + * Return true if the text position is within the line height boundaries. Left and right + * boundaries are not checked. + * + * @param pos + * text position. + * @return if the position is within the line. + */ + boolean withinLine(final TextPosition pos) + { + final boolean underTop = top <= pos.getY(); + final boolean overBottom = (pos.getY() + pos.getHeight()) <= bottom; + return underTop && overBottom; + } + + boolean isSuperscript(final TextPosition pos) + { + final boolean underTop = (top - lineheight * 0.6f) <= pos.getY(); + final boolean overBottom = (pos.getY() + pos.getHeight()) <= bottom; + return underTop && overBottom; + } + + boolean isSubscript(final TextPosition pos) + { + final boolean underTop = (top <= pos.getY()); + final boolean overBottom = (pos.getY() + pos.getHeight() + lineheight * 0.6f) <= bottom; + return underTop && overBottom; + } + + @Override + public String toString() + { + return "[t:" + top + " b:" + bottom + "|" + content + "]"; + } + } + + static class BasicBlock + { + float left; + float top; + float right; + float bottom; + int lines; + int last_pos; + final List textList; + + // This is for debugging purposes only. + final StringBuilder content = new StringBuilder(); + + public BasicBlock(final List tl) + { + textList = tl; + } + + float getValue(final Values v) + { + switch (v) { + case BOTTOM: + return bottom; + case TOP: + return top; + case RIGHT: + return right; + case LEFT: + return left; + default: + throw new IllegalArgumentException("Unsupported value"); + } + } + + void normalize() + { + if (top < bottom) { + final float b = top; + top = bottom; + bottom = b; + } + + if (left > right) { + final float l = left; + left = right; + right = l; + } + } + + void reset(final int pos) + { + final TextPosition p = textList.get(pos); + + last_pos = pos; + lines = 0; + left = p.getX(); + right = p.getX() + p.getWidth(); + top = p.getY(); + bottom = p.getY() + p.getHeight(); + + content.setLength(0); + content.append(p.getUnicode()); + } + + void grow(final int pos) + { + final TextPosition p = textList.get(pos); + + if (!isSameLine(p, textList.get(last_pos))) { + lines++; + } + + last_pos = pos; + left = Math.min(p.getX(), left); + right = Math.max(p.getX() + p.getWidth(), right); + top = Math.min(p.getY(), top); + bottom = Math.max(p.getY() + p.getHeight(), bottom); + + content.append(" "); + content.append(p.getUnicode()); + } + } + + class Block + extends BasicBlock + { + float linespacing; + float lineheight; + + Block(final List textList, final int pos) + { + super(textList); + reset(pos); + } + + @Override + void reset(final int pos) + { + super.reset(pos); + linespacing = new LineBlock(collectLines(textList, pos, 3)).linespacing; + lineheight = Math.abs(bottom - top); + } + + @Override + void grow(final int pos) + { + super.grow(pos); + lineheight = Math.max(lineheight, textList.get(pos).getHeight()); + } + } +} diff --git a/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/SubstitutionTrieParser.java b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/SubstitutionTrieParser.java new file mode 100644 index 0000000000..e9cd7649a2 --- /dev/null +++ b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/SubstitutionTrieParser.java @@ -0,0 +1,71 @@ +/* + * Copyright 2009, Richard Eckart de Castilho + * Copyright 2012, Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pdf.internal; + +import java.io.IOException; +import java.io.InputStream; + +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.helpers.XMLReaderFactory; + +public class SubstitutionTrieParser + extends DefaultHandler +{ + private final Trie _trie; + + private SubstitutionTrieParser(final Trie trie) + { + _trie = trie; + } + + @Override + public void startElement(final String uri, final String localName, final String qName, + final Attributes attributes) + throws SAXException + { + if (localName.equals("substitution")) { + _trie.put( + attributes.getValue("orig"), + attributes.getValue("subst")); + } + } + + public static Trie parse(final InputStream is) throws IOException + { + final Trie trie = new Trie(); + parse(is, trie); + return trie; + } + + public static void parse(final InputStream is, final Trie trie) throws IOException + { + try { + final XMLReader xr = XMLReaderFactory.createXMLReader(); + final SubstitutionTrieParser sp = new SubstitutionTrieParser(trie); + xr.setContentHandler(sp); + xr.parse(new InputSource(is)); + } + catch (final SAXException e) { + throw new IOException(e); + } + } +}; diff --git a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Trie.java b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Trie.java similarity index 99% rename from dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Trie.java rename to dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Trie.java index 8fd72c24b0..ef032781fb 100644 --- a/dkpro-core-io-pdf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/Trie.java +++ b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/internal/Trie.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.pdf; +package org.dkpro.core.io.pdf.internal; import java.util.ArrayList; import java.util.Collection; @@ -364,7 +364,8 @@ public Iterator keyIterator() * @param vals * the found key values. */ - private void keys(final Character c, final Node n, final StringBuilder b, final Set vals) + private void keys(final Character c, final Node n, final StringBuilder b, + final Set vals) { b.append(c); diff --git a/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/package-info.java b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/package-info.java new file mode 100644 index 0000000000..c97e40cb3f --- /dev/null +++ b/dkpro-core-io-pdf-asl/src/main/java/org/dkpro/core/io/pdf/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for PDF files (read-only). + * + * @since 1.1.0 + */ +package org.dkpro.core.io.pdf; diff --git a/dkpro-core-io-pdf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfReaderTest.java b/dkpro-core-io-pdf-asl/src/test/java/org/dkpro/core/io/pdf/PdfReaderTest.java similarity index 88% rename from dkpro-core-io-pdf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfReaderTest.java rename to dkpro-core-io-pdf-asl/src/test/java/org/dkpro/core/io/pdf/PdfReaderTest.java index 883794b89e..8c842d2e87 100644 --- a/dkpro-core-io-pdf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/pdf/PdfReaderTest.java +++ b/dkpro-core-io-pdf-asl/src/test/java/org/dkpro/core/io/pdf/PdfReaderTest.java @@ -1,65 +1,66 @@ -/* - * Copyright 2010 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.pdf; - -import static org.apache.commons.io.FileUtils.readFileToString; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; -import static org.junit.Assert.assertEquals; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.EOLUtils; -import de.tudarmstadt.ukp.dkpro.core.testing.dumper.CasDumpWriter; - -public class PdfReaderTest -{ - @Test - public void test() - throws Exception - { - File outputFile = new File(testContext.getTestOutputFolder(), "dump-output.txt"); - - CollectionReader reader = createReader(PdfReader.class, - PdfReader.PARAM_SOURCE_LOCATION, "src/test/resources/data", - PdfReader.PARAM_PATTERNS, "[+]**/*.pdf"); - - AnalysisEngine writer = createEngine(CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, outputFile); - - SimplePipeline.runPipeline(reader, writer); - - String reference = readFileToString(new File("src/test/resources/reference/test.dump"), - "UTF-8").trim(); - String actual = readFileToString(outputFile, "UTF-8").trim(); - - actual = EOLUtils.normalizeLineEndings(actual); - reference = EOLUtils.normalizeLineEndings(reference); - - assertEquals(reference, actual); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} +/* + * Copyright 2010 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pdf; + +import static org.apache.commons.io.FileUtils.readFileToString; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.junit.Assert.assertEquals; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.pdf.PdfReader; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.EOLUtils; +import org.dkpro.core.testing.dumper.CasDumpWriter; +import org.junit.Rule; +import org.junit.Test; + +public class PdfReaderTest +{ + @Test + public void test() + throws Exception + { + File outputFile = new File(testContext.getTestOutputFolder(), "dump-output.txt"); + + CollectionReader reader = createReader(PdfReader.class, + PdfReader.PARAM_SOURCE_LOCATION, "src/test/resources/data", + PdfReader.PARAM_PATTERNS, "[+]**/*.pdf"); + + AnalysisEngine writer = createEngine(CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, outputFile); + + SimplePipeline.runPipeline(reader, writer); + + String reference = readFileToString(new File("src/test/resources/reference/test.dump"), + "UTF-8").trim(); + String actual = readFileToString(outputFile, "UTF-8").trim(); + + actual = EOLUtils.normalizeLineEndings(actual); + reference = EOLUtils.normalizeLineEndings(reference); + + assertEquals(reference, actual); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-pdf-asl/src/test/resources/log4j.properties b/dkpro-core-io-pdf-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-pdf-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-pdf-asl/src/test/resources/log4j2.xml b/dkpro-core-io-pdf-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-pdf-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-pdf-asl/src/test/resources/reference/test.dump b/dkpro-core-io-pdf-asl/src/test/resources/reference/test.dump index a22f3efcfb..02891e07c2 100644 --- a/dkpro-core-io-pdf-asl/src/test/resources/reference/test.dump +++ b/dkpro-core-io-pdf-asl/src/test/resources/reference/test.dump @@ -20,18 +20,16 @@ Dies ist ein Test. Word with Umlauts (äöü) may require the substitution table. [This is a test. -Dies ist ein Test. -] +Dies ist ein Test.] Paragraph sofa: _InitialView begin: 0 - end: 35 -[Überschrift -] + end: 34 +[Überschrift] Heading sofa: _InitialView begin: 36 - end: 48 + end: 47 [Word with Umlauts (äöü) may require the substitution table.] Paragraph sofa: _InitialView diff --git a/dkpro-core-io-penntree-asl/pom.xml b/dkpro-core-io-penntree-asl/pom.xml index 95fa1ba204..e078d647a7 100644 --- a/dkpro-core-io-penntree-asl/pom.xml +++ b/dkpro-core-io-penntree-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.penntree-asl + dkpro-core-io-penntree-asl jar DKPro Core ASL - IO - Penn Treebank Format + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,28 +45,32 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -73,17 +78,17 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT pom import diff --git a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeNode.java b/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeNode.java deleted file mode 100644 index 69ca303750..0000000000 --- a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeNode.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; - -import java.util.ArrayList; -import java.util.List; - -/** - */ -public class PennTreeNode -{ - private PennTreeNode parent; - private String label; - private List children = new ArrayList(); - - public PennTreeNode getParent() - { - return parent; - } - - public void setParent(PennTreeNode aParent) - { - parent = aParent; - } - - public String getLabel() - { - return label; - } - - public void setLabel(String aLabel) - { - label = aLabel; - } - - public List getChildren() - { - return children; - } - - public void setChildren(List aChildren) - { - children = aChildren; - } - - public void addChild(PennTreeNode aNode) - { - aNode.setParent(this); - children.add(aNode); - } - - public boolean isPreTerminal() - { - return children.size() == 1 && children.get(0).isTerminal(); - } - - public boolean isTerminal() - { - return children.isEmpty(); - } - - @Override - public String toString() - { - return PennTreeUtils.toPennTree(this); - } -} diff --git a/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeNode.java b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeNode.java new file mode 100644 index 0000000000..e7cf93cb78 --- /dev/null +++ b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeNode.java @@ -0,0 +1,82 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.penntree; + +import java.util.ArrayList; +import java.util.List; + +/** + */ +public class PennTreeNode +{ + private PennTreeNode parent; + private String label; + private List children = new ArrayList(); + + public PennTreeNode getParent() + { + return parent; + } + + public void setParent(PennTreeNode aParent) + { + parent = aParent; + } + + public String getLabel() + { + return label; + } + + public void setLabel(String aLabel) + { + label = aLabel; + } + + public List getChildren() + { + return children; + } + + public void setChildren(List aChildren) + { + children = aChildren; + } + + public void addChild(PennTreeNode aNode) + { + aNode.setParent(this); + children.add(aNode); + } + + public boolean isPreTerminal() + { + return children.size() == 1 && children.get(0).isTerminal(); + } + + public boolean isTerminal() + { + return children.isEmpty(); + } + + @Override + public String toString() + { + return PennTreeUtils.toPennTree(this); + } +} diff --git a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeToJCasConverter.java b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeToJCasConverter.java similarity index 94% rename from dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeToJCasConverter.java rename to dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeToJCasConverter.java index f55cca55e0..ac9a734417 100644 --- a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeToJCasConverter.java +++ b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeToJCasConverter.java @@ -15,13 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; -import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.trim; -import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.unescapeToken; import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.uima.fit.util.FSCollectionFactory.createFSArray; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.io.penntree.PennTreeUtils.trim; +import static org.dkpro.core.io.penntree.PennTreeUtils.unescapeToken; import java.util.ArrayList; import java.util.HashMap; @@ -32,10 +32,10 @@ import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; @@ -48,7 +48,7 @@ public class PennTreeToJCasConverter private boolean writeTracesToText; private boolean createPosTags; - private boolean internTags; + private boolean internTags = true; private String rootLabel = ROOT; private MappingProvider posMappingProvider; @@ -260,7 +260,7 @@ private Constituent convertPennTree(JCas aJCas, PennTreeNode aNode, } constituent.setBegin(children.get(0).getBegin()); - constituent.setEnd(children.get(children.size()-1).getEnd()); + constituent.setEnd(children.get(children.size() - 1).getEnd()); constituent.setChildren(createFSArray(aJCas, children)); constituent.setParent(parent); constituent.addToIndexes(); @@ -285,8 +285,9 @@ private POS createPOS(JCas aJCas, PennTreeNode aPreterminal, int aBegin, int aEn else { posAnno = new POS(aJCas, aBegin, aEnd); } - posAnno.setPosValue(internTags ? aPreterminal.getLabel().intern() : aPreterminal - .getLabel()); + posAnno.setPosValue( + internTags && aPreterminal.getLabel() != null ? aPreterminal.getLabel().intern() + : aPreterminal.getLabel()); POSUtils.assignCoarseValue(posAnno); posAnno.addToIndexes(); return posAnno; @@ -302,8 +303,8 @@ private Constituent createConstituent(JCas aJCas, String aLabel) Constituent constituentAnno; if (constituentMappingProvider != null) { Type constituentTag = constituentMappingProvider.getTagType(label[0]); - // We just set a dummy value for the offsets here. These need to be fixed when we know the - // children and before addToIndexes() is called. + // We just set a dummy value for the offsets here. These need to be fixed when we + // know the children and before addToIndexes() is called. constituentAnno = (Constituent) aJCas.getCas().createAnnotation(constituentTag, 0, 0); } else { diff --git a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeUtils.java b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeUtils.java similarity index 96% rename from dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeUtils.java rename to dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeUtils.java index 8881fa3975..499f916886 100644 --- a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeUtils.java +++ b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreeUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; import static java.util.Collections.singletonList; import static org.apache.uima.fit.util.FSCollectionFactory.create; @@ -31,6 +31,8 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.mutable.MutableInt; import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.tcas.Annotation; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; @@ -62,7 +64,7 @@ public static PennTreeNode convertPennTree(Constituent aConstituent) } List children = new ArrayList(); - for (FeatureStructure c : create(aConstituent.getChildren())) { + for (FeatureStructure c : create((FSArray) aConstituent.getChildren())) { if (c instanceof Constituent) { children.add(convertPennTree((Constituent) c)); } @@ -229,7 +231,7 @@ private static void toPennTree(StringBuilder aSb, PennTreeNode aNode, int aLevel } else if (indentationEnabled && prevChild != null && !prevChild.isPreTerminal()) { aSb.append('\n'); - aSb.append(StringUtils.repeat(" ", (aLevel+1) * 2)); + aSb.append(StringUtils.repeat(" ", (aLevel + 1) * 2)); } else { aSb.append(' '); @@ -253,11 +255,11 @@ else if (indentationEnabled && prevChild != null && !prevChild.isPreTerminal()) public static void trim(CharSequence aText, int[] aSpan) { int begin = aSpan[0]; - int end = aSpan[1]-1; + int end = aSpan[1] - 1; CharSequence data = aText; while ( - (begin < (data.length()-1)) + (begin < (data.length() - 1)) && trimChar(data.charAt(begin)) ) { begin ++; diff --git a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankChunkedReader.java b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankChunkedReader.java similarity index 90% rename from dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankChunkedReader.java rename to dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankChunkedReader.java index 9150d77494..89f69ef2f9 100644 --- a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankChunkedReader.java +++ b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankChunkedReader.java @@ -15,7 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; + +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedReader; import java.io.IOException; @@ -35,22 +37,23 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Penn Treebank chunked format reader. */ -@ResourceMetaData(name="Penn Treebank Chunked Format Reader") +@ResourceMetaData(name = "Penn Treebank Chunked Format Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_PTB_CHUNKED}) @TypeCapability( outputs = { @@ -62,10 +65,19 @@ public class PennTreebankChunkedReader extends JCasResourceCollectionReader_ImplBase { + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; @@ -123,8 +135,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); } @Override @@ -203,7 +215,8 @@ else if (token_tag.length < 2) { // in ambiguous cases a token might have two or more part of // speech tags. We take the first one named and ignore the other // ones - tag = selectFirstTagIfTokenIsAmbiguousInContextAndSeveralAcceptableOnesExist(tag); + tag = selectFirstTagIfTokenIsAmbiguousInContextAndSeveralAcceptableOnesExist( + tag); // A corpus might contain two pos tags for a word if it is // misspelled in the source material. 'The students dormitory' @@ -299,7 +312,8 @@ private boolean wordsAreConnectedByForwardSlash(String aTwt) return aTwt.contains("\\/"); } - private String annotateSenenceTokenPosTypes(JCas aJCas, List aTokens, List aTags) + private String annotateSenenceTokenPosTypes(JCas aJCas, List aTokens, + List aTags) { StringBuilder textString = new StringBuilder(); int sentStart = 0; diff --git a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReader.java b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankCombinedReader.java similarity index 83% rename from dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReader.java rename to dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankCombinedReader.java index 3b1e6eef1e..73127c8fb3 100644 --- a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReader.java +++ b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankCombinedReader.java @@ -15,7 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; + +import static org.dkpro.core.api.resources.MappingProviderFactory.createConstituentMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.IOException; import java.io.InputStream; @@ -32,19 +35,20 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Penn Treebank combined format reader. */ -@ResourceMetaData(name="Penn Treebank Combined Format Reader") +@ResourceMetaData(name = "Penn Treebank Combined Format Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_PTB_COMBINED}) @TypeCapability( outputs = { @@ -60,7 +64,8 @@ public class PennTreebankCombinedReader * Name of configuration parameter that contains the character encoding used by the input files. */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; /** @@ -72,19 +77,9 @@ public class PennTreebankCombinedReader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - /** * Sets whether to create or not to create POS tags. The creation of * constituent tags must be turned on for this to work. - * - *

Default: {@code true}

*/ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") @@ -95,32 +90,47 @@ public class PennTreebankCombinedReader * tag set defined as part of the model meta data. This can be useful if a custom model is * specified which does not have such meta data, or it can be used in readers. */ - public static final String PARAM_CONSTITUENT_TAG_SET = ComponentParameters.PARAM_CONSTITUENT_TAG_SET; + public static final String PARAM_CONSTITUENT_TAG_SET = + ComponentParameters.PARAM_CONSTITUENT_TAG_SET; @ConfigurationParameter(name = PARAM_CONSTITUENT_TAG_SET, mandatory = false) protected String constituentTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + /** * Load the constituent tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; + public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = + ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) protected String constituentMappingLocation; /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - *

Default: {@code true}

+ * Whether to remove traces from the parse tree. */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - public static final String PARAM_REMOVE_TRACES = "removeTraces"; @ConfigurationParameter(name = PARAM_REMOVE_TRACES, mandatory = false, defaultValue = "true") private boolean removeTraces; + /** + * Whether to render traces into the document text. + */ public static final String PARAM_WRITE_TRACES_TO_TEXT = "writeTracesToText"; @ConfigurationParameter(name = PARAM_WRITE_TRACES_TO_TEXT, mandatory = false, defaultValue = "false") private boolean writeTracesToText; @@ -140,14 +150,13 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); - - constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider( + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); + + constituentMappingProvider = createConstituentMappingProvider(this, constituentMappingLocation, constituentTagset, getLanguage()); converter = new PennTreeToJCasConverter(posMappingProvider, constituentMappingProvider); - converter.setInternTags(internTags); converter.setWriteTracesToText(writeTracesToText); converter.setCreatePosTags(createPosTags); } diff --git a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedWriter.java b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankCombinedWriter.java similarity index 77% rename from dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedWriter.java rename to dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankCombinedWriter.java index 75f50e820b..787676e868 100644 --- a/dkpro-core-io-penntree-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedWriter.java +++ b/dkpro-core-io-penntree-asl/src/main/java/org/dkpro/core/io/penntree/PennTreebankCombinedWriter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; import static org.apache.uima.fit.util.JCasUtil.select; @@ -29,16 +29,18 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Penn Treebank combined format writer. */ -@ResourceMetaData(name="Penn Treebank Combined Format Writer") +@ResourceMetaData(name = "Penn Treebank Combined Format Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_PTB_COMBINED}) @TypeCapability( inputs = { @@ -54,21 +56,31 @@ public class PennTreebankCombinedWriter * Specify the suffix of output files. Default value .mrg. If the suffix is not * needed, provide an empty string as value. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".mrg") private String filenameSuffix; /** * Character encoding of the output data. */ - public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + public static final String PARAM_TARGET_ENCODING = + ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; + /** + * Whether to force the root label to be empty. + */ public static final String PARAM_EMPTY_ROOT_LABEL = "emptyRootLabel"; @ConfigurationParameter(name = PARAM_EMPTY_ROOT_LABEL, mandatory = true, defaultValue = "false") private boolean emptyRootLabel; + /** + * Whether to remove the root node. This is only possible if the root node has only a single + * child (i.e. a sentence node). + */ public static final String PARAM_NO_ROOT_LABEL = "noRootLabel"; @ConfigurationParameter(name = PARAM_NO_ROOT_LABEL, mandatory = true, defaultValue = "false") private boolean noRootLabel = false; @@ -77,7 +89,8 @@ public class PennTreebankCombinedWriter public void process(JCas aJCas) throws AnalysisEngineProcessException { - try (Writer docOS = new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), targetEncoding)) { + try (Writer docOS = new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), + targetEncoding)) { for (ROOT root : select(aJCas, ROOT.class)) { PennTreeNode tree = PennTreeUtils.convertPennTree(root); @@ -87,7 +100,8 @@ public void process(JCas aJCas) if (noRootLabel) { if (tree.getChildren().size() > 1) { - throw new IllegalStateException("Cannot remove ROOT not that has more than one child: " + tree); + throw new IllegalStateException( + "Cannot remove ROOT not that has more than one child: " + tree); } if (tree.getChildren().isEmpty()) { continue; diff --git a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeUtilsTest.java b/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeUtilsTest.java deleted file mode 100644 index 7bfa81cf80..0000000000 --- a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeUtilsTest.java +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; - -import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.convertPennTree; -import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.parsePennTree; -import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.selectDfs; -import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.toPennTree; -import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.toPrettyPennTree; -import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.toText; -import static org.apache.uima.fit.util.JCasUtil.selectSingle; -import static org.junit.Assert.assertEquals; - -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -/** - */ -public class PennTreeUtilsTest -{ - @Test - public void testParseSerialize() - { - doTest("(S (NP a) (VP b) (PUNC .))"); - doTest("(ROOT (S (NP (PRP It)) (VP (VBZ is) (PP (IN for) (NP (NP (DT this) (NN reason)) " + - "(SBAR (IN that) (S (NP (NN deconstruction)) (VP (VP (VBZ remains) (NP (NP " + - "(DT a) (JJ fundamental) (NN threat)) (PP (TO to) (NP (NNP Marxism))))) (, ,) " + - "(CC and) (VP (PP (IN by) (NP (NP (NN implication)) (PP (TO to) (NP (JJ other) " + - "(ADJP (JJ culturalist) (CC and) (JJ contextualizing)) " + - "(NNS approaches)))))))))))) (. .)))"); - } - - @Test - public void testPrettySerialize() - { - String tree = "(ROOT\n" + - " (S\n" + - " (S\n" + - " (NP\n" + - " (NP (DT The) (JJS strongest) (NN rain))\n" + - " (VP\n" + - " (ADVP (RB ever))\n" + - " (VBN recorded)\n" + - " (PP (IN in)\n" + - " (NP (NNP India)))))\n" + - " (VP\n" + - " (VP (VBD shut)\n" + - " (PRT (RP down))\n" + - " (NP\n" + - " (NP (DT the) (JJ financial) (NN hub))\n" + - " (PP (IN of)\n" + - " (NP (NNP Mumbai)))))\n" + - " (, ,)\n" + - " (VP (VBD snapped)\n" + - " (NP (NN communication) (NNS lines)))\n" + - " (, ,)\n" + - " (VP (VBD closed)\n" + - " (NP (NNS airports)))\n" + - " (CC and)\n" + - " (VP (VBD forced)\n" + - " (NP\n" + - " (NP (NNS thousands))\n" + - " (PP (IN of)\n" + - " (NP (NNS people))))\n" + - " (S\n" + - " (VP (TO to)\n" + - " (VP\n" + - " (VP (VB sleep)\n" + - " (PP (IN in)\n" + - " (NP (PRP$ their) (NNS offices))))\n" + - " (CC or)\n" + - " (VP (VB walk)\n" + - " (NP (NN home))\n" + - " (PP (IN during)\n" + - " (NP (DT the) (NN night))))))))))\n" + - " (, ,)\n" + - " (NP (NNS officials))\n" + - " (VP (VBD said)\n" + - " (NP-TMP (NN today)))\n" + - " (. .)))"; - - PennTreeNode n = parsePennTree(tree); - String actual = toPrettyPennTree(n); - - assertEquals(tree, actual); - } - - private static void doTest(String aBracket) - { - String expected = aBracket; - - PennTreeNode n = parsePennTree(expected); - - String actual = n.toString(); - - assertEquals(expected, actual); - } - - @Test - @Ignore("No asserts yet!") - public void testSelectDfs() - { - PennTreeNode n = parsePennTree( - "(ROOT (S (NP (PRP It)) (VP (VBZ is) (PP (IN for) (NP (NP (DT this) (NN reason)) " + - "(SBAR (IN that) (S (NP (NN deconstruction)) (VP (VP (VBZ remains) (NP (NP " + - "(DT a) (JJ fundamental) (NN threat)) (PP (TO to) (NP (NNP Marxism))))) (, ,) " + - "(CC and) (VP (PP (IN by) (NP (NP (NN implication)) (PP (TO to) (NP (JJ other) " + - "(ADJP (JJ culturalist) (CC and) (JJ contextualizing)) " + - "(NNS approaches)))))))))))) (. .)))"); - System.out.println(selectDfs(n, 1)); - System.out.println(selectDfs(n, 2)); - System.out.println(selectDfs(n, 3)); - System.out.println(selectDfs(n, 4)); - System.out.println(selectDfs(n, 5)); - System.out.println(selectDfs(n, 6)); - System.out.println(selectDfs(n, 7)); - System.out.println(selectDfs(n, 8)); - System.out.println(selectDfs(n, 9)); - System.out.println(selectDfs(n, 10)); - System.out.println(selectDfs(n, 11)); - System.out.println(selectDfs(n, 12)); - } - - @Test - public void testFromUimaConversion() - throws Exception - { - String documentEnglish = - "It is for this reason that deconstruction remains a ( fundamental ) threat to " + - "Marxism , and by implication to other culturalist and contextualizing " + - "approaches ."; - - String pennTree = "(ROOT (S (S (NP (PRP It)) (VP (VBZ is) (PP (IN for) (NP (DT this) " - + "(NN reason))) (SBAR (IN that) (S (NP (NN deconstruction)) (VP (VBZ remains) " - + "(NP (NP (DT a) (PRN (-LRB- -LRB-) (NN fundamental) (-RRB- -RRB-)) (NN threat)) " - + "(PP (TO to) (NP (NNP Marxism))))))))) (, ,) (CC and) (S (PP (IN by) (NP " - + "(NN implication))) (PP (TO to) (NP (NP (JJ other) (NN culturalist)) (CC and) " - + "(NP (VBG contextualizing) (NNS approaches))))) (. .)))"; - - PennTreeToJCasConverter converter = new PennTreeToJCasConverter(null, null); - converter.setInternTags(true); - converter.setWriteTracesToText(false); - converter.setCreatePosTags(true); - converter.setRootLabel("ROOT"); - - JCas jcas = JCasFactory.createJCas(); - - StringBuilder text = new StringBuilder(); - converter.convertPennTree(jcas, text, PennTreeUtils.parsePennTree(pennTree)); - jcas.setDocumentText(text.toString()); - - ROOT root = selectSingle(jcas, ROOT.class); - PennTreeNode r = convertPennTree(root); - - assertEquals(documentEnglish.trim(), toText(r).trim()); - AssertAnnotations.assertPennTree(pennTree, toPennTree(r)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeToJCasConverterTest.java b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreeToJCasConverterTest.java similarity index 84% rename from dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeToJCasConverterTest.java rename to dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreeToJCasConverterTest.java index 3156ac9e16..641202866a 100644 --- a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreeToJCasConverterTest.java +++ b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreeToJCasConverterTest.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; import java.util.Collection; @@ -24,16 +24,19 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.io.penntree.PennTreeNode; +import org.dkpro.core.io.penntree.PennTreeToJCasConverter; +import org.dkpro.core.io.penntree.PennTreeUtils; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.validation.extras.AllTokensHavePos; import org.junit.Assert; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.extras.AllTokensHavePos; public class PennTreeToJCasConverterTest { @@ -42,9 +45,9 @@ public void whenConvertingFromStringThenTheParentOfConstituensAreSet() throws UIMAException { MappingProvider posMappingProvider = MappingProviderFactory.createPosMappingProvider(null, - null, (String) null); + null, null, (String) null); MappingProvider constituentMappingProvider = MappingProviderFactory - .createConstituentMappingProvider(null, null, (String) null); + .createConstituentMappingProvider(null, null, null, (String) null); PennTreeToJCasConverter converter = new PennTreeToJCasConverter(posMappingProvider, constituentMappingProvider); diff --git a/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreeUtilsTest.java b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreeUtilsTest.java new file mode 100644 index 0000000000..b7d885e0d3 --- /dev/null +++ b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreeUtilsTest.java @@ -0,0 +1,186 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.penntree; + +import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import static org.dkpro.core.io.penntree.PennTreeUtils.convertPennTree; +import static org.dkpro.core.io.penntree.PennTreeUtils.parsePennTree; +import static org.dkpro.core.io.penntree.PennTreeUtils.selectDfs; +import static org.dkpro.core.io.penntree.PennTreeUtils.toPennTree; +import static org.dkpro.core.io.penntree.PennTreeUtils.toPrettyPennTree; +import static org.dkpro.core.io.penntree.PennTreeUtils.toText; +import static org.junit.Assert.assertEquals; + +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.penntree.PennTreeNode; +import org.dkpro.core.io.penntree.PennTreeToJCasConverter; +import org.dkpro.core.io.penntree.PennTreeUtils; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; + +/** + */ +public class PennTreeUtilsTest +{ + @Test + public void testParseSerialize() + { + doTest("(S (NP a) (VP b) (PUNC .))"); + doTest("(ROOT (S (NP (PRP It)) (VP (VBZ is) (PP (IN for) (NP (NP (DT this) (NN reason)) " + + "(SBAR (IN that) (S (NP (NN deconstruction)) (VP (VP (VBZ remains) (NP (NP " + + "(DT a) (JJ fundamental) (NN threat)) (PP (TO to) (NP (NNP Marxism))))) (, ,) " + + "(CC and) (VP (PP (IN by) (NP (NP (NN implication)) (PP (TO to) (NP (JJ other) " + + "(ADJP (JJ culturalist) (CC and) (JJ contextualizing)) " + + "(NNS approaches)))))))))))) (. .)))"); + } + + @Test + public void testPrettySerialize() + { + String tree = "(ROOT\n" + + " (S\n" + + " (S\n" + + " (NP\n" + + " (NP (DT The) (JJS strongest) (NN rain))\n" + + " (VP\n" + + " (ADVP (RB ever))\n" + + " (VBN recorded)\n" + + " (PP (IN in)\n" + + " (NP (NNP India)))))\n" + + " (VP\n" + + " (VP (VBD shut)\n" + + " (PRT (RP down))\n" + + " (NP\n" + + " (NP (DT the) (JJ financial) (NN hub))\n" + + " (PP (IN of)\n" + + " (NP (NNP Mumbai)))))\n" + + " (, ,)\n" + + " (VP (VBD snapped)\n" + + " (NP (NN communication) (NNS lines)))\n" + + " (, ,)\n" + + " (VP (VBD closed)\n" + + " (NP (NNS airports)))\n" + + " (CC and)\n" + + " (VP (VBD forced)\n" + + " (NP\n" + + " (NP (NNS thousands))\n" + + " (PP (IN of)\n" + + " (NP (NNS people))))\n" + + " (S\n" + + " (VP (TO to)\n" + + " (VP\n" + + " (VP (VB sleep)\n" + + " (PP (IN in)\n" + + " (NP (PRP$ their) (NNS offices))))\n" + + " (CC or)\n" + + " (VP (VB walk)\n" + + " (NP (NN home))\n" + + " (PP (IN during)\n" + + " (NP (DT the) (NN night))))))))))\n" + + " (, ,)\n" + + " (NP (NNS officials))\n" + + " (VP (VBD said)\n" + + " (NP-TMP (NN today)))\n" + + " (. .)))"; + + PennTreeNode n = parsePennTree(tree); + String actual = toPrettyPennTree(n); + + assertEquals(tree, actual); + } + + private static void doTest(String aBracket) + { + String expected = aBracket; + + PennTreeNode n = parsePennTree(expected); + + String actual = n.toString(); + + assertEquals(expected, actual); + } + + @Test + @Ignore("No asserts yet!") + public void testSelectDfs() + { + PennTreeNode n = parsePennTree( + "(ROOT (S (NP (PRP It)) (VP (VBZ is) (PP (IN for) (NP (NP (DT this) (NN reason)) " + + "(SBAR (IN that) (S (NP (NN deconstruction)) (VP (VP (VBZ remains) (NP (NP " + + "(DT a) (JJ fundamental) (NN threat)) (PP (TO to) (NP (NNP Marxism))))) (, ,) " + + "(CC and) (VP (PP (IN by) (NP (NP (NN implication)) (PP (TO to) (NP (JJ other) " + + "(ADJP (JJ culturalist) (CC and) (JJ contextualizing)) " + + "(NNS approaches)))))))))))) (. .)))"); + System.out.println(selectDfs(n, 1)); + System.out.println(selectDfs(n, 2)); + System.out.println(selectDfs(n, 3)); + System.out.println(selectDfs(n, 4)); + System.out.println(selectDfs(n, 5)); + System.out.println(selectDfs(n, 6)); + System.out.println(selectDfs(n, 7)); + System.out.println(selectDfs(n, 8)); + System.out.println(selectDfs(n, 9)); + System.out.println(selectDfs(n, 10)); + System.out.println(selectDfs(n, 11)); + System.out.println(selectDfs(n, 12)); + } + + @Test + public void testFromUimaConversion() + throws Exception + { + String documentEnglish = + "It is for this reason that deconstruction remains a ( fundamental ) threat to " + + "Marxism , and by implication to other culturalist and contextualizing " + + "approaches ."; + + String pennTree = "(ROOT (S (S (NP (PRP It)) (VP (VBZ is) (PP (IN for) (NP (DT this) " + + "(NN reason))) (SBAR (IN that) (S (NP (NN deconstruction)) (VP (VBZ remains) " + + "(NP (NP (DT a) (PRN (-LRB- -LRB-) (NN fundamental) (-RRB- -RRB-)) (NN threat)) " + + "(PP (TO to) (NP (NNP Marxism))))))))) (, ,) (CC and) (S (PP (IN by) (NP " + + "(NN implication))) (PP (TO to) (NP (NP (JJ other) (NN culturalist)) (CC and) " + + "(NP (VBG contextualizing) (NNS approaches))))) (. .)))"; + + PennTreeToJCasConverter converter = new PennTreeToJCasConverter(null, null); + converter.setInternTags(true); + converter.setWriteTracesToText(false); + converter.setCreatePosTags(true); + converter.setRootLabel("ROOT"); + + JCas jcas = JCasFactory.createJCas(); + + StringBuilder text = new StringBuilder(); + converter.convertPennTree(jcas, text, PennTreeUtils.parsePennTree(pennTree)); + jcas.setDocumentText(text.toString()); + + ROOT root = selectSingle(jcas, ROOT.class); + PennTreeNode r = convertPennTree(root); + + assertEquals(documentEnglish.trim(), toText(r).trim()); + AssertAnnotations.assertPennTree(pennTree, toPennTree(r)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankChunkedReaderTest.java b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankChunkedReaderTest.java similarity index 97% rename from dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankChunkedReaderTest.java rename to dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankChunkedReaderTest.java index 75f8d9d993..171725868e 100644 --- a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankChunkedReaderTest.java +++ b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankChunkedReaderTest.java @@ -15,22 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; +import static org.apache.uima.fit.util.JCasUtil.select; import static org.junit.Assert.assertEquals; -import static org.apache.uima.fit.util.JCasUtil.*; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.penntree.PennTreebankChunkedReader; +import org.dkpro.core.testing.AssertAnnotations; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; public class PennTreebankChunkedReaderTest { diff --git a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReaderTest.java b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankCombinedReaderTest.java similarity index 92% rename from dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReaderTest.java rename to dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankCombinedReaderTest.java index 6b64bfbd64..2c09f3e9bc 100644 --- a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReaderTest.java +++ b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankCombinedReaderTest.java @@ -15,20 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertConstituents; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.testing.AssertAnnotations.assertConstituents; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; import static org.junit.Assert.assertEquals; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.penntree.PennTreebankCombinedReader; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -58,7 +59,8 @@ public void test() String[] sentences = { "Al Qaida Endorses George W. Bush for President", "Al-Qaeda tries to incite more violence in Iraq" }; - String[] tokens1 = { "Al", "Qaida", "Endorses", "George", "W.", "Bush", "for", "President" }; + String[] tokens1 = { "Al", "Qaida", "Endorses", "George", "W.", "Bush", "for", + "President" }; String[] constituentMapped1 = { "Constituent 0,46", "Constituent 0,8", "Constituent 18,32", "Constituent 33,46", "Constituent 37,46", "Constituent 9,46", "ROOT 0,46" }; @@ -66,7 +68,8 @@ public void test() String[] constituentOriginal1 = { "NP 0,8", "NP 18,32", "NP 37,46", "PP 33,46", "ROOT 0,46", "S 0,46", "VP 9,46" }; - String[] tokens2 = { "Al-Qaeda", "tries", "to", "incite", "more", "violence", "in", "Iraq" }; + String[] tokens2 = { "Al-Qaeda", "tries", "to", "incite", "more", "violence", "in", + "Iraq" }; String[] constituentMapped2 = { "Constituent 47,55", "Constituent 47,93", "Constituent 56,93", "Constituent 62,93", "Constituent 65,93", "Constituent 72,85", @@ -105,7 +108,8 @@ public void testWithDirectSpeech() String[] tokens = { "``", "And", "what", "do", "you", "know", "?", "''" }; - String[] posMapped = { "POS_PUNCT", "POS_CONJ", "POS_PRON", "POS_VERB", "POS_PRON", "POS_VERB", "POS_PUNCT", "POS_PUNCT" }; + String[] posMapped = { "POS_PUNCT", "POS_CONJ", "POS_PRON", "POS_VERB", "POS_PRON", + "POS_VERB", "POS_PUNCT", "POS_PUNCT" }; String[] posOriginal = { "``", "CC", "WP", "VBP", "PRP", "VB", ".", "''" }; diff --git a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReaderWriterTest.java b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankCombinedReaderWriterTest.java similarity index 95% rename from dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReaderWriterTest.java rename to dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankCombinedReaderWriterTest.java index 0f7319d33b..da25af1569 100644 --- a/dkpro-core-io-penntree-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/penntree/PennTreebankCombinedReaderWriterTest.java +++ b/dkpro-core-io-penntree-asl/src/test/java/org/dkpro/core/io/penntree/PennTreebankCombinedReaderWriterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.penntree; +package org.dkpro.core.io.penntree; import static java.util.Arrays.asList; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; @@ -33,14 +33,15 @@ import org.apache.commons.io.FilenameUtils; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; +import org.dkpro.core.io.penntree.PennTreebankCombinedReader; +import org.dkpro.core.io.penntree.PennTreebankCombinedWriter; +import org.dkpro.core.testing.EOLUtils; import org.junit.Before; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; -import de.tudarmstadt.ukp.dkpro.core.testing.EOLUtils; - public class PennTreebankCombinedReaderWriterTest { @Test @@ -107,8 +108,8 @@ public void testOneWay(String aExpectedFile, String aFile, Object... aExtraParam extraReaderParams.add(input); extraReaderParams.addAll(asList(aExtraParams)); - CollectionReaderDescription reader = createReaderDescription(PennTreebankCombinedReader.class, - extraReaderParams.toArray()); + CollectionReaderDescription reader = createReaderDescription( + PennTreebankCombinedReader.class, extraReaderParams.toArray()); List extraWriterParams = new ArrayList<>(); extraWriterParams.add(PennTreebankCombinedWriter.PARAM_TARGET_LOCATION); diff --git a/dkpro-core-io-penntree-asl/src/test/resources/log4j.properties b/dkpro-core-io-penntree-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-penntree-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-penntree-asl/src/test/resources/log4j2.xml b/dkpro-core-io-penntree-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-penntree-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-lbj-asl/LICENSE.txt b/dkpro-core-io-perseus-asl/LICENSE.txt similarity index 100% rename from dkpro-core-lbj-asl/LICENSE.txt rename to dkpro-core-io-perseus-asl/LICENSE.txt diff --git a/dkpro-core-io-perseus-asl/pom.xml b/dkpro-core-io-perseus-asl/pom.xml new file mode 100644 index 0000000000..bfd3b70363 --- /dev/null +++ b/dkpro-core-io-perseus-asl/pom.xml @@ -0,0 +1,103 @@ + + + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-io-perseus-asl + jar + DKPro Core ASL - IO - Perseus Treebank + https://dkpro.github.io/dkpro-core/ + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.dkpro.core + dkpro-core-api-io-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + org.dkpro.core + dkpro-core-api-lexmorph-asl + + + org.dkpro.core + dkpro-core-api-resources-asl + + + org.dkpro.core + dkpro-core-api-parameter-asl + + + org.dkpro.core + dkpro-core-api-syntax-asl + + + javax.xml.bind + jaxb-api + + + com.sun.xml.bind + jaxb-core + + + com.sun.xml.bind + jaxb-impl + + + javax.activation + javax.activation-api + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + true + + + com.sun.xml.bind:jaxb-core + com.sun.xml.bind:jaxb-impl + javax.activation:javax.activation-api + + + + + + diff --git a/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/PerseusReader.java b/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/PerseusReader.java new file mode 100644 index 0000000000..c3f1bbbc95 --- /dev/null +++ b/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/PerseusReader.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.perseus; + +import static org.dkpro.core.api.resources.CompressionUtils.getInputStream; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.IOException; +import java.io.InputStream; +import java.util.LinkedHashMap; +import java.util.Map; + +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBException; +import javax.xml.bind.Unmarshaller; +import javax.xml.stream.XMLEventReader; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.events.StartElement; +import javax.xml.stream.events.XMLEvent; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Type; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.perseus.internal.model.PerseusSentence; +import org.dkpro.core.io.perseus.internal.model.PerseusWord; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reader for the Perseus Treebank XML format. + */ +@ResourceMetaData(name = "Perseus Treebank XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_X_PERSEUS_XML}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) +public class PerseusReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * Read fine-grained part-of-speech information. + */ + public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; + @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") + private boolean readPos; + + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String mappingPosLocation; + + /** + * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the + * tag set defined as part of the model meta data. This can be useful if a custom model is + * specified which does not have such meta data, or it can be used in readers. + */ + public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; + @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) + protected String posTagset; + + /** + * Read lemma information. + */ + public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; + @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") + private boolean readLemma; + + /** + * Read syntactic dependency information. + */ + public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; + @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") + private boolean readDependency; + + private MappingProvider posMappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + posMappingProvider = createPosMappingProvider(this, mappingPosLocation, posTagset, + getLanguage()); + } + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + try { + posMappingProvider.configure(aJCas.getCas()); + } + catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } + + try (InputStream is = getInputStream(res.getLocation(), res.getInputStream())) { + XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); + XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(is); + + JAXBContext context = JAXBContext.newInstance(PerseusSentence.class); + Unmarshaller unmarshaller = context.createUnmarshaller(); + + JCasBuilder jb = new JCasBuilder(aJCas); + + XMLEvent e = null; + while ((e = xmlEventReader.peek()) != null) { + if (isStartElement(e, "sentence")) { + PerseusSentence sentence = unmarshaller + .unmarshal(xmlEventReader, PerseusSentence.class).getValue(); + readSentence(jb, sentence); + } + else { + xmlEventReader.next(); + } + + } + + jb.close(); + } + catch (XMLStreamException ex1) { + throw new IOException(ex1); + } + catch (JAXBException ex2) { + throw new IOException(ex2); + } + } + + protected void readSentence(JCasBuilder aBuilder, PerseusSentence aSentence) + { + int sentenceBegin = aBuilder.getPosition(); + int sentenceEnd = aBuilder.getPosition(); + Map perseusWords = new LinkedHashMap<>(); + Map tokens = new LinkedHashMap<>(); + + for (PerseusWord w : aSentence.words) { + Token token = aBuilder.add(w.form, Token.class); + token.setId(w.id); + tokens.put(Integer.valueOf(w.id), token); + perseusWords.put(Integer.valueOf(w.id), w); + + if (readLemma && w.lemma != null) { + Lemma lemma = new Lemma(aBuilder.getJCas(), token.getBegin(), token.getEnd()); + lemma.setValue(w.lemma); + lemma.addToIndexes(); + token.setLemma(lemma); + } + + if (readPos) { + Type posType = posMappingProvider.getTagType(w.postag); + POS posAnno = (POS) aBuilder.getJCas().getCas().createAnnotation(posType, + token.getBegin(), token.getEnd()); + if (w.postag != null) { + posAnno.setPosValue(w.postag.intern()); + } + POSUtils.assignCoarseValue(posAnno); + posAnno.addToIndexes(); + token.setPos(posAnno); + } + + token.addToIndexes(); + + // Remember position before adding space + sentenceEnd = aBuilder.getPosition(); + + aBuilder.add(" "); + + } + aBuilder.add("\n"); + + // Dependencies + if (readDependency) { + for (PerseusWord word : perseusWords.values()) { + int depId = Integer.valueOf(word.id); + int govId = word.head; + + // Model the root as a loop onto itself + Dependency rel; + if (govId == 0) { + rel = new Dependency(aBuilder.getJCas()); + rel.setGovernor(tokens.get(depId)); + rel.setDependent(tokens.get(depId)); + rel.setDependencyType(word.relation); + rel.setBegin(rel.getDependent().getBegin()); + rel.setEnd(rel.getDependent().getEnd()); + rel.setFlavor(DependencyFlavor.BASIC); + rel.addToIndexes(); + } + else { + rel = new Dependency(aBuilder.getJCas()); + rel.setGovernor(tokens.get(govId)); + rel.setDependent(tokens.get(depId)); + rel.setDependencyType(word.relation); + rel.setBegin(rel.getDependent().getBegin()); + rel.setEnd(rel.getDependent().getEnd()); + rel.setFlavor(DependencyFlavor.BASIC); + rel.addToIndexes(); + } + + if (rel.getDependent() == null) { + throw new IllegalStateException( + "Referred dependent with ID [" + depId + "] not found"); + } + if (rel.getGovernor() == null) { + throw new IllegalStateException( + "Referred governor with ID [" + govId + "] not found"); + } + } + } + Sentence sentence = new Sentence(aBuilder.getJCas(), sentenceBegin, sentenceEnd); + sentence.setId(String.valueOf(aSentence.id)); + sentence.addToIndexes(); + } + + public static boolean isStartElement(XMLEvent aEvent, String aElement) + { + return aEvent.isStartElement() + && ((StartElement) aEvent).getName().getLocalPart().equals(aElement); + } +} diff --git a/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/internal/model/PerseusSentence.java b/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/internal/model/PerseusSentence.java new file mode 100644 index 0000000000..7299f50bf1 --- /dev/null +++ b/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/internal/model/PerseusSentence.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.perseus.internal.model; + +import java.util.List; + +import javax.xml.bind.annotation.XmlAttribute; +import javax.xml.bind.annotation.XmlElement; +import javax.xml.bind.annotation.XmlID; + +public class PerseusSentence +{ + @XmlID + @XmlAttribute(name = "id") + public String id; + + @XmlAttribute(name = "subdoc") + public String subdoc; + + @XmlAttribute(name = "document_id") + public String documentId; + + @XmlElement(name = "word") + public List words; +} diff --git a/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/internal/model/PerseusWord.java b/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/internal/model/PerseusWord.java new file mode 100644 index 0000000000..eb94054820 --- /dev/null +++ b/dkpro-core-io-perseus-asl/src/main/java/org/dkpro/core/io/perseus/internal/model/PerseusWord.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.perseus.internal.model; + +import javax.xml.bind.annotation.XmlAttribute; +import javax.xml.bind.annotation.XmlID; + +public class PerseusWord +{ + @XmlID + @XmlAttribute + public String id; + + @XmlAttribute + public String form; + + @XmlAttribute + public String lemma; + + @XmlAttribute + public String postag; + + @XmlAttribute + public String relation; + + @XmlAttribute + public String cite; + + @XmlAttribute + public int head; + + @XmlAttribute(name = "insertion_id") + public String insertionId; + + @XmlAttribute + public String artificial; +} diff --git a/dkpro-core-stopwordremover-asl/LICENSE.txt b/dkpro-core-io-pubannotation-asl/LICENSE.txt similarity index 100% rename from dkpro-core-stopwordremover-asl/LICENSE.txt rename to dkpro-core-io-pubannotation-asl/LICENSE.txt diff --git a/dkpro-core-io-pubannotation-asl/pom.xml b/dkpro-core-io-pubannotation-asl/pom.xml new file mode 100644 index 0000000000..9d3fb55a77 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/pom.xml @@ -0,0 +1,83 @@ + + + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-io-pubannotation-asl + jar + DKPro Core ASL - IO - PubAnnotation + https://dkpro.github.io/dkpro-core/ + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-annotations + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.apache.commons + commons-lang3 + + + org.dkpro.core + dkpro-core-api-parameter-asl + + + org.dkpro.core + dkpro-core-api-metadata-asl + + + org.dkpro.core + dkpro-core-api-io-asl + + + junit + junit + test + + + org.dkpro.core + dkpro-core-testing-asl + test + + + org.dkpro.core + dkpro-core-api-ner-asl + test + + + \ No newline at end of file diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/PubAnnotationReader.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/PubAnnotationReader.java new file mode 100644 index 0000000000..443f7cb026 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/PubAnnotationReader.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.pubannotation.internal.PubAnnotation2DKPro; +import org.dkpro.core.io.pubannotation.internal.model.PADocument; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.ObjectMapper; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +/** + * Reader for the PubAnnotation format. + * + * Since the PubAnnotation format only associates spans/relations with simple values and since + * annotations are not typed, it is necessary to define target types and features via + * {@link #PARAM_SPAN_TYPE} and {@link #PARAM_SPAN_LABEL_FEATURE}. In PubAnnotation, every + * annotation has an ID. If the target type has a suitable feature to retain the ID, it can be + * configured via {@link #PARAM_SPAN_ID_FEATURE}. + * + * The {@code sourcedb} and {@code sourceid} from the PubAnnotation document are imported as + * {@link DocumentMetaData#setCollectionId(String) collectionId} and + * {@link DocumentMetaData#setDocumentId(String) documentId} respectively. If present, also the + * {@code target} is imported as {@link DocumentMetaData#setDocumentUri(String) documentUri}. The + * {@link DocumentMetaData#setDocumentBaseUri(String) documentBaseUri} is cleared in this case. + * + * Currently supports only span annotations, i.e. no relations or modifications. Discontinuous + * segments are also not supported. + * + * @see PubAnnotation format + */ +@ResourceMetaData(name = "PubAnnotation Reader") +@MimeTypeCapability({MimeTypes.APPLICATION_X_PUB_ANNOTATION_JSON}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) +public class PubAnnotationReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * The span annotation type to which the PubAnnotation spans are mapped. + */ + public static final String PARAM_SPAN_TYPE = "spanType"; + @ConfigurationParameter(name = PARAM_SPAN_TYPE, mandatory = true) + private String spanType; + + /** + * The feature on the span annotation type which receives the ID. + */ + public static final String PARAM_SPAN_ID_FEATURE = "spanIdFeature"; + @ConfigurationParameter(name = PARAM_SPAN_ID_FEATURE, mandatory = false) + private String spanIdFeature; + + /** + * The feature on the span annotation type which receives the label. + */ + public static final String PARAM_SPAN_LABEL_FEATURE = "spanLabelFeature"; + @ConfigurationParameter(name = PARAM_SPAN_LABEL_FEATURE, mandatory = false) + private String spanLabelFeature; + + /** + * The feature on the span annotation type which receives the label. + */ + public static final String PARAM_RESOLVE_NAMESPACES = "resolveNamespaces"; + @ConfigurationParameter(name = PARAM_RESOLVE_NAMESPACES, mandatory = true, defaultValue = "false") + private boolean resolveNamespaces; + + private ObjectMapper mapper; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + mapper = new ObjectMapper(); + // Hack because LXF dumper presently creates invalid JSON + mapper.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true); + } + + @Override + public void getNext(JCas aCAS) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aCAS, res); + + PubAnnotation2DKPro converter = new PubAnnotation2DKPro(); + converter.setSpanMapping(spanType, spanIdFeature, spanLabelFeature); + converter.setResolveNamespaces(resolveNamespaces); + + try (InputStream is = new BufferedInputStream(res.getInputStream())) { + PADocument doc = mapper.readValue(is, PADocument.class); + converter.convert(doc, aCAS); + } + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/PubAnnotationWriter.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/PubAnnotationWriter.java new file mode 100644 index 0000000000..64d49d5a16 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/PubAnnotationWriter.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.pubannotation; + +import java.io.OutputStream; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.pubannotation.internal.DKPro2PubAnnotation; +import org.dkpro.core.io.pubannotation.internal.model.PADocument; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +/** + * Writer for the PubAnnotation format. + * + * Since the PubAnnotation format only associates spans/relations with simple values and since + * annotations are not typed, it is necessary to define target types and features via + * {@link #PARAM_SPAN_TYPE} and {@link #PARAM_SPAN_LABEL_FEATURE}. In PubAnnotation, every + * annotation has an ID. If the annotation type has an ID feature, it can be configured via + * {@link #PARAM_SPAN_ID_FEATURE}. If this parameter is not set, the IDs are generated + * automatically. + * + * The {@code sourcedb} and {@code sourceid} from the PubAnnotation document are exported from + * {@link DocumentMetaData#setCollectionId(String) collectionId} and + * {@link DocumentMetaData#setDocumentId(String) documentId} respectively. The {@code target} is + * exported from {@link DocumentMetaData#setDocumentUri(String) documentUri}. + * + * Currently supports only span annotations, i.e. no relations or modifications. Discontinuous + * segments are also not supported. + * + * @see PubAnnotation format + */ +@ResourceMetaData(name = "PubAnnotation Writer") +@MimeTypeCapability({MimeTypes.APPLICATION_X_PUB_ANNOTATION_JSON}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) +public class PubAnnotationWriter + extends JCasFileWriter_ImplBase +{ + /** + * The span annotation type to which the PubAnnotation spans are mapped. + */ + public static final String PARAM_SPAN_TYPE = "spanType"; + @ConfigurationParameter(name = PARAM_SPAN_TYPE, mandatory = true) + private String spanType; + + /** + * The feature on the span annotation type which receives the ID. + */ + public static final String PARAM_SPAN_ID_FEATURE = "spanIdFeature"; + @ConfigurationParameter(name = PARAM_SPAN_ID_FEATURE, mandatory = false) + private String spanIdFeature; + + /** + * The feature on the span annotation type which receives the label. + */ + public static final String PARAM_SPAN_LABEL_FEATURE = "spanLabelFeature"; + @ConfigurationParameter(name = PARAM_SPAN_LABEL_FEATURE, mandatory = false) + private String spanLabelFeature; + + /** + * Specify the suffix of output files. Default value .json. If the suffix is not + * needed, provide an empty string as value. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".json") + private String filenameSuffix; + + private ObjectMapper mapper; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + mapper = new ObjectMapper(); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + PADocument doc = new PADocument(); + + DKPro2PubAnnotation converter = new DKPro2PubAnnotation(); + converter.setSpanMapping(spanType, spanIdFeature, spanLabelFeature); + converter.convert(aJCas, doc); + + try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { + mapper.writerWithDefaultPrettyPrinter().writeValue(docOS, doc); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/DKPro2PubAnnotation.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/DKPro2PubAnnotation.java new file mode 100644 index 0000000000..9ec553b212 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/DKPro2PubAnnotation.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation.internal; + +import static org.apache.uima.fit.util.CasUtil.select; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.pubannotation.internal.model.PADenotation; +import org.dkpro.core.io.pubannotation.internal.model.PADocument; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class DKPro2PubAnnotation +{ + private String spanTypeName; + private String spanIdFeatureName; + private String spanLabelFeatureName; + + public void convert(JCas aJCas, PADocument aDoc) + { + aDoc.setText(aJCas.getDocumentText()); + + // Map metadata + DocumentMetaData dmd = DocumentMetaData.get(aJCas); + aDoc.setTarget(dmd.getDocumentUri()); + aDoc.setSourceDb(dmd.getCollectionId()); + aDoc.setSourceId(dmd.getDocumentId()); + + // Map span annotations + CAS cas = aJCas.getCas(); + Type spanType = CasUtil.getAnnotationType(cas, spanTypeName); + Feature spanIdFeature = spanType.getFeatureByBaseName(spanIdFeatureName); + Feature spanLabelFeature = spanType.getFeatureByBaseName(spanLabelFeatureName); + + for (AnnotationFS spanAnnotation : select(cas, spanType)) { + PADenotation denotation = new PADenotation(spanAnnotation.getBegin(), + spanAnnotation.getEnd()); + + // Set the ID from the ID feature if one was specified, otherwise set it from the + // annotation address. + if (spanIdFeature != null) { + denotation.setId(spanAnnotation.getFeatureValueAsString(spanIdFeature)); + } + else { + denotation.setId(Integer.toString(((CASImpl) cas).ll_getFSRef(spanAnnotation))); + } + + if (spanLabelFeature != null) { + denotation.setObj(spanAnnotation.getFeatureValueAsString(spanLabelFeature)); + } + + aDoc.addDenotation(denotation); + } + } + + public void setSpanMapping(String aSpanType, String aSpanIdFeature, String aSpanLabelFeature) + { + spanTypeName = aSpanType; + spanIdFeatureName = aSpanIdFeature; + spanLabelFeatureName = aSpanLabelFeature; + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/PubAnnotation2DKPro.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/PubAnnotation2DKPro.java new file mode 100644 index 0000000000..fad3c8c976 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/PubAnnotation2DKPro.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation.internal; + +import static org.apache.commons.lang3.StringUtils.isNotBlank; + +import java.util.Optional; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.pubannotation.internal.model.PADenotation; +import org.dkpro.core.io.pubannotation.internal.model.PADocument; +import org.dkpro.core.io.pubannotation.internal.model.PANamespace; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class PubAnnotation2DKPro +{ + private String spanTypeName; + private String spanIdFeatureName; + private String spanLabelFeatureName; + private boolean resolveNamespaces; + + public void convert(PADocument aDoc, JCas aCAS) + { + aCAS.setDocumentText(aDoc.getText()); + + // If source DB and/or source ID are set, put them into the collection/document IDs + DocumentMetaData dmd = DocumentMetaData.get(aCAS); + if (isNotBlank(aDoc.getSourceDb())) { + dmd.setCollectionId(aDoc.getSourceDb()); + } + if (isNotBlank(aDoc.getSourceId())) { + dmd.setDocumentId(aDoc.getSourceId()); + } + + // If the target is set in PubAnnotation, treat it as the document URI and clear the + // documentBaseUri since we do not know the base URI for PubAnnotation files. + // REC: not sure if this is a great idea... + if (isNotBlank(aDoc.getTarget())) { + dmd.setDocumentBaseUri(null); + dmd.setDocumentUri(aDoc.getTarget()); + } + + // Map span annotations + CAS cas = aCAS.getCas(); + Type spanType = CasUtil.getAnnotationType(cas, spanTypeName); + Feature spanIdFeature = spanType.getFeatureByBaseName(spanIdFeatureName); + Feature spanLabelFeature = spanType.getFeatureByBaseName(spanLabelFeatureName); + Optional baseNS = aDoc.getNamespace(PANamespace.PREFIX_BASE); + for (PADenotation span : aDoc.getDenotations()) { + AnnotationFS spanAnnotation = cas.createAnnotation(spanType, span.getSpan().getBegin(), + span.getSpan().getEnd()); + + // If an ID feature was set, then we set its value + if (spanIdFeature != null) { + spanAnnotation.setFeatureValueFromString(spanIdFeature, span.getId()); + } + + // If a label feature was set, then we set its value + if (spanLabelFeature != null) { + String value = span.getObj(); + if (resolveNamespaces && baseNS.isPresent()) { + value = baseNS.get().getUri() + value; + } + spanAnnotation.setFeatureValueFromString(spanLabelFeature, value); + } + + cas.addFsToIndexes(spanAnnotation); + } + } + + public void setResolveNamespaces(boolean aResolveNamespaces) + { + resolveNamespaces = aResolveNamespaces; + } + + public boolean getResolveNamespaces() + { + return resolveNamespaces; + } + + public void setSpanMapping(String aSpanType, String aSpanIdFeature, String aSpanLabelFeature) + { + spanTypeName = aSpanType; + spanIdFeatureName = aSpanIdFeature; + spanLabelFeatureName = aSpanLabelFeature; + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PADenotation.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PADenotation.java new file mode 100644 index 0000000000..94388ba50c --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PADenotation.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation.internal.model; + +// "denotations": [ +// {"id": "T1", "span": {"begin": 0, "end": 5}, "obj": "Protein"}, +// {"id": "T2", "span": {"begin": 42, "end": 47}, "obj": "Protein"}, +// {"id": "E1", "span": {"begin": 6, "end": 16}, "obj": "Expression"}, +// {"id": "E2", "span": {"begin": 31, "end": 38}, "obj": "Regulation"} +// ] +public class PADenotation +{ + private String id; + private PAOffsets span; + private String obj; + + public PADenotation() + { + // Default constructor + } + + public PADenotation(int aBegin, int aEnd) + { + span = new PAOffsets(aBegin, aEnd); + } + + public String getId() + { + return id; + } + + public void setId(String aId) + { + id = aId; + } + + public PAOffsets getSpan() + { + return span; + } + + public void setSpan(PAOffsets aSpan) + { + span = aSpan; + } + + public String getObj() + { + return obj; + } + + public void setObj(String aObj) + { + obj = aObj; + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PADocument.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PADocument.java new file mode 100644 index 0000000000..b01b8968d2 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PADocument.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation.internal.model; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonPropertyOrder; + +/** + * @see PubAnnotation documentation + */ +@JsonPropertyOrder({ "target", "sourcedb", "sourceid", "text", "project", "denotations", + "relations", "modifications", "namespaces" }) +public class PADocument +{ + @JsonInclude(Include.NON_NULL) + private String target; + + @JsonInclude(Include.NON_NULL) + private String project; + + @JsonProperty("sourcedb") + @JsonInclude(Include.NON_NULL) + private String sourceDb; + + @JsonProperty("sourceid") + @JsonInclude(Include.NON_NULL) + private String sourceId; + + private String text; + + @JsonInclude(Include.NON_EMPTY) + private List denotations = new ArrayList<>(); + + @JsonInclude(Include.NON_EMPTY) + private List relations = new ArrayList<>(); + + @JsonInclude(Include.NON_EMPTY) + private List namespaces = new ArrayList<>(); + + @JsonInclude(Include.NON_EMPTY) + private List modifications = new ArrayList<>(); + + public PADocument() + { + // Default constructor + } + + public String getTarget() + { + return target; + } + + public void setTarget(String aTarget) + { + target = aTarget; + } + + public String getProject() + { + return project; + } + + public void setProject(String aProject) + { + project = aProject; + } + + public String getSourceDb() + { + return sourceDb; + } + + public void setSourceDb(String aSourcedb) + { + sourceDb = aSourcedb; + } + + public String getSourceId() + { + return sourceId; + } + + public void setSourceId(String aSourceid) + { + sourceId = aSourceid; + } + + public String getText() + { + return text; + } + + public void setText(String aText) + { + text = aText; + } + + public boolean addDenotation(PADenotation aE) + { + return denotations.add(aE); + } + + public List getDenotations() + { + return denotations; + } + + public void setDenotations(List aDenotations) + { + denotations = aDenotations; + } + + public List getRelations() + { + return relations; + } + + public void setRelations(List aRelations) + { + relations = aRelations; + } + + public List getNamespaces() + { + return namespaces; + } + + public void setNamespaces(List aNamespaces) + { + namespaces = aNamespaces; + } + + public List getModifications() + { + return modifications; + } + + public void setModifications(List aModifications) + { + modifications = aModifications; + } + + public Optional getNamespace(String aPrefix) + { + return namespaces.stream() + .filter(it -> it.getPrefix().equals(aPrefix)) + .findFirst(); + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PAModification.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PAModification.java new file mode 100644 index 0000000000..5ed40ec8ca --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PAModification.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation.internal.model; + +// "modifications": [ +// {"id": "M1", "pred": "Speculation", "obj": "E2"} +// ] +public class PAModification +{ + public static final String PRED_SPECULATION = "Speculation"; + public static final String PRED_NEGATION = "Negation"; + + private String id; + private String pred; + private String obj; + + public PAModification() + { + // Default constructor + } + + public String getId() + { + return id; + } + + public void setId(String aId) + { + id = aId; + } + + public String getPred() + { + return pred; + } + + public void setPred(String aPred) + { + pred = aPred; + } + + public String getObj() + { + return obj; + } + + public void setObj(String aObj) + { + obj = aObj; + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PANamespace.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PANamespace.java new file mode 100644 index 0000000000..67f22c6f32 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PANamespace.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation.internal.model; + +// "namespaces":[ +// {"prefix": "_base", "uri": "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info\u0026id="} +// ] +public class PANamespace +{ + public static final String PREFIX_BASE = "_base"; + + private String prefix; + private String uri; + + public PANamespace() + { + // Default constructor + } + + public String getPrefix() + { + return prefix; + } + + public void setPrefix(String aPrefix) + { + prefix = aPrefix; + } + + public String getUri() + { + return uri; + } + + public void setUri(String aUri) + { + uri = aUri; + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PAOffsets.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PAOffsets.java new file mode 100644 index 0000000000..09ed2ff223 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PAOffsets.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation.internal.model; + +// {"begin": 0, "end": 5} +public class PAOffsets +{ + private int begin; + private int end; + + public PAOffsets() + { + // Default constructor + } + + public PAOffsets(int aBegin, int aEnd) + { + begin = aBegin; + end = aEnd; + } + + public int getBegin() + { + return begin; + } + + public void setBegin(int aBegin) + { + begin = aBegin; + } + + public int getEnd() + { + return end; + } + + public void setEnd(int aEnd) + { + end = aEnd; + } + + @Override + public String toString() + { + StringBuilder builder = new StringBuilder(); + builder.append("["); + builder.append(begin); + builder.append("-"); + builder.append(end); + builder.append("]"); + return builder.toString(); + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PARelation.java b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PARelation.java new file mode 100644 index 0000000000..e8eec09cf5 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/main/java/org/dkpro/core/io/pubannotation/internal/model/PARelation.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation.internal.model; + +// "relations": [ +// {"id": "R1", "subj": "T1", "pred": "themeOf", "obj": "E1"}, +// {"id": "R2", "subj": "E1", "pred": "themeOf", "obj": "E2"}, +// {"id": "R3", "subj": "T2", "pred": "causeOf", "obj": "E2"} +// ] +public class PARelation +{ + private String id; + private String subj; + private String pred; + private String obj; + + public PARelation() + { + // Default constructor + } + + public String getId() + { + return id; + } + public void setId(String aId) + { + id = aId; + } + public String getSubj() + { + return subj; + } + public void setSubj(String aSubj) + { + subj = aSubj; + } + public String getPred() + { + return pred; + } + public void setPred(String aPred) + { + pred = aPred; + } + public String getObj() + { + return obj; + } + public void setObj(String aObj) + { + obj = aObj; + } +} diff --git a/dkpro-core-io-pubannotation-asl/src/test/java/org/dkpro/core/io/pubannotation/PubAnnotationReaderWriterTest.java b/dkpro-core-io-pubannotation-asl/src/test/java/org/dkpro/core/io/pubannotation/PubAnnotationReaderWriterTest.java new file mode 100644 index 0000000000..fa074f9a2f --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/test/java/org/dkpro/core/io/pubannotation/PubAnnotationReaderWriterTest.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.pubannotation; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; + +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; + +public class PubAnnotationReaderWriterTest +{ + @Test + public void roundTrip() + throws Exception + { + testOneWay( + createReaderDescription(PubAnnotationReader.class, + PubAnnotationReader.PARAM_SPAN_TYPE, NamedEntity.class, + PubAnnotationReader.PARAM_SPAN_LABEL_FEATURE, "value", + PubAnnotationReader.PARAM_SPAN_ID_FEATURE, "identifier"), + createEngineDescription(PubAnnotationWriter.class, + PubAnnotationWriter.PARAM_SPAN_TYPE, NamedEntity.class, + PubAnnotationWriter.PARAM_SPAN_LABEL_FEATURE, "value", + PubAnnotationWriter.PARAM_SPAN_ID_FEATURE, "identifier"), + "pubannotation/SPECIES800/19667393-ref.json", + "pubannotation/SPECIES800/19667393.json"); + } + + @Test + public void roundTripResolveNamespaces() + throws Exception + { + testOneWay( + createReaderDescription(PubAnnotationReader.class, + PubAnnotationReader.PARAM_SPAN_TYPE, NamedEntity.class, + PubAnnotationReader.PARAM_SPAN_LABEL_FEATURE, "value", + PubAnnotationReader.PARAM_SPAN_ID_FEATURE, "identifier", + PubAnnotationReader.PARAM_RESOLVE_NAMESPACES, true), + createEngineDescription(PubAnnotationWriter.class, + PubAnnotationWriter.PARAM_SPAN_TYPE, NamedEntity.class, + PubAnnotationWriter.PARAM_SPAN_LABEL_FEATURE, "value", + PubAnnotationWriter.PARAM_SPAN_ID_FEATURE, "identifier"), + "pubannotation/SPECIES800/19667393-ref-ns.json", + "pubannotation/SPECIES800/19667393.json"); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393-ref-ns.json b/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393-ref-ns.json new file mode 100644 index 0000000000..34ab3fe2c5 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393-ref-ns.json @@ -0,0 +1,68 @@ +{ + "sourceid" : "19667393", + "text" : "Methanoregula formicica sp. nov., a methane-producing archaeon isolated from methanogenic sludge.\nA novel methane-producing archaeon, strain SMSP(T), was isolated from an anaerobic, propionate-degrading enrichment culture that was originally obtained from granular sludge in a mesophilic upflow anaerobic sludge blanket (UASB) reactor used to treat a beer brewery effluent. Cells were non-motile, blunt-ended, straight rods, 1.0-2.6 μm long by 0.5 μm wide; cells were sometimes up to 7 μm long. Asymmetrical cell division was observed in rod-shaped cells. Coccoid cells (0.5-1.0 μm in diameter) were also observed in mid- to late-exponential phase cultures. Growth was observed between 10 and 40 °C (optimum, 30-33 °C) and pH 7.0 and 7.6 (optimum, pH 7.4). The G+C content of the genomic DNA was 56.2 mol%. The strain utilized formate and hydrogen for growth and methane production. Based on comparative sequence analyses of the 16S rRNA and mcrA (encoding the alpha subunit of methyl-coenzyme M reductase, a key enzyme in the methane-producing pathway) genes, strain SMSP(T) was affiliated with group E1/E2 within the order Methanomicrobiales. The closest relative based on both 16S rRNA and mcrA gene sequences was Methanoregula boonei 6A8(T) (96.3 % 16S rRNA gene sequence similarity, 85.4 % deduced McrA amino acid sequence similarity). The percentage of 16S rRNA gene sequence similarity indicates that strain SMSP(T) and Methanoregula boonei 6A8(T) represent different species within the same genus. This is supported by our findings of shared phenotypic properties, including cell morphology and growth temperature range, and phenotypic differences in substrate usage and pH range. Based on these genetic and phenotypic properties, we propose that strain SMSP(T) represents a novel species of the genus Methanoregula, for which we propose the name Methanoregula formicica sp. nov., with the type strain SMSP(T) (=NBRC 105244(T) =DSM 22288(T)).", + "denotations" : [ { + "id" : "T1", + "span" : { + "begin" : 0, + "end" : 23 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=882104" + }, { + "id" : "T2", + "span" : { + "begin" : 141, + "end" : 148 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=882104" + }, { + "id" : "T3", + "span" : { + "begin" : 1068, + "end" : 1075 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=882104" + }, { + "id" : "T4", + "span" : { + "begin" : 1217, + "end" : 1241 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=358766" + }, { + "id" : "T5", + "span" : { + "begin" : 1415, + "end" : 1422 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=882104" + }, { + "id" : "T6", + "span" : { + "begin" : 1427, + "end" : 1451 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=358766" + }, { + "id" : "T7", + "span" : { + "begin" : 1762, + "end" : 1769 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=882104" + }, { + "id" : "T8", + "span" : { + "begin" : 1855, + "end" : 1878 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=882104" + }, { + "id" : "T9", + "span" : { + "begin" : 1910, + "end" : 1917 + }, + "obj" : "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=882104" + } ] +} \ No newline at end of file diff --git a/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393-ref.json b/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393-ref.json new file mode 100644 index 0000000000..eb6af61def --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393-ref.json @@ -0,0 +1,68 @@ +{ + "sourceid" : "19667393", + "text" : "Methanoregula formicica sp. nov., a methane-producing archaeon isolated from methanogenic sludge.\nA novel methane-producing archaeon, strain SMSP(T), was isolated from an anaerobic, propionate-degrading enrichment culture that was originally obtained from granular sludge in a mesophilic upflow anaerobic sludge blanket (UASB) reactor used to treat a beer brewery effluent. Cells were non-motile, blunt-ended, straight rods, 1.0-2.6 μm long by 0.5 μm wide; cells were sometimes up to 7 μm long. Asymmetrical cell division was observed in rod-shaped cells. Coccoid cells (0.5-1.0 μm in diameter) were also observed in mid- to late-exponential phase cultures. Growth was observed between 10 and 40 °C (optimum, 30-33 °C) and pH 7.0 and 7.6 (optimum, pH 7.4). The G+C content of the genomic DNA was 56.2 mol%. The strain utilized formate and hydrogen for growth and methane production. Based on comparative sequence analyses of the 16S rRNA and mcrA (encoding the alpha subunit of methyl-coenzyme M reductase, a key enzyme in the methane-producing pathway) genes, strain SMSP(T) was affiliated with group E1/E2 within the order Methanomicrobiales. The closest relative based on both 16S rRNA and mcrA gene sequences was Methanoregula boonei 6A8(T) (96.3 % 16S rRNA gene sequence similarity, 85.4 % deduced McrA amino acid sequence similarity). The percentage of 16S rRNA gene sequence similarity indicates that strain SMSP(T) and Methanoregula boonei 6A8(T) represent different species within the same genus. This is supported by our findings of shared phenotypic properties, including cell morphology and growth temperature range, and phenotypic differences in substrate usage and pH range. Based on these genetic and phenotypic properties, we propose that strain SMSP(T) represents a novel species of the genus Methanoregula, for which we propose the name Methanoregula formicica sp. nov., with the type strain SMSP(T) (=NBRC 105244(T) =DSM 22288(T)).", + "denotations" : [ { + "id" : "T1", + "span" : { + "begin" : 0, + "end" : 23 + }, + "obj" : "882104" + }, { + "id" : "T2", + "span" : { + "begin" : 141, + "end" : 148 + }, + "obj" : "882104" + }, { + "id" : "T3", + "span" : { + "begin" : 1068, + "end" : 1075 + }, + "obj" : "882104" + }, { + "id" : "T4", + "span" : { + "begin" : 1217, + "end" : 1241 + }, + "obj" : "358766" + }, { + "id" : "T5", + "span" : { + "begin" : 1415, + "end" : 1422 + }, + "obj" : "882104" + }, { + "id" : "T6", + "span" : { + "begin" : 1427, + "end" : 1451 + }, + "obj" : "358766" + }, { + "id" : "T7", + "span" : { + "begin" : 1762, + "end" : 1769 + }, + "obj" : "882104" + }, { + "id" : "T8", + "span" : { + "begin" : 1855, + "end" : 1878 + }, + "obj" : "882104" + }, { + "id" : "T9", + "span" : { + "begin" : 1910, + "end" : 1917 + }, + "obj" : "882104" + } ] +} \ No newline at end of file diff --git a/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393.json b/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393.json new file mode 100644 index 0000000000..e564c54fb6 --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/19667393.json @@ -0,0 +1 @@ +{"target":"http://pubannotation.org/docs/sourcedb/PubMed/sourceid/19667393","sourcedb":"PubMed","sourceid":"19667393","text":"Methanoregula formicica sp. nov., a methane-producing archaeon isolated from methanogenic sludge.\nA novel methane-producing archaeon, strain SMSP(T), was isolated from an anaerobic, propionate-degrading enrichment culture that was originally obtained from granular sludge in a mesophilic upflow anaerobic sludge blanket (UASB) reactor used to treat a beer brewery effluent. Cells were non-motile, blunt-ended, straight rods, 1.0-2.6 \u03bcm long by 0.5 \u03bcm wide; cells were sometimes up to 7 \u03bcm long. Asymmetrical cell division was observed in rod-shaped cells. Coccoid cells (0.5-1.0 \u03bcm in diameter) were also observed in mid- to late-exponential phase cultures. Growth was observed between 10 and 40 \u00b0C (optimum, 30-33 \u00b0C) and pH 7.0 and 7.6 (optimum, pH 7.4). The G+C content of the genomic DNA was 56.2 mol%. The strain utilized formate and hydrogen for growth and methane production. Based on comparative sequence analyses of the 16S rRNA and mcrA (encoding the alpha subunit of methyl-coenzyme M reductase, a key enzyme in the methane-producing pathway) genes, strain SMSP(T) was affiliated with group E1/E2 within the order Methanomicrobiales. The closest relative based on both 16S rRNA and mcrA gene sequences was Methanoregula boonei 6A8(T) (96.3\u200a% 16S rRNA gene sequence similarity, 85.4\u200a% deduced McrA amino acid sequence similarity). The percentage of 16S rRNA gene sequence similarity indicates that strain SMSP(T) and Methanoregula boonei 6A8(T) represent different species within the same genus. This is supported by our findings of shared phenotypic properties, including cell morphology and growth temperature range, and phenotypic differences in substrate usage and pH range. Based on these genetic and phenotypic properties, we propose that strain SMSP(T) represents a novel species of the genus Methanoregula, for which we propose the name Methanoregula formicica sp. nov., with the type strain SMSP(T) (=NBRC 105244(T) =DSM 22288(T)).","project":"SPECIES800","denotations":[{"id":"T1","span":{"begin":0,"end":23},"obj":"882104"},{"id":"T2","span":{"begin":141,"end":148},"obj":"882104"},{"id":"T3","span":{"begin":1068,"end":1075},"obj":"882104"},{"id":"T4","span":{"begin":1217,"end":1241},"obj":"358766"},{"id":"T5","span":{"begin":1415,"end":1422},"obj":"882104"},{"id":"T6","span":{"begin":1427,"end":1451},"obj":"358766"},{"id":"T7","span":{"begin":1762,"end":1769},"obj":"882104"},{"id":"T8","span":{"begin":1855,"end":1878},"obj":"882104"},{"id":"T9","span":{"begin":1910,"end":1917},"obj":"882104"}],"namespaces":[{"prefix":"_base","uri":"http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info\u0026id="}]} \ No newline at end of file diff --git a/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/README.txt b/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/README.txt new file mode 100644 index 0000000000..bcbedf5a7a --- /dev/null +++ b/dkpro-core-io-pubannotation-asl/src/test/resources/pubannotation/SPECIES800/README.txt @@ -0,0 +1,7 @@ +Source: http://pubannotation.org/projects/SPECIES800 +Authors: Evangelos Pafilis, Sune P. Frankild, Lucia Fanini, Sarah Faulwetter, Christina Pavloudi, + Aikaterini Vasileiadou, Christos Arvanitidis, Lars Juhl Jensen +License: http://creativecommons.org/licenses/by/4.0/ + +The file was renamed from "PubMed-19667393.json" to "19667393.json" to facilitate testing since +DKPro Core writers do not include the collection ID in the filename. diff --git a/dkpro-core-io-rdf-asl/pom.xml b/dkpro-core-io-rdf-asl/pom.xml index 190576bf8f..c3d114e10f 100644 --- a/dkpro-core-io-rdf-asl/pom.xml +++ b/dkpro-core-io-rdf-asl/pom.xml @@ -15,19 +15,20 @@ See the License for the specific language governing permissions and limitations under the License. --> - 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - org.dkpro.core dkpro-core-io-rdf-asl jar DKPro Core ASL - IO - RDF + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -40,12 +41,12 @@ org.apache.jena jena-core - 3.5.0 + ${jena.version} org.apache.jena jena-arq - 3.5.0 + ${jena.version} org.apache.commons @@ -56,20 +57,24 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -77,13 +82,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl compile - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.conll-asl + org.dkpro.core + dkpro-core-io-conll-asl test diff --git a/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/RdfReader.java b/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/RdfReader.java index 7d1219c438..1c607fdcdd 100644 --- a/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/RdfReader.java +++ b/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/RdfReader.java @@ -17,6 +17,8 @@ */ package org.dkpro.core.io.rdf; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + import java.io.IOException; import java.io.InputStream; @@ -36,20 +38,21 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; import org.dkpro.core.io.rdf.internal.Rdf2Uima; import org.dkpro.core.io.rdf.internal.RdfCas; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reads a CAS serialized as RDF. */ -@ResourceMetaData(name="UIMA CAS RDF Reader") +@ResourceMetaData(name = "UIMA CAS RDF Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_UIMA_RDF}) public class RdfReader extends JCasResourceCollectionReader_ImplBase @@ -63,11 +66,20 @@ public class RdfReader @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) private String posTagset; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) private String posMappingLocation; @@ -84,8 +96,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); // Seek first article try { @@ -100,10 +112,10 @@ public void initialize(UimaContext aContext) public void getNext(JCas aJCas) throws IOException, CollectionException { - try{ + try { posMappingProvider.configure(aJCas.getCas()); } - catch(AnalysisEngineProcessException e){ + catch (AnalysisEngineProcessException e) { throw new IOException(e); } @@ -119,7 +131,7 @@ public void getNext(JCas aJCas) // inFileCount++; step(); - } + } private void closeAll() { diff --git a/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/RdfWriter.java b/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/RdfWriter.java index 34ad75a12a..1d6bfdcd86 100644 --- a/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/RdfWriter.java +++ b/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/RdfWriter.java @@ -29,16 +29,18 @@ import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; import org.dkpro.core.io.rdf.internal.Uima2Rdf; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Writes the CAS out as RDF. */ -@ResourceMetaData(name="UIMA CAS RDF Writer") +@ResourceMetaData(name = "UIMA CAS RDF Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_UIMA_RDF}) public class RdfWriter extends JCasFileWriter_ImplBase @@ -49,7 +51,8 @@ public class RdfWriter * * @see RDFLanguages */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".ttl") private String filenameSuffix; diff --git a/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/internal/Uima2Rdf.java b/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/internal/Uima2Rdf.java index 38c0516e7a..e1d0e30f01 100644 --- a/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/internal/Uima2Rdf.java +++ b/dkpro-core-io-rdf-asl/src/main/java/org/dkpro/core/io/rdf/internal/Uima2Rdf.java @@ -34,9 +34,9 @@ import org.apache.uima.cas.Type; import org.apache.uima.cas.TypeSystem; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.CasAnalysisUtils; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.CasAnalysisUtils; public class Uima2Rdf { @@ -97,7 +97,8 @@ private static void convertView(JCas aJCas, OntModel aTarget) reachable.add(aJCas.getSofa()); // Set up the view itself - String viewUri = String.format("%s#%d", docuri, aJCas.getLowLevelCas().ll_getFSRef(aJCas.getSofa())); + String viewUri = String.format("%s#%d", docuri, + aJCas.getLowLevelCas().ll_getFSRef(aJCas.getSofa())); Individual rdfView = m.createIndividual(viewUri, tView); for (FeatureStructure uimaFS : reachable) { diff --git a/dkpro-core-io-rdf-asl/src/test/java/org/dkpro/core/io/rdf/RdfWriterTest.java b/dkpro-core-io-rdf-asl/src/test/java/org/dkpro/core/io/rdf/RdfWriterTest.java index cb81dcf924..5c07a8ba28 100644 --- a/dkpro-core-io-rdf-asl/src/test/java/org/dkpro/core/io/rdf/RdfWriterTest.java +++ b/dkpro-core-io-rdf-asl/src/test/java/org/dkpro/core/io/rdf/RdfWriterTest.java @@ -17,8 +17,8 @@ */ package org.dkpro.core.io.rdf; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; import static org.junit.Assert.assertEquals; import java.io.File; @@ -28,15 +28,14 @@ import org.apache.commons.lang3.StringUtils; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.ModelFactory; +import org.dkpro.core.io.conll.Conll2006Reader; +import org.dkpro.core.io.conll.Conll2006Writer; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestOptions; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Reader; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Writer; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestOptions; - public class RdfWriterTest { @Test diff --git a/dkpro-core-io-rdf-asl/src/test/resources/conll/2006/fi-ref.ttl b/dkpro-core-io-rdf-asl/src/test/resources/conll/2006/fi-ref.ttl index 5127fb672b..c5b416b85c 100644 --- a/dkpro-core-io-rdf-asl/src/test/resources/conll/2006/fi-ref.ttl +++ b/dkpro-core-io-rdf-asl/src/test/resources/conll/2006/fi-ref.ttl @@ -19,1742 +19,1742 @@ @prefix segmentation: . @prefix ner: . - + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "Punct" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "143"^^xsd:int ; + tcas:Annotation-end "144"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "58"^^xsd:int ; - tcas:Annotation-end "78"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "233"^^xsd:int ; + tcas:Annotation-end "245"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "V" ; + lexmorph-pos:POS-coarseValue "V" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "95"^^xsd:int ; + tcas:Annotation-end "100"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "54" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "131"^^xsd:int ; - tcas:Annotation-end "133"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "Eurooppa" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "49"^^xsd:int ; + tcas:Annotation-end "57"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:ROOT ; - rdfcas:indexedIn ; - syntax-dependency:Dependency-DependencyType - "main" ; - syntax-dependency:Dependency-Dependent - ; - syntax-dependency:Dependency-Governor - ; - syntax-dependency:Dependency-flavor - "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "79"^^xsd:int ; - tcas:Annotation-end "87"^^xsd:int . + + a cas:Sofa , rdfcas:View ; + cas:Sofa-mimeType "text" ; + cas:Sofa-sofaID "_InitialView" ; + cas:Sofa-sofaNum "1"^^xsd:int ; + cas:Sofa-sofaString "NEUVOSTO EURATOMIN HANKINTAKESKUKSEN PERUSSÄÄNTÖ EUROOPAN ATOMIENERGIAYHTEISÖN NEUVOSTO , joka ottaa huomioon perustamissopimuksen 54 artiklan , ja ottaa huomioon komission ehdotuksen , ON PÄÄTTÄNYT antaa Euratomin hankintakeskuksen perussäännön seuraavasti :\n1 artikla Nimi ja tarkoitus\n" . + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "270"^^xsd:int ; + tcas:Annotation-end "274"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "37"^^xsd:int ; - tcas:Annotation-end "48"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "189"^^xsd:int ; + tcas:Annotation-end "198"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "attr" ; + "subj" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "58"^^xsd:int ; - tcas:Annotation-end "78"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "163"^^xsd:int ; + tcas:Annotation-end "172"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value - "CC" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "275"^^xsd:int ; - tcas:Annotation-end "277"^^xsd:int . + "V Prs Act Sg3" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "148"^^xsd:int ; + tcas:Annotation-end "153"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "258"^^xsd:int ; + tcas:Annotation-end "259"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "110"^^xsd:int ; + tcas:Annotation-end "130"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; segmentation:Lemma-value "neuvosto" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "79"^^xsd:int ; tcas:Annotation-end "87"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "huomio" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "101"^^xsd:int ; - tcas:Annotation-end "109"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "Punct" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "258"^^xsd:int ; - tcas:Annotation-end "259"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "262"^^xsd:int ; - tcas:Annotation-end "269"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "278"^^xsd:int ; + tcas:Annotation-end "287"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "Eurooppa" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "49"^^xsd:int ; - tcas:Annotation-end "57"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "205"^^xsd:int ; + tcas:Annotation-end "214"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "joka" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "90"^^xsd:int ; - tcas:Annotation-end "94"^^xsd:int . + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; + syntax-dependency:Dependency-DependencyType + "obj" ; + syntax-dependency:Dependency-Dependent + ; + syntax-dependency:Dependency-Governor + ; + syntax-dependency:Dependency-flavor + "basic" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "233"^^xsd:int ; + tcas:Annotation-end "245"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value "N Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "233"^^xsd:int ; - tcas:Annotation-end "245"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "163"^^xsd:int ; + tcas:Annotation-end "172"^^xsd:int . - + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "subj" ; + "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "90"^^xsd:int ; - tcas:Annotation-end "94"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "hankinta#keskus" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "19"^^xsd:int ; - tcas:Annotation-end "36"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "49"^^xsd:int ; + tcas:Annotation-end "57"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "0"^^xsd:int ; - tcas:Annotation-end "8"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "134"^^xsd:int ; + tcas:Annotation-end "142"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value "N Prop Gen Sg" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "205"^^xsd:int ; tcas:Annotation-end "214"^^xsd:int . - + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "joka" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "90"^^xsd:int ; + tcas:Annotation-end "94"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "37"^^xsd:int ; + tcas:Annotation-end "48"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "artikla" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "262"^^xsd:int ; + tcas:Annotation-end "269"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value "Punct" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "184"^^xsd:int ; tcas:Annotation-end "185"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "mod" ; + "obj" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "95"^^xsd:int ; - tcas:Annotation-end "100"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "134"^^xsd:int ; + tcas:Annotation-end "142"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "PrfPrc Act Pos Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "189"^^xsd:int ; - tcas:Annotation-end "198"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "CC" ; + lexmorph-pos:POS-coarseValue "CC" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "145"^^xsd:int ; + tcas:Annotation-end "147"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value "N Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "163"^^xsd:int ; - tcas:Annotation-end "172"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "233"^^xsd:int ; + tcas:Annotation-end "245"^^xsd:int . - + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "huomio" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "101"^^xsd:int ; + tcas:Annotation-end "109"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "9"^^xsd:int ; - tcas:Annotation-end "18"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "58"^^xsd:int ; + tcas:Annotation-end "78"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "0"^^xsd:int ; + tcas:Annotation-end "8"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "ja" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "275"^^xsd:int ; + tcas:Annotation-end "277"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value - "V Prs Act Sg3" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "148"^^xsd:int ; - tcas:Annotation-end "153"^^xsd:int . + "PrfPrc Act Pos Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "189"^^xsd:int ; + tcas:Annotation-end "198"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "phrv" ; + "aux" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "101"^^xsd:int ; - tcas:Annotation-end "109"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "186"^^xsd:int ; + tcas:Annotation-end "188"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Ill Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "101"^^xsd:int ; - tcas:Annotation-end "109"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "154"^^xsd:int ; + tcas:Annotation-end "162"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value "Punct" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "143"^^xsd:int ; - tcas:Annotation-end "144"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "258"^^xsd:int ; + tcas:Annotation-end "259"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "neuvosto" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "0"^^xsd:int ; - tcas:Annotation-end "8"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "54" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "131"^^xsd:int ; + tcas:Annotation-end "133"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "Pron Rel Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "90"^^xsd:int ; - tcas:Annotation-end "94"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "88"^^xsd:int ; + tcas:Annotation-end "89"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value - "Num Digit Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "131"^^xsd:int ; - tcas:Annotation-end "133"^^xsd:int . + "N Prop Gen Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "9"^^xsd:int ; + tcas:Annotation-end "18"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "110"^^xsd:int ; - tcas:Annotation-end "130"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "262"^^xsd:int ; + tcas:Annotation-end "269"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "79"^^xsd:int ; - tcas:Annotation-end "87"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "260"^^xsd:int ; + tcas:Annotation-end "261"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Prop Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "49"^^xsd:int ; - tcas:Annotation-end "57"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "173"^^xsd:int ; + tcas:Annotation-end "183"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "attr" ; + "subj" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "260"^^xsd:int ; - tcas:Annotation-end "261"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "90"^^xsd:int ; + tcas:Annotation-end "94"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; - syntax-dependency:Dependency-DependencyType - "attr" ; - syntax-dependency:Dependency-Dependent - ; - syntax-dependency:Dependency-Governor - ; - syntax-dependency:Dependency-flavor - "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "131"^^xsd:int ; - tcas:Annotation-end "133"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "," ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "143"^^xsd:int ; + tcas:Annotation-end "144"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "278"^^xsd:int ; - tcas:Annotation-end "287"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "215"^^xsd:int ; + tcas:Annotation-end "232"^^xsd:int . - + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "95"^^xsd:int ; + tcas:Annotation-end "100"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value - "N Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "19"^^xsd:int ; - tcas:Annotation-end "36"^^xsd:int . + "N Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "37"^^xsd:int ; + tcas:Annotation-end "48"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "275"^^xsd:int ; - tcas:Annotation-end "277"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "270"^^xsd:int ; + tcas:Annotation-end "274"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "270"^^xsd:int ; - tcas:Annotation-end "274"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "258"^^xsd:int ; - tcas:Annotation-end "259"^^xsd:int . + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "V" ; + lexmorph-pos:POS-coarseValue "V" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "186"^^xsd:int ; + tcas:Annotation-end "188"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "attr" ; + "conjunct" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "262"^^xsd:int ; - tcas:Annotation-end "269"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "148"^^xsd:int ; + tcas:Annotation-end "153"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "262"^^xsd:int ; - tcas:Annotation-end "269"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "ottaa" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "148"^^xsd:int ; + tcas:Annotation-end "153"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "Adv" ; lexmorph-pos:POS-coarseValue "Adv" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "246"^^xsd:int ; tcas:Annotation-end "257"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; - syntax-dependency:Dependency-DependencyType - "obj" ; - syntax-dependency:Dependency-Dependent - ; - syntax-dependency:Dependency-Governor - ; - syntax-dependency:Dependency-flavor - "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "134"^^xsd:int ; - tcas:Annotation-end "142"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "110"^^xsd:int ; + tcas:Annotation-end "130"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "173"^^xsd:int ; - tcas:Annotation-end "183"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Gen Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "58"^^xsd:int ; + tcas:Annotation-end "78"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "Num" ; - lexmorph-pos:POS-coarseValue "Num" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "260"^^xsd:int ; - tcas:Annotation-end "261"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "Euratom" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "9"^^xsd:int ; + tcas:Annotation-end "18"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "233"^^xsd:int ; - tcas:Annotation-end "245"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "278"^^xsd:int ; + tcas:Annotation-end "287"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "V" ; lexmorph-pos:POS-coarseValue "V" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "186"^^xsd:int ; - tcas:Annotation-end "188"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "215"^^xsd:int ; - tcas:Annotation-end "232"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "154"^^xsd:int ; - tcas:Annotation-end "162"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "205"^^xsd:int ; - tcas:Annotation-end "214"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "199"^^xsd:int ; + tcas:Annotation-end "204"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:ROOT ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "main" ; + "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "270"^^xsd:int ; - tcas:Annotation-end "274"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "205"^^xsd:int ; + tcas:Annotation-end "214"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "komissio" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "163"^^xsd:int ; + tcas:Annotation-end "172"^^xsd:int . - + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "phrm" ; + "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "143"^^xsd:int ; - tcas:Annotation-end "144"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "19"^^xsd:int ; + tcas:Annotation-end "36"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "184"^^xsd:int ; - tcas:Annotation-end "185"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "V" ; - lexmorph-pos:POS-coarseValue "V" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "199"^^xsd:int ; - tcas:Annotation-end "204"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "CC" ; - lexmorph-pos:POS-coarseValue "CC" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "145"^^xsd:int ; - tcas:Annotation-end "147"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "134"^^xsd:int ; + tcas:Annotation-end "142"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "tarkoitus" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "278"^^xsd:int ; - tcas:Annotation-end "287"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "189"^^xsd:int ; - tcas:Annotation-end "198"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "163"^^xsd:int ; - tcas:Annotation-end "172"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "Euratom" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "205"^^xsd:int ; + tcas:Annotation-end "214"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value - "N Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "0"^^xsd:int ; - tcas:Annotation-end "8"^^xsd:int . + "Punct" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "88"^^xsd:int ; + tcas:Annotation-end "89"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "134"^^xsd:int ; - tcas:Annotation-end "142"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "nimi" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "270"^^xsd:int ; - tcas:Annotation-end "274"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "19"^^xsd:int ; + tcas:Annotation-end "36"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "phrm" ; + "conjunct" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "275"^^xsd:int ; - tcas:Annotation-end "277"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "278"^^xsd:int ; + tcas:Annotation-end "287"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "Num Digit Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "260"^^xsd:int ; + tcas:Annotation-end "261"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "," ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "184"^^xsd:int ; + tcas:Annotation-end "185"^^xsd:int . + + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "phrm" ; + "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "145"^^xsd:int ; - tcas:Annotation-end "147"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "seuraava" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "246"^^xsd:int ; - tcas:Annotation-end "257"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "110"^^xsd:int ; + tcas:Annotation-end "130"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "148"^^xsd:int ; - tcas:Annotation-end "153"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "145"^^xsd:int ; + tcas:Annotation-end "147"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "ehdotus" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "173"^^xsd:int ; - tcas:Annotation-end "183"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "perus#sääntö" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "233"^^xsd:int ; + tcas:Annotation-end "245"^^xsd:int . - + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "V Prs Act Sg3" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "95"^^xsd:int ; + tcas:Annotation-end "100"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "110"^^xsd:int ; - tcas:Annotation-end "130"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "1" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "260"^^xsd:int ; - tcas:Annotation-end "261"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "49"^^xsd:int ; + tcas:Annotation-end "57"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "143"^^xsd:int ; - tcas:Annotation-end "144"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "hankinta#keskus" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "215"^^xsd:int ; - tcas:Annotation-end "232"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "0"^^xsd:int ; + tcas:Annotation-end "8"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "olla" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "186"^^xsd:int ; - tcas:Annotation-end "188"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "270"^^xsd:int ; + tcas:Annotation-end "274"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "huomio" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "154"^^xsd:int ; - tcas:Annotation-end "162"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "V" ; - lexmorph-pos:POS-coarseValue "V" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "95"^^xsd:int ; - tcas:Annotation-end "100"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "päättää" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "189"^^xsd:int ; + tcas:Annotation-end "198"^^xsd:int . - + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "conjunct" ; + "obj" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "173"^^xsd:int ; + tcas:Annotation-end "183"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "154"^^xsd:int ; + tcas:Annotation-end "162"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value ":" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "258"^^xsd:int ; + tcas:Annotation-end "259"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Gen Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "110"^^xsd:int ; + tcas:Annotation-end "130"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "79"^^xsd:int ; + tcas:Annotation-end "87"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Nom Sg" ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "278"^^xsd:int ; tcas:Annotation-end "287"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "conjunct" ; + "advl" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "148"^^xsd:int ; - tcas:Annotation-end "153"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "246"^^xsd:int ; + tcas:Annotation-end "257"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "131"^^xsd:int ; - tcas:Annotation-end "133"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "antaa" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "199"^^xsd:int ; - tcas:Annotation-end "204"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "173"^^xsd:int ; + tcas:Annotation-end "183"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "ja" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "145"^^xsd:int ; - tcas:Annotation-end "147"^^xsd:int . + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; + syntax-dependency:Dependency-DependencyType + "attr" ; + syntax-dependency:Dependency-Dependent + ; + syntax-dependency:Dependency-Governor + ; + syntax-dependency:Dependency-flavor + "basic" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "58"^^xsd:int ; + tcas:Annotation-end "78"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "Punct" ; - lexmorph-pos:POS-coarseValue "Punct" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "88"^^xsd:int ; - tcas:Annotation-end "89"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Gen Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "134"^^xsd:int ; + tcas:Annotation-end "142"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "101"^^xsd:int ; - tcas:Annotation-end "109"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "215"^^xsd:int ; + tcas:Annotation-end "232"^^xsd:int . - + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "Pron" ; + lexmorph-pos:POS-coarseValue "Pron" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "90"^^xsd:int ; + tcas:Annotation-end "94"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "artikla" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "134"^^xsd:int ; - tcas:Annotation-end "142"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "perus#sääntö" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "37"^^xsd:int ; + tcas:Annotation-end "48"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "58"^^xsd:int ; - tcas:Annotation-end "78"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "262"^^xsd:int ; + tcas:Annotation-end "269"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Sentence ; - rdfcas:indexedIn ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "260"^^xsd:int ; - tcas:Annotation-end "287"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "186"^^xsd:int ; + tcas:Annotation-end "188"^^xsd:int . - + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "phrv" ; + "phrm" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "154"^^xsd:int ; - tcas:Annotation-end "162"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "143"^^xsd:int ; + tcas:Annotation-end "144"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "49"^^xsd:int ; - tcas:Annotation-end "57"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "CC" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "145"^^xsd:int ; + tcas:Annotation-end "147"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "90"^^xsd:int ; - tcas:Annotation-end "94"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "perustamis#sopimus" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "110"^^xsd:int ; - tcas:Annotation-end "130"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "246"^^xsd:int ; + tcas:Annotation-end "257"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "37"^^xsd:int ; - tcas:Annotation-end "48"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "101"^^xsd:int ; + tcas:Annotation-end "109"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "19"^^xsd:int ; - tcas:Annotation-end "36"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "atomi#energia#yhteisö" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "58"^^xsd:int ; + tcas:Annotation-end "78"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "79"^^xsd:int ; - tcas:Annotation-end "87"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "0"^^xsd:int ; + tcas:Annotation-end "8"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "ottaa" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "95"^^xsd:int ; - tcas:Annotation-end "100"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "CC" ; + lexmorph-pos:POS-coarseValue "CC" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "275"^^xsd:int ; + tcas:Annotation-end "277"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "199"^^xsd:int ; + tcas:Annotation-end "204"^^xsd:int . + + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "subj" ; + "conjunct" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "163"^^xsd:int ; - tcas:Annotation-end "172"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "189"^^xsd:int ; + tcas:Annotation-end "198"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value - "N Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "278"^^xsd:int ; - tcas:Annotation-end "287"^^xsd:int . + "N Ill Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "154"^^xsd:int ; + tcas:Annotation-end "162"^^xsd:int . + + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; + syntax-dependency:Dependency-DependencyType + "attr" ; + syntax-dependency:Dependency-Dependent + ; + syntax-dependency:Dependency-Governor + ; + syntax-dependency:Dependency-flavor + "basic" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "0"^^xsd:int ; + tcas:Annotation-end "8"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "Num" ; + lexmorph-pos:POS-coarseValue "Num" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "131"^^xsd:int ; + tcas:Annotation-end "133"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; segmentation:Lemma-value "," ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "88"^^xsd:int ; tcas:Annotation-end "89"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "19"^^xsd:int ; + tcas:Annotation-end "36"^^xsd:int . + + + a syntax-dependency:ROOT , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "phrm" ; + "main" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "184"^^xsd:int ; - tcas:Annotation-end "185"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Nom Sg" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "270"^^xsd:int ; tcas:Annotation-end "274"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "atomi#energia#yhteisö" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "58"^^xsd:int ; - tcas:Annotation-end "78"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "1" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "260"^^xsd:int ; + tcas:Annotation-end "261"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Gen Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "173"^^xsd:int ; + tcas:Annotation-end "183"^^xsd:int . - + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "obj" ; + "mod" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "173"^^xsd:int ; - tcas:Annotation-end "183"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "95"^^xsd:int ; + tcas:Annotation-end "100"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "V Inf1 Lat" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "199"^^xsd:int ; - tcas:Annotation-end "204"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "Punct" ; + lexmorph-pos:POS-coarseValue "Punct" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "143"^^xsd:int ; + tcas:Annotation-end "144"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "Adv Pos Man" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "246"^^xsd:int ; - tcas:Annotation-end "257"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "173"^^xsd:int ; - tcas:Annotation-end "183"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "Num Digit Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "260"^^xsd:int ; - tcas:Annotation-end "261"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "perus#sääntö" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "37"^^xsd:int ; - tcas:Annotation-end "48"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "V Prs Act Sg3" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "186"^^xsd:int ; - tcas:Annotation-end "188"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "9"^^xsd:int ; - tcas:Annotation-end "18"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value "N Gen Sg" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "215"^^xsd:int ; tcas:Annotation-end "232"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Ill Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "154"^^xsd:int ; - tcas:Annotation-end "162"^^xsd:int . - - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; - syntax-dependency:Dependency-DependencyType - "aux" ; - syntax-dependency:Dependency-Dependent - ; - syntax-dependency:Dependency-Governor - ; - syntax-dependency:Dependency-flavor - "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "186"^^xsd:int ; - tcas:Annotation-end "188"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "ottaa" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "95"^^xsd:int ; + tcas:Annotation-end "100"^^xsd:int . a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "0"^^xsd:int ; - tcas:Annotation-end "8"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "49"^^xsd:int ; + tcas:Annotation-end "57"^^xsd:int . a rdfcas:FeatureStructure , metadata:DocumentMetaData ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; metadata:DocumentMetaData-documentId "fi-orig.conll" ; metadata:DocumentMetaData-documentTitle "fi-orig.conll" ; metadata:DocumentMetaData-isLastSegment false ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "0"^^xsd:int ; tcas:Annotation-end "288"^^xsd:int ; tcas:DocumentAnnotation-language "x-unspecified" . - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "CC" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "145"^^xsd:int ; - tcas:Annotation-end "147"^^xsd:int . - - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; - syntax-dependency:Dependency-DependencyType - "conjunct" ; - syntax-dependency:Dependency-Dependent - ; - syntax-dependency:Dependency-Governor - ; - syntax-dependency:Dependency-flavor - "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "189"^^xsd:int ; - tcas:Annotation-end "198"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "134"^^xsd:int ; - tcas:Annotation-end "142"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "110"^^xsd:int ; - tcas:Annotation-end "130"^^xsd:int . - - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "Euratom" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "9"^^xsd:int ; - tcas:Annotation-end "18"^^xsd:int . - - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; - syntax-dependency:Dependency-DependencyType - "obj" ; - syntax-dependency:Dependency-Dependent - ; - syntax-dependency:Dependency-Governor - ; - syntax-dependency:Dependency-flavor - "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "199"^^xsd:int ; - tcas:Annotation-end "204"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "nimi" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "270"^^xsd:int ; + tcas:Annotation-end "274"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value "V Prs Act Sg3" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "95"^^xsd:int ; - tcas:Annotation-end "100"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "Punct" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "88"^^xsd:int ; - tcas:Annotation-end "89"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "CC" ; - lexmorph-pos:POS-coarseValue "CC" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "275"^^xsd:int ; - tcas:Annotation-end "277"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "186"^^xsd:int ; + tcas:Annotation-end "188"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "attr" ; + "phrv" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "205"^^xsd:int ; - tcas:Annotation-end "214"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; - lexmorph-morph:MorphologicalFeatures-value - "N Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "58"^^xsd:int ; - tcas:Annotation-end "78"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "Punct" ; - lexmorph-pos:POS-coarseValue "Punct" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "258"^^xsd:int ; - tcas:Annotation-end "259"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "154"^^xsd:int ; + tcas:Annotation-end "162"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "262"^^xsd:int ; - tcas:Annotation-end "269"^^xsd:int . + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "V" ; + lexmorph-pos:POS-coarseValue "V" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "148"^^xsd:int ; + tcas:Annotation-end "153"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value - "N Nom Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "37"^^xsd:int ; - tcas:Annotation-end "48"^^xsd:int . + "Adv Pos Man" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "246"^^xsd:int ; + tcas:Annotation-end "257"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "perustamis#sopimus" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "110"^^xsd:int ; + tcas:Annotation-end "130"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "79"^^xsd:int ; + tcas:Annotation-end "87"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "233"^^xsd:int ; - tcas:Annotation-end "245"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "9"^^xsd:int ; + tcas:Annotation-end "18"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "tarkoitus" ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "278"^^xsd:int ; tcas:Annotation-end "287"^^xsd:int . - - a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "V Inf1 Lat" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "199"^^xsd:int ; + tcas:Annotation-end "204"^^xsd:int . + + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "215"^^xsd:int ; tcas:Annotation-end "232"^^xsd:int . - + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "163"^^xsd:int ; + tcas:Annotation-end "172"^^xsd:int . + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "0"^^xsd:int ; - tcas:Annotation-end "8"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "37"^^xsd:int ; + tcas:Annotation-end "48"^^xsd:int . - + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "artikla" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "134"^^xsd:int ; + tcas:Annotation-end "142"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; + cas:AnnotationBase-sofa ; tcas:Annotation-begin "205"^^xsd:int ; tcas:Annotation-end "214"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "270"^^xsd:int ; - tcas:Annotation-end "274"^^xsd:int . - - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "Punct" ; - lexmorph-pos:POS-coarseValue "Punct" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "184"^^xsd:int ; - tcas:Annotation-end "185"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "90"^^xsd:int ; + tcas:Annotation-end "94"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "246"^^xsd:int ; - tcas:Annotation-end "257"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Gen Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "19"^^xsd:int ; + tcas:Annotation-end "36"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "PrfPrc" ; - lexmorph-pos:POS-coarseValue "PrfPrc" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "189"^^xsd:int ; - tcas:Annotation-end "198"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Sentence ; + rdfcas:indexedIn ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "260"^^xsd:int ; + tcas:Annotation-end "287"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "260"^^xsd:int ; - tcas:Annotation-end "261"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "262"^^xsd:int ; + tcas:Annotation-end "269"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "163"^^xsd:int ; - tcas:Annotation-end "172"^^xsd:int . + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "Punct" ; + lexmorph-pos:POS-coarseValue "Punct" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "184"^^xsd:int ; + tcas:Annotation-end "185"^^xsd:int . - + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "obj" ; + "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "233"^^xsd:int ; - tcas:Annotation-end "245"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "131"^^xsd:int ; + tcas:Annotation-end "133"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; segmentation:Lemma-value "ja" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "275"^^xsd:int ; - tcas:Annotation-end "277"^^xsd:int . - - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; - syntax-dependency:Dependency-DependencyType - "attr" ; - syntax-dependency:Dependency-Dependent - ; - syntax-dependency:Dependency-Governor - ; - syntax-dependency:Dependency-flavor - "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "9"^^xsd:int ; - tcas:Annotation-end "18"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "145"^^xsd:int ; + tcas:Annotation-end "147"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "215"^^xsd:int ; - tcas:Annotation-end "232"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "154"^^xsd:int ; - tcas:Annotation-end "162"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value ":" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "258"^^xsd:int ; - tcas:Annotation-end "259"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "186"^^xsd:int ; - tcas:Annotation-end "188"^^xsd:int . - - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "V" ; - lexmorph-pos:POS-coarseValue "V" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "148"^^xsd:int ; - tcas:Annotation-end "153"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "artikla" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "262"^^xsd:int ; - tcas:Annotation-end "269"^^xsd:int . + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "N" ; + lexmorph-pos:POS-coarseValue "N" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "233"^^xsd:int ; + tcas:Annotation-end "245"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "199"^^xsd:int ; - tcas:Annotation-end "204"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "101"^^xsd:int ; + tcas:Annotation-end "109"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Prop Gen Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "49"^^xsd:int ; + tcas:Annotation-end "57"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "perus#sääntö" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "233"^^xsd:int ; - tcas:Annotation-end "245"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "neuvosto" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "0"^^xsd:int ; + tcas:Annotation-end "8"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "145"^^xsd:int ; - tcas:Annotation-end "147"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "275"^^xsd:int ; + tcas:Annotation-end "277"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "173"^^xsd:int ; - tcas:Annotation-end "183"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "PrfPrc" ; + lexmorph-pos:POS-coarseValue "PrfPrc" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "189"^^xsd:int ; + tcas:Annotation-end "198"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "advl" ; + "phrm" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "246"^^xsd:int ; - tcas:Annotation-end "257"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "184"^^xsd:int ; + tcas:Annotation-end "185"^^xsd:int . - + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "huomio" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "154"^^xsd:int ; + tcas:Annotation-end "162"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "Punct" ; lexmorph-pos:POS-coarseValue "Punct" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "143"^^xsd:int ; - tcas:Annotation-end "144"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "258"^^xsd:int ; + tcas:Annotation-end "259"^^xsd:int . - + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "131"^^xsd:int ; + tcas:Annotation-end "133"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-morph:MorphologicalFeatures-value - "N Prop Gen Sg" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "9"^^xsd:int ; - tcas:Annotation-end "18"^^xsd:int . + "N Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "79"^^xsd:int ; + tcas:Annotation-end "87"^^xsd:int . - + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "19"^^xsd:int ; - tcas:Annotation-end "36"^^xsd:int . - - - a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "Euratom" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "205"^^xsd:int ; - tcas:Annotation-end "214"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "260"^^xsd:int ; + tcas:Annotation-end "261"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "134"^^xsd:int ; - tcas:Annotation-end "142"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Sentence ; + rdfcas:indexedIn ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "0"^^xsd:int ; + tcas:Annotation-end "259"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "," ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "184"^^xsd:int ; - tcas:Annotation-end "185"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "ehdotus" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "173"^^xsd:int ; + tcas:Annotation-end "183"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "Pron" ; - lexmorph-pos:POS-coarseValue "Pron" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "90"^^xsd:int ; - tcas:Annotation-end "94"^^xsd:int . + + a syntax-dependency:ROOT , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; + syntax-dependency:Dependency-DependencyType + "main" ; + syntax-dependency:Dependency-Dependent + ; + syntax-dependency:Dependency-Governor + ; + syntax-dependency:Dependency-flavor + "basic" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "79"^^xsd:int ; + tcas:Annotation-end "87"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "Num" ; - lexmorph-pos:POS-coarseValue "Num" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "131"^^xsd:int ; - tcas:Annotation-end "133"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "143"^^xsd:int ; + tcas:Annotation-end "144"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "päättää" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "189"^^xsd:int ; - tcas:Annotation-end "198"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "hankinta#keskus" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "215"^^xsd:int ; + tcas:Annotation-end "232"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "110"^^xsd:int ; - tcas:Annotation-end "130"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "Pron Rel Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "90"^^xsd:int ; + tcas:Annotation-end "94"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "79"^^xsd:int ; - tcas:Annotation-end "87"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "37"^^xsd:int ; + tcas:Annotation-end "48"^^xsd:int . - - a rdfcas:FeatureStructure , segmentation:Sentence ; - rdfcas:indexedIn ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "0"^^xsd:int ; - tcas:Annotation-end "259"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "262"^^xsd:int ; + tcas:Annotation-end "269"^^xsd:int . - - a rdfcas:FeatureStructure , syntax-dependency:Dependency ; - rdfcas:indexedIn ; + + a rdfcas:FeatureStructure , segmentation:Lemma ; + rdfcas:indexedIn ; + segmentation:Lemma-value "olla" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "186"^^xsd:int ; + tcas:Annotation-end "188"^^xsd:int . + + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType - "attr" ; + "phrm" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "37"^^xsd:int ; - tcas:Annotation-end "48"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "145"^^xsd:int ; + tcas:Annotation-end "147"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "148"^^xsd:int ; + tcas:Annotation-end "153"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "komissio" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "163"^^xsd:int ; - tcas:Annotation-end "172"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "seuraava" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "246"^^xsd:int ; + tcas:Annotation-end "257"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "N Ill Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "101"^^xsd:int ; + tcas:Annotation-end "109"^^xsd:int . - + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; lexmorph-pos:POS-PosValue "N" ; lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "101"^^xsd:int ; - tcas:Annotation-end "109"^^xsd:int . - - - a cas:Sofa , rdfcas:View ; - cas:Sofa-mimeType "text" ; - cas:Sofa-sofaID "_InitialView" ; - cas:Sofa-sofaNum "1"^^xsd:int ; - cas:Sofa-sofaString "NEUVOSTO EURATOMIN HANKINTAKESKUKSEN PERUSSÄÄNTÖ EUROOPAN ATOMIENERGIAYHTEISÖN NEUVOSTO , joka ottaa huomioon perustamissopimuksen 54 artiklan , ja ottaa huomioon komission ehdotuksen , ON PÄÄTTÄNYT antaa Euratomin hankintakeskuksen perussäännön seuraavasti :\n1 artikla Nimi ja tarkoitus\n" . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "58"^^xsd:int ; + tcas:Annotation-end "78"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "95"^^xsd:int ; - tcas:Annotation-end "100"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "9"^^xsd:int ; + tcas:Annotation-end "18"^^xsd:int . - - a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "49"^^xsd:int ; - tcas:Annotation-end "57"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "CC" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "275"^^xsd:int ; + tcas:Annotation-end "277"^^xsd:int . - + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "ottaa" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "148"^^xsd:int ; - tcas:Annotation-end "153"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "antaa" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "199"^^xsd:int ; + tcas:Annotation-end "204"^^xsd:int . - + + a rdfcas:FeatureStructure , syntax-dependency:Dependency ; + rdfcas:indexedIn ; + syntax-dependency:Dependency-DependencyType + "obj" ; + syntax-dependency:Dependency-Dependent + ; + syntax-dependency:Dependency-Governor + ; + syntax-dependency:Dependency-flavor + "basic" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "199"^^xsd:int ; + tcas:Annotation-end "204"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Token ; - rdfcas:indexedIn ; - segmentation:Token-lemma ; - segmentation:Token-morph ; - segmentation:Token-pos ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "88"^^xsd:int ; - tcas:Annotation-end "89"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "163"^^xsd:int ; + tcas:Annotation-end "172"^^xsd:int . - + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; - rdfcas:indexedIn ; + rdfcas:indexedIn ; syntax-dependency:Dependency-DependencyType "attr" ; syntax-dependency:Dependency-Dependent - ; + ; syntax-dependency:Dependency-Governor - ; + ; syntax-dependency:Dependency-flavor "basic" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "49"^^xsd:int ; - tcas:Annotation-end "57"^^xsd:int . + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "9"^^xsd:int ; + tcas:Annotation-end "18"^^xsd:int . - + + a rdfcas:FeatureStructure , lexmorph-morph:MorphologicalFeatures ; + rdfcas:indexedIn ; + lexmorph-morph:MorphologicalFeatures-value + "Num Digit Nom Sg" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "131"^^xsd:int ; + tcas:Annotation-end "133"^^xsd:int . + + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "Punct" ; + lexmorph-pos:POS-coarseValue "Punct" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "88"^^xsd:int ; + tcas:Annotation-end "89"^^xsd:int . + + a rdfcas:FeatureStructure , segmentation:Lemma ; - rdfcas:indexedIn ; - segmentation:Lemma-value "," ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "143"^^xsd:int ; - tcas:Annotation-end "144"^^xsd:int . + rdfcas:indexedIn ; + segmentation:Lemma-value "hankinta#keskus" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "19"^^xsd:int ; + tcas:Annotation-end "36"^^xsd:int . - + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; + syntax-dependency:Dependency-DependencyType + "phrm" ; + syntax-dependency:Dependency-Dependent + ; + syntax-dependency:Dependency-Governor + ; + syntax-dependency:Dependency-flavor + "basic" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "275"^^xsd:int ; + tcas:Annotation-end "277"^^xsd:int . + + a rdfcas:FeatureStructure , lexmorph-pos:POS ; - rdfcas:indexedIn ; - lexmorph-pos:POS-PosValue "N" ; - lexmorph-pos:POS-coarseValue "N" ; - cas:AnnotationBase-sofa ; - tcas:Annotation-begin "19"^^xsd:int ; - tcas:Annotation-end "36"^^xsd:int . + rdfcas:indexedIn ; + lexmorph-pos:POS-PosValue "Num" ; + lexmorph-pos:POS-coarseValue "Num" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "260"^^xsd:int ; + tcas:Annotation-end "261"^^xsd:int . + + + a rdfcas:FeatureStructure , segmentation:Token ; + rdfcas:indexedIn ; + segmentation:Token-lemma ; + segmentation:Token-morph ; + segmentation:Token-pos ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "184"^^xsd:int ; + tcas:Annotation-end "185"^^xsd:int . + + + a syntax-dependency:Dependency , rdfcas:FeatureStructure ; + rdfcas:indexedIn ; + syntax-dependency:Dependency-DependencyType + "phrv" ; + syntax-dependency:Dependency-Dependent + ; + syntax-dependency:Dependency-Governor + ; + syntax-dependency:Dependency-flavor + "basic" ; + cas:AnnotationBase-sofa ; + tcas:Annotation-begin "101"^^xsd:int ; + tcas:Annotation-end "109"^^xsd:int . diff --git a/dkpro-core-io-rdf-asl/src/test/resources/log4j.properties b/dkpro-core-io-rdf-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-rdf-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-rdf-asl/src/test/resources/log4j2.xml b/dkpro-core-io-rdf-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-rdf-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-reuters-asl/pom.xml b/dkpro-core-io-reuters-asl/pom.xml index 6b63a44100..63c467c1f4 100644 --- a/dkpro-core-io-reuters-asl/pom.xml +++ b/dkpro-core-io-reuters-asl/pom.xml @@ -17,17 +17,16 @@ --> - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl 4.0.0 - - de.tudarmstadt.ukp.dkpro.core.io.reuters-asl + dkpro-core-io-reuters-asl jar DKPro Core ASL - IO - Reuters-21578 - + https://dkpro.github.io/dkpro-core/ commons-logging @@ -46,16 +45,20 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -76,7 +79,7 @@ - do require it as a compile dependency and also at runtime, so we - cannot set it to scope provided. Need to tell Maven to ignore it here. --> - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core:dkpro-core-api-parameter-asl diff --git a/dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ExtractReuters.java b/dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/ExtractReuters.java similarity index 93% rename from dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ExtractReuters.java rename to dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/ExtractReuters.java index f02b40373d..45362d44eb 100644 --- a/dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ExtractReuters.java +++ b/dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/ExtractReuters.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.reuters; +package org.dkpro.core.io.reuters; import java.io.BufferedReader; import java.io.IOException; @@ -25,7 +25,11 @@ import java.nio.file.Files; import java.nio.file.Path; import java.text.ParseException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -146,11 +150,15 @@ public static List extractFile(InputStream sgmFile, URI uri) } /** - * Find the {@code } tags that are nested within another tag and add them to the given {@link ReutersDocument}. + * Find the {@code } tags that are nested within another tag and add them to the given + * {@link ReutersDocument}. * - * @param doc the current document represented as a {@link ReutersDocument}. - * @param tag the outer tag, e.g. {@code } - * @param text the value of the outer tag from which nested tags are extracted + * @param doc + * the current document represented as a {@link ReutersDocument}. + * @param tag + * the outer tag, e.g. {@code } + * @param text + * the value of the outer tag from which nested tags are extracted */ private static void extractNested(ReutersDocument doc, String tag, String text) throws ParseException diff --git a/dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578SgmlReader.java b/dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/Reuters21578SgmlReader.java similarity index 91% rename from dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578SgmlReader.java rename to dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/Reuters21578SgmlReader.java index ab4fbc0597..78191712b3 100644 --- a/dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578SgmlReader.java +++ b/dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/Reuters21578SgmlReader.java @@ -15,12 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.reuters; +package org.dkpro.core.io.reuters; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import java.io.IOException; +import java.text.ParseException; +import java.util.LinkedList; +import java.util.Queue; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; @@ -32,18 +32,20 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; -import java.io.IOException; -import java.text.ParseException; -import java.util.LinkedList; -import java.util.Queue; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Read a Reuters-21578 corpus in SGML format. *

* Set the directory that contains the SGML files with {@link #PARAM_SOURCE_LOCATION}. */ -@ResourceMetaData(name="Reuters-21578 Corpus SGML Reader") +@ResourceMetaData(name = "Reuters-21578 Corpus SGML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_REUTERS21578_SGML}) @TypeCapability( outputs = { diff --git a/dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578TxtReader.java b/dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/Reuters21578TxtReader.java similarity index 83% rename from dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578TxtReader.java rename to dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/Reuters21578TxtReader.java index 41deca4aff..3f4af29b3d 100644 --- a/dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578TxtReader.java +++ b/dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/Reuters21578TxtReader.java @@ -15,11 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.reuters; +package org.dkpro.core.io.reuters; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; import org.apache.commons.io.FilenameUtils; import org.apache.uima.cas.CAS; @@ -29,26 +32,28 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import eu.openminted.share.annotations.api.DocumentationResource; /** - * Read a Reuters-21578 corpus that has been transformed into text format using {@code ExtractReuters} in - * the {@code lucene-benchmarks} project. + * Read a Reuters-21578 corpus that has been transformed into text format using + * {@code ExtractReuters} in the {@code lucene-benchmarks} project. *

* The {@link #PARAM_SOURCE_LOCATION} parameter should typically point to the file name pattern * {@code reut2-*.txt}, preceded by the corpus root directory. * - * @see Reuters-21587 Corpus - * @see ExtractReuters - * @see cluster-reuters.sh + * @see Reuters-21587 + * Corpus + * @see ExtractReuters + * @see cluster-reuters.sh */ -@ResourceMetaData(name="Reuters-21578 Corpus Text Reader") +@ResourceMetaData(name = "Reuters-21578 Corpus Text Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_X_REUTERS21578}) @TypeCapability( outputs = { diff --git a/dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ReutersDocument.java b/dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/ReutersDocument.java similarity index 99% rename from dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ReutersDocument.java rename to dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/ReutersDocument.java index d76ac26fd7..57aeb2fab0 100644 --- a/dkpro-core-io-reuters-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ReutersDocument.java +++ b/dkpro-core-io-reuters-asl/src/main/java/org/dkpro/core/io/reuters/ReutersDocument.java @@ -15,10 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.reuters; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +package org.dkpro.core.io.reuters; import java.net.URI; import java.text.ParseException; @@ -28,6 +25,9 @@ import java.util.Locale; import java.util.Set; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + /** * A class that holds text and metadata for a Reuters-21578 document. */ diff --git a/dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ExtractReutersTest.java b/dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/ExtractReutersTest.java similarity index 94% rename from dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ExtractReutersTest.java rename to dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/ExtractReutersTest.java index 805384345b..d01222bbbc 100644 --- a/dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ExtractReutersTest.java +++ b/dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/ExtractReutersTest.java @@ -15,16 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.reuters; +package org.dkpro.core.io.reuters; -import org.junit.Assert; -import org.junit.Test; +import static junit.framework.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.io.File; -import java.util.*; +import java.util.Arrays; +import java.util.Date; +import java.util.GregorianCalendar; +import java.util.HashSet; +import java.util.List; +import java.util.Set; -import static junit.framework.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import org.dkpro.core.io.reuters.ExtractReuters; +import org.dkpro.core.io.reuters.ReutersDocument; +import org.junit.Assert; +import org.junit.Test; public class ExtractReutersTest { @@ -99,4 +106,4 @@ public void testExtract() Assert.assertEquals(oldIdLast, doc999.getOldid()); Assert.assertEquals(newIdLast, doc999.getNewid()); } -} \ No newline at end of file +} diff --git a/dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578SgmlReaderTest.java b/dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/Reuters21578SgmlReaderTest.java similarity index 95% rename from dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578SgmlReaderTest.java rename to dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/Reuters21578SgmlReaderTest.java index 130e0dceb4..0e7eecde3c 100644 --- a/dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578SgmlReaderTest.java +++ b/dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/Reuters21578SgmlReaderTest.java @@ -15,21 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.reuters; +package org.dkpro.core.io.reuters; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.reuters.Reuters21578SgmlReader; import org.junit.Test; -import java.io.File; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField; public class Reuters21578SgmlReaderTest { diff --git a/dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578TxtReaderTest.java b/dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/Reuters21578TxtReaderTest.java similarity index 96% rename from dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578TxtReaderTest.java rename to dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/Reuters21578TxtReaderTest.java index 623bd59302..69a500ac70 100644 --- a/dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/Reuters21578TxtReaderTest.java +++ b/dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/Reuters21578TxtReaderTest.java @@ -15,21 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.reuters; +package org.dkpro.core.io.reuters; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.JCasIterator; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.reuters.Reuters21578TxtReader; import org.junit.Test; -import java.io.File; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; public class Reuters21578TxtReaderTest { diff --git a/dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ReutersDocumentTests.java b/dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/ReutersDocumentTests.java similarity index 92% rename from dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ReutersDocumentTests.java rename to dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/ReutersDocumentTests.java index 2ed14f7e74..8954a98ddf 100644 --- a/dkpro-core-io-reuters-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/reuters/ReutersDocumentTests.java +++ b/dkpro-core-io-reuters-asl/src/test/java/org/dkpro/core/io/reuters/ReutersDocumentTests.java @@ -15,13 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.reuters; +package org.dkpro.core.io.reuters; -import org.junit.Test; +import static org.junit.Assert.assertEquals; import java.text.ParseException; -import static org.junit.Assert.assertEquals; +import org.dkpro.core.io.reuters.ReutersDocument; +import org.junit.Test; public class ReutersDocumentTests { diff --git a/dkpro-core-io-rtf-asl/pom.xml b/dkpro-core-io-rtf-asl/pom.xml index 878239eb4f..bb14f853a6 100644 --- a/dkpro-core-io-rtf-asl/pom.xml +++ b/dkpro-core-io-rtf-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.rtf-asl + dkpro-core-io-rtf-asl DKPro Core ASL - IO - RTF Collection reader for RTF (Rich Text Format) files. + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -36,8 +37,12 @@ uimafit-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -50,9 +55,9 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl - test + org.dkpro.core + dkpro-core-testing-asl + test \ No newline at end of file diff --git a/dkpro-core-io-rtf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/rtf/RTFReader.java b/dkpro-core-io-rtf-asl/src/main/java/org/dkpro/core/io/rtf/RTFReader.java similarity index 90% rename from dkpro-core-io-rtf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/rtf/RTFReader.java rename to dkpro-core-io-rtf-asl/src/main/java/org/dkpro/core/io/rtf/RTFReader.java index 9078ee6704..76cb804ce5 100644 --- a/dkpro-core-io-rtf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/rtf/RTFReader.java +++ b/dkpro-core-io-rtf-asl/src/main/java/org/dkpro/core/io/rtf/RTFReader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.rtf; +package org.dkpro.core.io.rtf; import java.io.IOException; @@ -30,14 +30,16 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Read RTF (Rich Text Format) files. Uses {@link RTFEditorKit} for parsing RTF. */ -@ResourceMetaData(name="Rich Text Format (RTF) Reader") +@ResourceMetaData(name = "Rich Text Format (RTF) Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_RTF, MimeTypes.TEXT_RTF}) @TypeCapability( outputs = { diff --git a/dkpro-core-io-rtf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/rtf/RTFReaderTest.java b/dkpro-core-io-rtf-asl/src/test/java/org/dkpro/core/io/rtf/RTFReaderTest.java similarity index 94% rename from dkpro-core-io-rtf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/rtf/RTFReaderTest.java rename to dkpro-core-io-rtf-asl/src/test/java/org/dkpro/core/io/rtf/RTFReaderTest.java index b66116b5be..edadaf1362 100644 --- a/dkpro-core-io-rtf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/rtf/RTFReaderTest.java +++ b/dkpro-core-io-rtf-asl/src/test/java/org/dkpro/core/io/rtf/RTFReaderTest.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.rtf; +package org.dkpro.core.io.rtf; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -31,14 +31,13 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.component.CasDumpWriter; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.EOLUtils; +import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.EOLUtils; - /** * Test cases for {@link RTFReader}. - * - * */ public class RTFReaderTest { @@ -102,4 +101,7 @@ public void testTwoFiles() actual = EOLUtils.normalizeLineEndings(actual); assertEquals(reference, actual); } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); } diff --git a/dkpro-core-io-rtf-asl/src/test/resources/log4j2.xml b/dkpro-core-io-rtf-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-rtf-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-rtf-asl/src/test/resources/testfile.dump b/dkpro-core-io-rtf-asl/src/test/resources/testfile.dump index 35cc931d86..f9992e1331 100644 --- a/dkpro-core-io-rtf-asl/src/test/resources/testfile.dump +++ b/dkpro-core-io-rtf-asl/src/test/resources/testfile.dump @@ -22,6 +22,4 @@ And another paragraph. -------- View _InitialView end ---------------------------------- -======== CAS 0 end ================================== - - +======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-rtf-asl/src/test/resources/testfiles.dump b/dkpro-core-io-rtf-asl/src/test/resources/testfiles.dump index b8748e5bec..6d810153a6 100644 --- a/dkpro-core-io-rtf-asl/src/test/resources/testfiles.dump +++ b/dkpro-core-io-rtf-asl/src/test/resources/testfiles.dump @@ -46,6 +46,4 @@ There goes another test file. -------- View _InitialView end ---------------------------------- -======== CAS 1 end ================================== - - +======== CAS 1 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-solr-asl/pom.xml b/dkpro-core-io-solr-asl/pom.xml index 5decd7e2b0..7cc2b3e9de 100644 --- a/dkpro-core-io-solr-asl/pom.xml +++ b/dkpro-core-io-solr-asl/pom.xml @@ -21,15 +21,16 @@ 6.3.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.solr-asl + dkpro-core-io-solr-asl jar DKPro Core ASL - IO - Solr (v${solr-solrj.version}) (ASL) Consumer for writing documents to a Solr server + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -49,12 +50,16 @@ ${solr-solrj.version} - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -62,8 +67,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl + org.dkpro.core + dkpro-core-io-text-asl test diff --git a/dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/util/SolrUtils.java b/dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/util/SolrUtils.java deleted file mode 100644 index 4993d94b63..0000000000 --- a/dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/util/SolrUtils.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.solr.util; - -import org.apache.commons.collections4.map.SingletonMap; -import org.apache.solr.common.SolrInputDocument; - -/** - * Helper utilities. - */ -public class SolrUtils -{ - /** - * The modifiers available for Solr atomic updates: SET, ADD, INC, REMOVE, REMOVEREGEX. - * - * @see Updating Parts of Documents - * @see Atomic Updates - */ - public enum Modifier - { - SET, ADD, INC, REMOVE, REMOVEREGEX - } - - private static final Modifier DEFAULT_MODIFIER = Modifier.SET; - - /** - * Add a field and optionally perform a partial update if applicable on an existing document. - * - * @param document the {@link SolrInputDocument} to add/update - * @param fieldname the field name to add/update - * @param value the value to insert for the field. - * @param update if true, use Solr atomic update mechanism; otherwise overwrite document - * @param modifier The {@link Modifier} to use when performing an atomic update (i.e. iff {@code update} - * is set to true). - * @see #setField(SolrInputDocument, String, Object, boolean) - */ - public static void setField(SolrInputDocument document, String fieldname, Object value, - boolean update, Modifier modifier) - { - if (update) { - /* perform an atomic update on potentially existing document */ - document.setField(fieldname, new SingletonMap<>(modifier.name().toLowerCase(), value)); - } - else { - document.setField(fieldname, value); - } - } - - /** - * Add a field and optionally perform a partial update on an existing document, using the default atomic update operation ("set"). - * - * @param document the {@link SolrInputDocument} to add/update - * @param fieldname the field name to add/update - * @param value the value to insert for the field. - * @param update if true, use Solr atomic update mechanism; otherwise overwrite existing document - * @see #setField(SolrInputDocument, String, Object, boolean, Modifier) - * @see Modifier - */ - public static void setField(SolrInputDocument document, String fieldname, Object value, - boolean update) - { - setField(document, fieldname, value, update, DEFAULT_MODIFIER); - } -} diff --git a/dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriter.java b/dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/SolrWriter.java similarity index 88% rename from dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriter.java rename to dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/SolrWriter.java index f500179d64..b773c1dd7f 100644 --- a/dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriter.java +++ b/dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/SolrWriter.java @@ -1,58 +1,60 @@ -/* - * Copyright 2015 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.solr; - -import org.apache.solr.common.SolrInputDocument; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.io.solr.util.SolrUtils; - -/** - * A simple implementation of {@link SolrWriter_ImplBase} - */ -@ResourceMetaData(name="Solr Index Writer") -public class SolrWriter - extends SolrWriter_ImplBase -{ - /** - * A simple implementation of a the abstract method - * {@link SolrWriter_ImplBase#generateSolrDocument(JCas)}. It generates a - * {@link SolrInputDocument} containing the document id from the JCas metadata and the document - * text. - * - * @param aJCas - * a {@link JCas} - * @return a {@link SolrInputDocument} - * @throws AnalysisEngineProcessException - * if any subclass catches an expression within this method, it should throw this - * exception type only - */ - @Override - protected SolrInputDocument generateSolrDocument(JCas aJCas) - throws AnalysisEngineProcessException - { - // TODO: re-use document; create document pool for ConcurrentUpdateServer - SolrInputDocument document = new SolrInputDocument(); - document.addField(getIdField(), DocumentMetaData.get(aJCas).getDocumentId()); - SolrUtils.setField(document, getTextField(), aJCas.getDocumentText(), update()); - return document; - } -} +/* + * Copyright 2015 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.solr; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.solr.util.SolrUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * A simple implementation of {@link SolrWriter_ImplBase} + */ +@ResourceMetaData(name = "Solr Index Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +public class SolrWriter + extends SolrWriter_ImplBase +{ + /** + * A simple implementation of a the abstract method + * {@link SolrWriter_ImplBase#generateSolrDocument(JCas)}. It generates a + * {@link SolrInputDocument} containing the document id from the JCas metadata and the document + * text. + * + * @param aJCas + * a {@link JCas} + * @return a {@link SolrInputDocument} + * @throws AnalysisEngineProcessException + * if any subclass catches an expression within this method, it should throw this + * exception type only + */ + @Override + protected SolrInputDocument generateSolrDocument(JCas aJCas) + throws AnalysisEngineProcessException + { + // TODO: re-use document; create document pool for ConcurrentUpdateServer + SolrInputDocument document = new SolrInputDocument(); + document.addField(getIdField(), DocumentMetaData.get(aJCas).getDocumentId()); + SolrUtils.setField(document, getTextField(), aJCas.getDocumentText(), update()); + return document; + } +} diff --git a/dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriter_ImplBase.java b/dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/SolrWriter_ImplBase.java similarity index 94% rename from dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriter_ImplBase.java rename to dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/SolrWriter_ImplBase.java index 77dd300e6c..e7c19ca555 100644 --- a/dkpro-core-io-solr-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriter_ImplBase.java +++ b/dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/SolrWriter_ImplBase.java @@ -1,219 +1,218 @@ -/* - * Copyright 2015 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.solr; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import org.apache.solr.client.solrj.SolrClient; -import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; -import org.apache.solr.client.solrj.response.UpdateResponse; -import org.apache.solr.common.SolrInputDocument; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasConsumer_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import java.io.IOException; - -/** - * This class implements a basic SolrWriter. Specific writers should define a subclass that - * overwrites the {@code generateSolrDocument()} method to take custom fields into account. - *

- * The class initializes a SolrServer instance, and calls {@code generateSolrDocument()} for each - * incoming CAS, and adds the result to the Solr server. A commit is executed when all documents are - * processed. - * - * - * - */ -public abstract class SolrWriter_ImplBase - extends JCasConsumer_ImplBase -{ - /** - * Define whether existing documents with same ID are updated (true) of overwritten (false)? - * Default: true (update). - */ - public static final String PARAM_UPDATE = "update"; - @ConfigurationParameter(name = PARAM_UPDATE, mandatory = true, defaultValue = "true") - private boolean update; - - /** - * Solr server URL string in the form {@code ://:/}, e.g. - * {@code http://localhost:8983/solr/collection1} - */ - public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; - @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) - private String targetLocation; - - /** - * The buffer size before the documents are sent to the server (default: 10000). - */ - public static final String PARAM_QUEUE_SIZE = "queueSize"; - @ConfigurationParameter(name = PARAM_QUEUE_SIZE, mandatory = true, defaultValue = "10000") - private int queueSize; - - /** - * The number of background numThreads used to empty the queue. Default: 1. - */ - public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; - @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") - private int numThreads; - - /** - * When committing to the index, i.e. when all documents are processed, block until index - * changes are flushed to disk? Default: true. - */ - public static final String PARAM_WAIT_FLUSH = "waitFlush"; - @ConfigurationParameter(name = PARAM_WAIT_FLUSH, mandatory = true, defaultValue = "true") - private boolean waitFlush; - - /** - * When committing to the index, i.e. when all documents are processed, block until a new - * searcher is opened and registered as the main query searcher, making the changes visible? - * Default: true. - */ - public static final String PARAM_WAIT_SEARCHER = "waitSearcher"; - @ConfigurationParameter(name = PARAM_WAIT_SEARCHER, mandatory = true, defaultValue = "true") - private boolean waitSearcher; - - /** - * The name of the text field in the Solr schema (default: "text"). - */ - public static final String PARAM_TEXT_FIELD = "textField"; - @ConfigurationParameter(name = PARAM_TEXT_FIELD, mandatory = true, defaultValue = "text") - private String textField; - - /** - * The name of the id field in the Solr schema (default: "id"). - */ - public static final String PARAM_ID_FIELD = "solrIdField"; - @ConfigurationParameter(name = PARAM_ID_FIELD, mandatory = true, defaultValue = "id") - private String idField; - - /** - * If set to true, the index is optimized once all documents are uploaded. Default is false. - */ - public static final String PARAM_OPTIMIZE_INDEX = "optimizeIndex"; - @ConfigurationParameter(name = PARAM_OPTIMIZE_INDEX, mandatory = true, defaultValue = "false") - private boolean optimizeIndex; - - private SolrClient solrClient; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - getLogger().info( - String.format("Using Solr server at %s.%nQueue size: %d\tThreads: %d%n", - targetLocation, queueSize, numThreads)); - solrClient = new ConcurrentUpdateSolrClient.Builder(targetLocation) - .withQueueSize(queueSize) - .withThreadCount(numThreads) - .build(); - try { - int status = solrClient.ping().getStatus(); - if (status != 0) { - throw new ResourceInitializationException( - "Server error. Response status: " + status, new Integer[] { status }); - } - } - catch (SolrServerException | IOException e) { - throw new ResourceInitializationException(e); - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - try { - SolrInputDocument solrDocument = generateSolrDocument(aJCas); - solrClient.add(solrDocument); - } - catch (IOException | SolrServerException e) { - throw new AnalysisEngineProcessException(e); - } - } - - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - super.collectionProcessComplete(); - - try { - UpdateResponse response = solrClient.commit(waitFlush, waitSearcher); - getLogger().info(String.format("Solr server at '%s' responded: %s", - targetLocation, response.toString())); - if (optimizeIndex) { - getLogger().info("Starting index optimization..."); - solrClient.optimize(waitFlush, waitSearcher); - getLogger().info(String.format("Solr server at '%s' responded: %s", - targetLocation, response.toString())); - } - solrClient.close(); - } - catch (SolrServerException | IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - - /** - * Perform updates if added documents already exist? - * - * @return true if updates are to be performed rather than overwriting existing documents - */ - public boolean update() - { - return update; - } - - /** - * - * @return the name of the Solr text field (e.g. "text") - */ - public String getTextField() - { - return textField; - } - - /** - * - * @return the name of the Solr ID field (e.g. "id") - */ - public String getIdField() - { - return idField; - } - - /** - * - * @return the SolrClient - */ - public SolrClient getSolrClient() - { - return solrClient; - } - - abstract protected SolrInputDocument generateSolrDocument(JCas aJCas) - throws AnalysisEngineProcessException; - -} +/* + * Copyright 2015 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.solr; + +import java.io.IOException; + +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; +import org.apache.solr.client.solrj.response.UpdateResponse; +import org.apache.solr.common.SolrInputDocument; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasConsumer_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; + +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * This class implements a basic SolrWriter. Specific writers should define a subclass that + * overwrites the {@code generateSolrDocument()} method to take custom fields into account. + *

+ * The class initializes a SolrServer instance, and calls {@code generateSolrDocument()} for each + * incoming CAS, and adds the result to the Solr server. A commit is executed when all documents are + * processed. + */ +@Component(OperationType.WRITER) +public abstract class SolrWriter_ImplBase + extends JCasConsumer_ImplBase +{ + /** + * Define whether existing documents with same ID are updated (true) of overwritten (false)? + */ + public static final String PARAM_UPDATE = "update"; + @ConfigurationParameter(name = PARAM_UPDATE, mandatory = true, defaultValue = "true") + private boolean update; + + /** + * Solr server URL string in the form {@code ://:/}, e.g. + * {@code http://localhost:8983/solr/collection1} + */ + public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; + @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) + private String targetLocation; + + /** + * The buffer size before the documents are sent to the server (default: 10000). + */ + public static final String PARAM_QUEUE_SIZE = "queueSize"; + @ConfigurationParameter(name = PARAM_QUEUE_SIZE, mandatory = true, defaultValue = "10000") + private int queueSize; + + /** + * The number of background numThreads used to empty the queue. + */ + public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; + @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") + private int numThreads; + + /** + * When committing to the index, i.e. when all documents are processed, block until index + * changes are flushed to disk? + */ + public static final String PARAM_WAIT_FLUSH = "waitFlush"; + @ConfigurationParameter(name = PARAM_WAIT_FLUSH, mandatory = true, defaultValue = "true") + private boolean waitFlush; + + /** + * When committing to the index, i.e. when all documents are processed, block until a new + * searcher is opened and registered as the main query searcher, making the changes visible? + */ + public static final String PARAM_WAIT_SEARCHER = "waitSearcher"; + @ConfigurationParameter(name = PARAM_WAIT_SEARCHER, mandatory = true, defaultValue = "true") + private boolean waitSearcher; + + /** + * The name of the text field in the Solr schema (default: "text"). + */ + public static final String PARAM_TEXT_FIELD = "textField"; + @ConfigurationParameter(name = PARAM_TEXT_FIELD, mandatory = true, defaultValue = "text") + private String textField; + + /** + * The name of the id field in the Solr schema (default: "id"). + */ + public static final String PARAM_ID_FIELD = "solrIdField"; + @ConfigurationParameter(name = PARAM_ID_FIELD, mandatory = true, defaultValue = "id") + private String idField; + + /** + * If set to true, the index is optimized once all documents are uploaded. Default is false. + */ + public static final String PARAM_OPTIMIZE_INDEX = "optimizeIndex"; + @ConfigurationParameter(name = PARAM_OPTIMIZE_INDEX, mandatory = true, defaultValue = "false") + private boolean optimizeIndex; + + private SolrClient solrClient; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + getLogger().info( + String.format("Using Solr server at %s.%nQueue size: %d\tThreads: %d%n", + targetLocation, queueSize, numThreads)); + solrClient = new ConcurrentUpdateSolrClient.Builder(targetLocation) + .withQueueSize(queueSize) + .withThreadCount(numThreads) + .build(); + try { + int status = solrClient.ping().getStatus(); + if (status != 0) { + throw new ResourceInitializationException( + "Server error. Response status: " + status, new Integer[] { status }); + } + } + catch (SolrServerException | IOException e) { + throw new ResourceInitializationException(e); + } + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + try { + SolrInputDocument solrDocument = generateSolrDocument(aJCas); + solrClient.add(solrDocument); + } + catch (IOException | SolrServerException e) { + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + super.collectionProcessComplete(); + + try { + UpdateResponse response = solrClient.commit(waitFlush, waitSearcher); + getLogger().info(String.format("Solr server at '%s' responded: %s", + targetLocation, response.toString())); + if (optimizeIndex) { + getLogger().info("Starting index optimization..."); + solrClient.optimize(waitFlush, waitSearcher); + getLogger().info(String.format("Solr server at '%s' responded: %s", + targetLocation, response.toString())); + } + solrClient.close(); + } + catch (SolrServerException | IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + /** + * Perform updates if added documents already exist? + * + * @return true if updates are to be performed rather than overwriting existing documents + */ + public boolean update() + { + return update; + } + + /** + * + * @return the name of the Solr text field (e.g. "text") + */ + public String getTextField() + { + return textField; + } + + /** + * + * @return the name of the Solr ID field (e.g. "id") + */ + public String getIdField() + { + return idField; + } + + /** + * + * @return the SolrClient + */ + public SolrClient getSolrClient() + { + return solrClient; + } + + abstract protected SolrInputDocument generateSolrDocument(JCas aJCas) + throws AnalysisEngineProcessException; + +} diff --git a/dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/util/SolrUtils.java b/dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/util/SolrUtils.java new file mode 100644 index 0000000000..697f6d6a8e --- /dev/null +++ b/dkpro-core-io-solr-asl/src/main/java/org/dkpro/core/io/solr/util/SolrUtils.java @@ -0,0 +1,91 @@ +/* + * Copyright 2016 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.solr.util; + +import org.apache.commons.collections4.map.SingletonMap; +import org.apache.solr.common.SolrInputDocument; + +/** + * Helper utilities. + */ +public class SolrUtils +{ + /** + * The modifiers available for Solr atomic updates: SET, ADD, INC, REMOVE, REMOVEREGEX. + * + * @see Updating + * Parts of Documents + * @see Atomic Updates + */ + public enum Modifier + { + SET, ADD, INC, REMOVE, REMOVEREGEX + } + + private static final Modifier DEFAULT_MODIFIER = Modifier.SET; + + /** + * Add a field and optionally perform a partial update if applicable on an existing document. + * + * @param document + * the {@link SolrInputDocument} to add/update + * @param fieldname + * the field name to add/update + * @param value + * the value to insert for the field. + * @param update + * if true, use Solr atomic update mechanism; otherwise overwrite document + * @param modifier + * The {@link Modifier} to use when performing an atomic update (i.e. iff + * {@code update} is set to true). + * @see #setField(SolrInputDocument, String, Object, boolean) + */ + public static void setField(SolrInputDocument document, String fieldname, Object value, + boolean update, Modifier modifier) + { + if (update) { + /* perform an atomic update on potentially existing document */ + document.setField(fieldname, new SingletonMap<>(modifier.name().toLowerCase(), value)); + } + else { + document.setField(fieldname, value); + } + } + + /** + * Add a field and optionally perform a partial update on an existing document, using the + * default atomic update operation ("set"). + * + * @param document + * the {@link SolrInputDocument} to add/update + * @param fieldname + * the field name to add/update + * @param value + * the value to insert for the field. + * @param update + * if true, use Solr atomic update mechanism; otherwise overwrite existing document + * @see #setField(SolrInputDocument, String, Object, boolean, Modifier) + * @see Modifier + */ + public static void setField(SolrInputDocument document, String fieldname, Object value, + boolean update) + { + setField(document, fieldname, value, update, DEFAULT_MODIFIER); + } +} diff --git a/dkpro-core-io-solr-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriterTest.java b/dkpro-core-io-solr-asl/src/test/java/org/dkpro/core/io/solr/SolrWriterTest.java similarity index 94% rename from dkpro-core-io-solr-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriterTest.java rename to dkpro-core-io-solr-asl/src/test/java/org/dkpro/core/io/solr/SolrWriterTest.java index c3e609051d..673af49fa2 100644 --- a/dkpro-core-io-solr-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/solr/SolrWriterTest.java +++ b/dkpro-core-io-solr-asl/src/test/java/org/dkpro/core/io/solr/SolrWriterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.solr; +package org.dkpro.core.io.solr; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -27,10 +27,10 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.solr.SolrWriter; +import org.dkpro.core.io.text.StringReader; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; - /** * Test(s) for {@link SolrWriter}. * diff --git a/dkpro-core-io-solr-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/solr/util/SolrUtilsTest.java b/dkpro-core-io-solr-asl/src/test/java/org/dkpro/core/io/solr/util/SolrUtilsTest.java similarity index 93% rename from dkpro-core-io-solr-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/solr/util/SolrUtilsTest.java rename to dkpro-core-io-solr-asl/src/test/java/org/dkpro/core/io/solr/util/SolrUtilsTest.java index e2692f4364..5f514aa88c 100644 --- a/dkpro-core-io-solr-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/solr/util/SolrUtilsTest.java +++ b/dkpro-core-io-solr-asl/src/test/java/org/dkpro/core/io/solr/util/SolrUtilsTest.java @@ -1,43 +1,44 @@ -/* - * Copyright 2015 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.solr.util; - -import static org.junit.Assert.assertEquals; - -import org.apache.solr.common.SolrInputDocument; -import org.junit.Test; - -public class SolrUtilsTest -{ - @Test - public void testAddField() - { - String fieldname = "field"; - String value = "value"; - String idValue = "1"; - String idFieldname = "id"; - boolean update = false; - - SolrInputDocument document = new SolrInputDocument(); - document.addField(idFieldname, idValue); - SolrUtils.setField(document, fieldname, value, update); - - assertEquals(idValue, document.getFieldValue(idFieldname)); - assertEquals(value, document.getFieldValue(fieldname)); - } -} +/* + * Copyright 2015 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.solr.util; + +import static org.junit.Assert.assertEquals; + +import org.apache.solr.common.SolrInputDocument; +import org.dkpro.core.io.solr.util.SolrUtils; +import org.junit.Test; + +public class SolrUtilsTest +{ + @Test + public void testAddField() + { + String fieldname = "field"; + String value = "value"; + String idValue = "1"; + String idFieldname = "id"; + boolean update = false; + + SolrInputDocument document = new SolrInputDocument(); + document.addField(idFieldname, idValue); + SolrUtils.setField(document, fieldname, value, update); + + assertEquals(idValue, document.getFieldValue(idFieldname)); + assertEquals(value, document.getFieldValue(fieldname)); + } +} diff --git a/dkpro-core-io-tcf-asl/pom.xml b/dkpro-core-io-tcf-asl/pom.xml index 047e6ebe99..95a6b67634 100644 --- a/dkpro-core-io-tcf-asl/pom.xml +++ b/dkpro-core-io-tcf-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.tcf-asl + dkpro-core-io-tcf-asl jar DKPro Core ASL - IO - TCF + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -46,39 +47,67 @@ eu.clarin.weblicht wlfxb - 1.3.3 + 1.4.3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + javax.xml.bind + jaxb-api - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + com.sun.xml.bind + jaxb-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + com.sun.xml.bind + jaxb-impl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + javax.activation + javax.activation-api + + + org.dkpro.core + dkpro-core-api-metadata-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + org.dkpro.core + dkpro-core-api-lexmorph-asl + + + org.dkpro.core + dkpro-core-api-transform-asl + + + org.dkpro.core + dkpro-core-api-syntax-asl + + + org.dkpro.core + dkpro-core-api-ner-asl + + + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-coref-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.coref-asl + eu.openminted.share.annotations + omtd-share-annotations-api - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.slf4j + slf4j-api xmlunit @@ -91,9 +120,38 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test + + + + org.apache.maven.plugins + maven-dependency-plugin + + + default + verify + + analyze-only + + + + + true + + + javax.xml.bind:jaxb-api + com.sun.xml.bind:jaxb-core + com.sun.xml.bind:jaxb-impl + javax.activation:javax.activation-api + + + + + diff --git a/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReader.java b/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReader.java deleted file mode 100644 index 629fbaf8df..0000000000 --- a/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReader.java +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tcf; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; -import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; -import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed; -import eu.clarin.weblicht.wlfxb.io.WLDObjector; -import eu.clarin.weblicht.wlfxb.io.WLFormatException; -import eu.clarin.weblicht.wlfxb.tc.api.DependencyParse; -import eu.clarin.weblicht.wlfxb.tc.api.DependencyParsingLayer; -import eu.clarin.weblicht.wlfxb.tc.api.Reference; -import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus; -import eu.clarin.weblicht.wlfxb.xb.WLData; - -/** - * Reader for the WebLicht TCF format. It reads all the available annotation Layers from the TCF - * file and convert it to a CAS annotations. The TCF data do not have begin/end offsets for all of - * its annotations which is required in CAS annotation. Hence, addresses are manually calculated per - * tokens and stored in a map (token_id, token(CAS object)) where later we get can get the offset - * from the token - */ -@ResourceMetaData(name="CLARIN-DE WebLicht TCF Reader") -@MimeTypeCapability({MimeTypes.TEXT_TCF}) -@TypeCapability(outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain", - "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class TcfReader - extends JCasResourceCollectionReader_ImplBase -{ - int j = 0; - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - Resource res = nextFile(); - initCas(aJCas, res); - - InputStream is = null; - try { - is = new BufferedInputStream(res.getInputStream()); - WLData wLData = WLDObjector.read(is); - TextCorpus aCorpusData = wLData.getTextCorpus(); - convertToCas(aJCas, aCorpusData); - } - catch (WLFormatException e) { - throw new CollectionException(e); - } - finally { - closeQuietly(is); - - } - } - - private void convertToCas(JCas aJCas, TextCorpus aCorpusData) - { - convertText(aJCas, aCorpusData); - Map tokens = convertTokens(aJCas, aCorpusData); - if (tokens.size() > 0) { - - convertPos(aJCas, aCorpusData, tokens); - - convertLemma(aJCas, aCorpusData, tokens); - - convertSentences(aJCas, aCorpusData, tokens); - - convertDependencies(aJCas, aCorpusData, tokens); - - convertNamedEntities(aJCas, aCorpusData, tokens); - - convertCoreference(aJCas, aCorpusData, tokens); - } - } - - /** - * This method builds texts from the {@link eu.clarin.weblicht.wlfxb.tc.api.Token} annotation - * layer. The getText Method of {@link TextCorpusStreamed} is not used as some tokens, such as - * special characters represented differently than in the original text. - * - * @param aJCas - * the JCas. - * @param aCorpusData - * the TCF document. - */ - private void convertText(JCas aJCas, TextCorpus aCorpusData) - { - StringBuilder text = new StringBuilder(); - - for (int i = 0; i < aCorpusData.getTokensLayer().size(); i++) { - if (i > 0) { - text.append(" "); - } - eu.clarin.weblicht.wlfxb.tc.api.Token token = aCorpusData.getTokensLayer().getToken(i); - text.append(token.getString()); - } - aJCas.setDocumentText(text.toString()); - aJCas.setDocumentLanguage(aCorpusData.getLanguage()); - } - - /** - * Convert TCF Tokens Layer to CAS Token Annotation. - * - * @param aJCas - * the JCas. - * @param aCorpusData - * the TCF document. - * @return returns {@code Map} of (token_id, Token), for later references - */ - private Map convertTokens(JCas aJCas, TextCorpus aCorpusData) - { - if (aCorpusData.getTokensLayer() == null) { - // No layer to read from. - return new HashMap(); - } - - String text = aJCas.getDocumentText(); - - Token outToken; - int tokenBeginPosition = 0; - int tokenEndPosition; - Map tokens = new HashMap(); - - for (int i = 0; i < aCorpusData.getTokensLayer().size(); i++) { - - eu.clarin.weblicht.wlfxb.tc.api.Token token = aCorpusData.getTokensLayer().getToken(i); - - tokenBeginPosition = text.indexOf(token.getString(), tokenBeginPosition); - tokenEndPosition = text.indexOf(token.getString(), tokenBeginPosition) - + token.getString().length(); - - outToken = new Token(aJCas, tokenBeginPosition, tokenEndPosition); - outToken.addToIndexes(); - - tokens.put(token.getID(), outToken); - tokenBeginPosition = tokenEndPosition; - } - return tokens; - } - - private void convertPos(JCas aJCas, TextCorpus aCorpusData, Map aTokens) - { - if (aCorpusData.getPosTagsLayer() == null) { - return; - } - for (int i = 0; i < aCorpusData.getPosTagsLayer().size(); i++) { - eu.clarin.weblicht.wlfxb.tc.api.Token[] posTokens = aCorpusData.getPosTagsLayer() - .getTokens(aCorpusData.getPosTagsLayer().getTag(i)); - String value = aCorpusData.getPosTagsLayer().getTag(i).getString(); - - POS outPos = new POS(aJCas); - - outPos.setBegin(aTokens.get(posTokens[0].getID()).getBegin()); - outPos.setEnd(aTokens.get(posTokens[0].getID()).getEnd()); - outPos.setPosValue(value); - POSUtils.assignCoarseValue(outPos); - outPos.addToIndexes(); - - // Set the POS to the token - aTokens.get(posTokens[0].getID()).setPos(outPos); - } - } - - private void convertLemma(JCas aJCas, TextCorpus aCorpusData, Map aTokens) - { - if (aCorpusData.getLemmasLayer() == null) { - return; - } - for (int i = 0; i < aCorpusData.getLemmasLayer().size(); i++) { - eu.clarin.weblicht.wlfxb.tc.api.Token[] lemmaTokens = aCorpusData.getLemmasLayer() - .getTokens(aCorpusData.getLemmasLayer().getLemma(i)); - String value = aCorpusData.getLemmasLayer().getLemma(i).getString(); - - Lemma outLemma = new Lemma(aJCas); - - outLemma.setBegin(aTokens.get(lemmaTokens[0].getID()).getBegin()); - outLemma.setEnd(aTokens.get(lemmaTokens[0].getID()).getEnd()); - outLemma.setValue(value); - outLemma.addToIndexes(); - - // Set the lemma to the token - aTokens.get(lemmaTokens[0].getID()).setLemma(outLemma); - } - - } - - private void convertSentences(JCas aJCas, TextCorpus aCorpusData, - Map aTokens) - { - if (aCorpusData.getSentencesLayer() == null) { - // No layer to read from. - return; - } - - for (int i = 0; i < aCorpusData.getSentencesLayer().size(); i++) { - eu.clarin.weblicht.wlfxb.tc.api.Token[] sentencesTokens = aCorpusData - .getSentencesLayer().getTokens(aCorpusData.getSentencesLayer().getSentence(i)); - - Sentence outSentence = new Sentence(aJCas); - - outSentence.setBegin(aTokens.get(sentencesTokens[0].getID()).getBegin()); - outSentence.setEnd(aTokens.get(sentencesTokens[sentencesTokens.length - 1].getID()) - .getEnd()); - outSentence.addToIndexes(); - } - } - - private void convertDependencies(JCas aJCas, TextCorpus aCorpusData, - Map aTokens) - { - DependencyParsingLayer depLayer = aCorpusData.getDependencyParsingLayer(); - - if (depLayer == null) { - // No layer to read from. - return; - } - - for (int i = 0; i < depLayer.size(); i++) { - DependencyParse dependencyParse = depLayer.getParse(i); - for (eu.clarin.weblicht.wlfxb.tc.api.Dependency dependency : dependencyParse - .getDependencies()) { - - eu.clarin.weblicht.wlfxb.tc.api.Token[] governorTokens = depLayer - .getGovernorTokens(dependency); - eu.clarin.weblicht.wlfxb.tc.api.Token[] dependentTokens = depLayer - .getDependentTokens(dependency); - - POS dependentPos = aTokens.get(dependentTokens[0].getID()).getPos(); - - // For dependency annotations in the TCF file without POS, add as a default POS -- - if (dependentPos == null) { - getLogger().warn("There is no pos for this token, added [--] as a pos"); - dependentPos = new POS(aJCas); - dependentPos.setBegin(aTokens.get(dependentTokens[0].getID()).getBegin()); - dependentPos.setEnd(aTokens.get(dependentTokens[0].getID()).getEnd()); - dependentPos.setPosValue("--"); - dependentPos.setCoarseValue("--"); - dependentPos.addToIndexes(); - aTokens.get(dependentTokens[0].getID()).setPos(dependentPos); - } - - if (governorTokens != null) { - POS governerPos = aTokens.get(governorTokens[0].getID()).getPos(); - if (governerPos == null) { - if (dependency.getFunction().equals("ROOT")) { - // do nothing - } - else { - getLogger().warn("There is no pos for this token, added [--] as a pos"); - governerPos = new POS(aJCas); - governerPos.setBegin(aTokens.get(governorTokens[0].getID()).getBegin()); - governerPos.setEnd(aTokens.get(governorTokens[0].getID()).getEnd()); - governerPos.setPosValue("--"); - governerPos.addToIndexes(); - aTokens.get(governorTokens[0].getID()).setPos(governerPos); - } - } - } - else { - governorTokens = dependentTokens; - } - - // We set governorTokens = dependentTokens above for root nodes - if (governorTokens == dependentTokens) { - Dependency outDependency = new ROOT(aJCas); - outDependency.setDependencyType(dependency.getFunction()); - outDependency.setGovernor(aTokens.get(dependentTokens[0].getID())); - outDependency.setDependent(aTokens.get(dependentTokens[0].getID())); - outDependency.setBegin(outDependency.getDependent().getBegin()); - outDependency.setEnd(outDependency.getDependent().getEnd()); - outDependency.setFlavor(depLayer.hasMultipleGovernors() - ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); - outDependency.addToIndexes(); - - } - else { - Dependency outDependency = new Dependency(aJCas); - outDependency.setDependencyType(dependency.getFunction()); - outDependency.setGovernor(aTokens.get(governorTokens[0].getID())); - outDependency.setDependent(aTokens.get(dependentTokens[0].getID())); - outDependency.setBegin(outDependency.getDependent().getBegin()); - outDependency.setEnd(outDependency.getDependent().getEnd()); - outDependency.setFlavor(depLayer.hasMultipleGovernors() - ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); - outDependency.addToIndexes(); - } - } - } - } - - private void convertNamedEntities(JCas aJCas, TextCorpus aCorpusData, - Map aTokens) - { - if (aCorpusData.getNamedEntitiesLayer() == null) { - // No layer to read from. - return; - } - - for (int i = 0; i < aCorpusData.getNamedEntitiesLayer().size(); i++) { - // get the named entity - eu.clarin.weblicht.wlfxb.tc.api.NamedEntity entity = aCorpusData - .getNamedEntitiesLayer().getEntity(i); - - eu.clarin.weblicht.wlfxb.tc.api.Token[] namedEntityTokens = aCorpusData - .getNamedEntitiesLayer().getTokens(entity); - - NamedEntity outNamedEntity = new NamedEntity(aJCas); - - outNamedEntity.setBegin(getOffsets(namedEntityTokens, aTokens)[0]); - outNamedEntity.setEnd(getOffsets(namedEntityTokens, aTokens)[1]); - outNamedEntity.setValue(entity.getType()); - outNamedEntity.addToIndexes(); - } - - } - - /** - * Correferences in CAS should be represented {@link CoreferenceChain} and - * {@link CoreferenceLink}. The TCF representation Uses rel and target to build - * chains. Example:
- * {@literal }
- * {@literal
- * }

The first phase of conversion is getting all references and - * targets alongside the type and relations in different maps
- * Second, an iteration is made through all the maps and the {@link CoreferenceChain} and - * {@link CoreferenceLink} annotations are constructed. - * - * @param aJCas - * the JCas. - * @param aCorpusData - * the TCF document. - * @param aTokens - * id/token map. - */ - private void convertCoreference(JCas aJCas, TextCorpus aCorpusData, - Map aTokens) - { - if (aCorpusData.getReferencesLayer() == null) { - // No layer to read from. - return; - } - for (int i = 0; i < aCorpusData.getReferencesLayer().size(); i++) { - eu.clarin.weblicht.wlfxb.tc.api.ReferencedEntity entity = aCorpusData - .getReferencesLayer().getReferencedEntity(i); - - Map referencesMap = new TreeMap(); - storeReferencesAndTargetsInMap(referencesMap, entity, aCorpusData, aTokens, aJCas); - - CoreferenceChain chain = new CoreferenceChain(aJCas); - CoreferenceLink link = null; - for (Integer address : referencesMap.keySet()) { - if (chain.getFirst() == null) { - chain.setFirst(referencesMap.get(address)); - link = chain.getFirst(); - chain.addToIndexes(); - } - else { - link.setNext(referencesMap.get(address)); - if (link.getReferenceRelation() == null) { - link.setReferenceRelation(referencesMap.get(address).getReferenceRelation()); - } - link = link.getNext(); - link.addToIndexes(); - } - } - } - } - - private void storeReferencesAndTargetsInMap(Map aReferencesMap, - eu.clarin.weblicht.wlfxb.tc.api.ReferencedEntity entity, TextCorpus aCorpusData, - Map aTokens, JCas aJcas) - { - for (Reference reference : entity.getReferences()) { - StringBuilder sbTokens = new StringBuilder(); - for (eu.clarin.weblicht.wlfxb.tc.api.Token token : aCorpusData.getReferencesLayer() - .getTokens(reference)) { - sbTokens.append(token.getID() + " "); - } - - String[] referenceTokens = sbTokens.toString().split(" "); - int begin = getOffsets(referenceTokens, aTokens)[0]; - int end = getOffsets(referenceTokens, aTokens)[1]; - - CoreferenceLink link = new CoreferenceLink(aJcas); - link.setBegin(begin); - link.setEnd(end); - String referencesType = reference.getType() == null ? "nam" : reference.getType(); - link.setReferenceType(referencesType); - if (reference.getRelation() != null) { - link.setReferenceRelation(reference.getRelation()); - } - link.addToIndexes(); - aReferencesMap.put(link.getAddress(), link); - - } - } - - /** - * Get the start and end offsets of a span annotation - * - * @param aSpanTokens - * list of span {@link eu.clarin.weblicht.wlfxb.tc.api.Token}s - * @param aAllTokens - * all available tokens in the file - * @return the offsets. - */ - private int[] getOffsets(eu.clarin.weblicht.wlfxb.tc.api.Token[] aSpanTokens, - Map aAllTokens) - { - List beginPositions = new ArrayList(); - List endPositions = new ArrayList(); - for (eu.clarin.weblicht.wlfxb.tc.api.Token token : aSpanTokens) { - beginPositions.add(aAllTokens.get(token.getID()).getBegin()); - endPositions.add(aAllTokens.get(token.getID()).getEnd()); - } - return new int[] { (Collections.min(beginPositions)), (Collections.max(endPositions)) }; - } - - /** - * Get the start and end offsets of a span annotation - * - * @param aSpanTokens - * list of span token ids. [t_3,_t_5, t_1] - * @param aAllTokens - * all available tokens in the file - * @return the offsets. - */ - private int[] getOffsets(String[] aSpanTokens, Map aAllTokens) - { - List beginPositions = new ArrayList(); - List endPositions = new ArrayList(); - for (String token : aSpanTokens) { - beginPositions.add(aAllTokens.get(token).getBegin()); - endPositions.add(aAllTokens.get(token).getEnd()); - } - return new int[] { (Collections.min(beginPositions)), (Collections.max(endPositions)) }; - } -} diff --git a/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfWriter.java b/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfWriter.java deleted file mode 100644 index 9c07317652..0000000000 --- a/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfWriter.java +++ /dev/null @@ -1,562 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tcf; - -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.JCasUtil.exists; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.EnumSet; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; -import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamedWithReplaceableLayers; -import eu.clarin.weblicht.wlfxb.io.WLDObjector; -import eu.clarin.weblicht.wlfxb.io.WLFormatException; -import eu.clarin.weblicht.wlfxb.tc.api.DependencyParsingLayer; -import eu.clarin.weblicht.wlfxb.tc.api.LemmasLayer; -import eu.clarin.weblicht.wlfxb.tc.api.NamedEntitiesLayer; -import eu.clarin.weblicht.wlfxb.tc.api.PosTagsLayer; -import eu.clarin.weblicht.wlfxb.tc.api.Reference; -import eu.clarin.weblicht.wlfxb.tc.api.ReferencesLayer; -import eu.clarin.weblicht.wlfxb.tc.api.SentencesLayer; -import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus; -import eu.clarin.weblicht.wlfxb.tc.api.TokensLayer; -import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag; -import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored; -import eu.clarin.weblicht.wlfxb.xb.WLData; - -/** - * Writer for the WebLicht TCF format. - */ -@ResourceMetaData(name="CLARIN-DE WebLicht TCF Writer") -@MimeTypeCapability({MimeTypes.TEXT_TCF}) -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain", - "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class TcfWriter - extends JCasFileWriter_ImplBase -{ - private static final String REL_TYPE_EXPLETIVE = "expletive"; - - /** - * Specify the suffix of output files. Default value .tcf. If the suffix is not - * needed, provide an empty string as value. - */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".tcf") - private String filenameSuffix; - - /** - * If there are no annotations for a particular layer in the CAS, preserve any potentially - * existing annotations in the original TCF.
- * Default: {@code false} - */ - public static final String PARAM_PRESERVE_IF_EMPTY = "preserveIfEmpty"; - @ConfigurationParameter(name = PARAM_PRESERVE_IF_EMPTY, mandatory = true, defaultValue = "false") - private boolean preserveIfEmpty; - - /** - * Merge with source TCF file if one is available.
- * Default: {@code true} - */ - public static final String PARAM_MERGE = "merge"; - @ConfigurationParameter(name = PARAM_MERGE, mandatory = true, defaultValue = "true") - private boolean merge; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - // #670 - TcfWriter can currently not properly write to ZIP files because of the "try and - // error" approach that we take to trying to merge with an existing file. In particular, if - // the attempt fails and we go without merging, we cannot delete the broken entry from the - // ZIP file. - if (StringUtils.startsWith(getTargetLocation(), JAR_PREFIX)) { - throw new ResourceInitializationException(new IllegalStateException( - "TcfWriter cannot write to ZIP files.")); - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - InputStream docIS = null; - try { - boolean writeWithoutMerging = true; - if (merge) { - NamedOutputStream docOS = null; - try { - docOS = getOutputStream(aJCas, filenameSuffix); - // Get the original TCF file and preserve it - DocumentMetaData documentMetadata = DocumentMetaData.get(aJCas); - URL filePathUrl = new URL(documentMetadata.getDocumentUri()); - try { - docIS = filePathUrl.openStream(); - - try { - getLogger().debug( - "Merging with [" + documentMetadata.getDocumentUri() + "]"); - casToTcfWriter(docIS, aJCas, docOS); - writeWithoutMerging = false; - } - // See https://github.com/weblicht/wlfxb/issues/7 - // catch (WLFormatException ex) { - // getLogger().debug("No source file to merge with: " + ex.getMessage()); - // } - // Workaround: catch all exceptions - catch (Exception ex) { - getLogger().debug("Source file is not TCF: " + ex.getMessage()); - } - } - catch (IOException e) { - getLogger().debug( - "Cannot open source file to merge with: " + e.getMessage()); - } - } - finally { - if (writeWithoutMerging) { - // Have to delete the output file from this try and will try again without - // merging. Deleting is necessary as not to trigger the overwrite safeguard - // in JCasFileWriter_ImplBase - if ((docOS != null) && (docOS.getName() != null)) { - FileUtils.deleteQuietly(new File(docOS.getName())); - } - } - closeQuietly(docOS); - } - } - else { - getLogger().debug("Merging disabled"); - } - - // If merging failed or is disabled, go on without merging - if (writeWithoutMerging) { - OutputStream docOS = null; - try { - docOS = getOutputStream(aJCas, filenameSuffix); - casToTcfWriter(aJCas, docOS); - } - finally { - closeQuietly(docOS); - } - } - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - finally { - closeQuietly(docIS); - } - } - - /** - * Create TCF File from scratch - * - * @param aJCas - * the JCas. - * @param aOs - * the output stream. - * @throws WLFormatException - * if a TCF problem occurs. - */ - public void casToTcfWriter(JCas aJCas, OutputStream aOs) - throws WLFormatException - { - // create TextCorpus object, specifying its language from the aJcas Object - TextCorpusStored textCorpus = new TextCorpusStored(aJCas.getDocumentLanguage()); - - // create text annotation layer and add the string of the text into the layer - textCorpus.createTextLayer().addText(aJCas.getDocumentText()); - - write(aJCas, textCorpus); - - // write the annotated data object into the output stream - WLData wldata = new WLData(textCorpus); - WLDObjector.write(wldata, aOs); - } - - /** - * Merge annotations from CAS into an existing TCF file. - * - * @param aIs - * the TCF file with an existing annotation layers - * @param aJCas - * an annotated CAS object - * @param aOs - * the output stream. - * @throws WLFormatException - * if a TCF problem occurs. - */ - public void casToTcfWriter(InputStream aIs, JCas aJCas, OutputStream aOs) - throws WLFormatException - { - // If these layers are present in the TCF file, we use them from there, otherwise - // we generate them - EnumSet layersToRead = EnumSet.of( - TextCorpusLayerTag.TOKENS, - TextCorpusLayerTag.SENTENCES); - - // If we have annotations for these layers in the CAS, we rewrite those layers. - List layersToReplace = new ArrayList(); - if (exists(aJCas, POS.class) || !preserveIfEmpty) { - layersToReplace.add(TextCorpusLayerTag.POSTAGS); - } - if (exists(aJCas, Lemma.class) || !preserveIfEmpty) { - layersToReplace.add(TextCorpusLayerTag.LEMMAS); - } - if (exists(aJCas, NamedEntity.class) || !preserveIfEmpty) { - layersToReplace.add(TextCorpusLayerTag.NAMED_ENTITIES); - } - if (exists(aJCas, Dependency.class) || !preserveIfEmpty) { - layersToReplace.add(TextCorpusLayerTag.PARSING_DEPENDENCY); - } - if (exists(aJCas, CoreferenceChain.class) || !preserveIfEmpty) { - layersToReplace.add(TextCorpusLayerTag.REFERENCES); - } - - TextCorpusStreamedWithReplaceableLayers textCorpus = null; - try { - textCorpus = new TextCorpusStreamedWithReplaceableLayers( - aIs, layersToRead, EnumSet.copyOf(layersToReplace), aOs); - - write(aJCas, textCorpus); - } - finally { - if (textCorpus != null) { - try { - textCorpus.close(); - } - catch (IOException e) { - // Ignore exception while closing - } - } - } - } - - private void write(JCas aJCas, TextCorpus aTextCorpus) - { - Map tokensBeginPositionMap; - tokensBeginPositionMap = writeTokens(aJCas, aTextCorpus); - writeSentence(aJCas, aTextCorpus, tokensBeginPositionMap); - writePosTags(aJCas, aTextCorpus, tokensBeginPositionMap); - writeLemmas(aJCas, aTextCorpus, tokensBeginPositionMap); - writeDependency(aJCas, aTextCorpus, tokensBeginPositionMap); - writeNamedEntity(aJCas, aTextCorpus, tokensBeginPositionMap); - writeCoreference(aJCas, aTextCorpus, tokensBeginPositionMap); - } - - private Map writeTokens(JCas aJCas, - TextCorpus aTextCorpus) - { - boolean tokensLayerCreated = false; - - // Create tokens layer if it does not exist - TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); - if (tokensLayer == null) { - tokensLayer = aTextCorpus.createTokensLayer(); - tokensLayerCreated = true; - getLogger().debug("Layer [" + TextCorpusLayerTag.TOKENS.getXmlName() + "]: created"); - } - else { - getLogger().debug("Layer [" + TextCorpusLayerTag.TOKENS.getXmlName() + "]: found"); - } - - - Map tokensBeginPositionMap = - new HashMap(); - - int j = 0; - for (Token token : select(aJCas, Token.class)) { - if (tokensLayerCreated) { - tokensLayer.addToken(token.getCoveredText()); - } - - tokensBeginPositionMap.put(token.getBegin(), tokensLayer.getToken(j)); - j++; - } - - return tokensBeginPositionMap; - } - - private void writePosTags(JCas aJCas, TextCorpus aTextCorpus, - Map aTokensBeginPositionMap) - { - if (!JCasUtil.exists(aJCas, POS.class)) { - // Do nothing if there are no part-of-speech tags in the CAS - getLogger().debug("Layer [" + TextCorpusLayerTag.POSTAGS.getXmlName() + "]: empty"); - return; - } - - // Tokens layer must already exist - TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); - - // create POS tag annotation layer - String posTagSet = "STTS"; - for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) { - if (tagSet.getLayer().equals(POS.class.getName())) { - posTagSet = tagSet.getName(); - break; - } - } - - PosTagsLayer posLayer = aTextCorpus.createPosTagsLayer(posTagSet); - - getLogger().debug("Layer [" + TextCorpusLayerTag.POSTAGS.getXmlName() + "]: created"); - - int j = 0; - for (Token coveredToken : select(aJCas, Token.class)) { - POS pos = coveredToken.getPos(); - - if (pos != null && posLayer != null ) { - String posValue = coveredToken.getPos().getPosValue(); - posLayer.addTag(posValue, tokensLayer.getToken(j)); - } - - j++; - } - } - - private void writeLemmas(JCas aJCas, TextCorpus aTextCorpus, - Map aTokensBeginPositionMap) - { - if (!JCasUtil.exists(aJCas, Lemma.class)) { - // Do nothing if there are no lemmas in the CAS - getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: empty"); - return; - } - - // Tokens layer must already exist - TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); - - // create lemma annotation layer - LemmasLayer lemmasLayer = aTextCorpus.createLemmasLayer(); - - getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: created"); - - int j = 0; - for (Token coveredToken : select(aJCas, Token.class)) { - Lemma lemma = coveredToken.getLemma(); - if (lemma != null && lemmasLayer != null) { - String lemmaValue = coveredToken.getLemma().getValue(); - lemmasLayer.addLemma(lemmaValue, tokensLayer.getToken(j)); - } - j++; - } - - } - - private void writeSentence(JCas aJCas, TextCorpus aTextCorpus, - Map aTokensBeginPositionMap) - { - // if not TCF file, add sentence layer (Sentence is required for BRAT) - SentencesLayer sentencesLayer = aTextCorpus.getSentencesLayer(); - if (sentencesLayer != null) { - getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: found"); - return; - } - - sentencesLayer = aTextCorpus.createSentencesLayer(); - - getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: created"); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List tokens = new ArrayList(); - for (Token token : selectCovered(Token.class, sentence)) { - tokens.add(aTokensBeginPositionMap.get(token.getBegin())); - } - sentencesLayer.addSentence(tokens); - } - } - - private void writeDependency(JCas aJCas, TextCorpus aTextCorpus, - Map aTokensBeginPositionMap) - { - if (!JCasUtil.exists(aJCas, Dependency.class)) { - // Do nothing if there are no dependencies in the CAS - getLogger().debug("Layer [" + TextCorpusLayerTag.PARSING_DEPENDENCY.getXmlName() + "]: empty"); - return; - } - - DependencyParsingLayer dependencyParsingLayer = null; - String tagSetName = "tiger"; - for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) { - if (tagSet.getLayer().equals(Dependency.class.getName())) { - tagSetName = tagSet.getName(); - break; - } - } - - Optional hasNonBasic = select(aJCas, Dependency.class).stream() - .filter(dep -> dep.getFlavor() != null && !DependencyFlavor.BASIC.equals(dep.getFlavor())) - .findAny(); - - dependencyParsingLayer = aTextCorpus.createDependencyParsingLayer(tagSetName, hasNonBasic.isPresent(), true); - - getLogger().debug("Layer [" + TextCorpusLayerTag.PARSING_DEPENDENCY.getXmlName() + "]: created"); - - for (Sentence s : select(aJCas, Sentence.class)) { - List deps = new ArrayList(); - for (Dependency d : selectCovered(Dependency.class, s)) { - eu.clarin.weblicht.wlfxb.tc.api.Dependency dependency = dependencyParsingLayer - .createDependency(d.getDependencyType(), - aTokensBeginPositionMap.get(d.getDependent().getBegin()), - aTokensBeginPositionMap.get(d.getGovernor().getBegin())); - deps.add(dependency); - } - if (deps.size() > 0) { - dependencyParsingLayer.addParse(deps); - } - } - } - - private void writeNamedEntity(JCas aJCas, TextCorpus aTextCorpus, - Map aTokensBeginPositionMap) - { - if (!JCasUtil.exists(aJCas, NamedEntity.class)) { - // Do nothing if there are no named entities in the CAS - getLogger().debug("Layer [" + TextCorpusLayerTag.NAMED_ENTITIES.getXmlName() + "]: empty"); - return; - } - - String tagSetName = "BART"; - for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) { - if (tagSet.getLayer().equals(NamedEntity.class.getName())) { - tagSetName = tagSet.getName(); - break; - } - } - - NamedEntitiesLayer namedEntitiesLayer = aTextCorpus.createNamedEntitiesLayer(tagSetName); - - getLogger().debug("Layer [" + TextCorpusLayerTag.NAMED_ENTITIES.getXmlName() + "]: created"); - - for (NamedEntity namedEntity : select(aJCas, NamedEntity.class)) { - List tokensInCas = selectCovered(aJCas, Token.class, namedEntity.getBegin(), - namedEntity.getEnd()); - List tokensInTcf = new ArrayList(); - for (Token token : tokensInCas) { - tokensInTcf.add(aTokensBeginPositionMap.get(token.getBegin())); - } - namedEntitiesLayer.addEntity(namedEntity.getValue(), tokensInTcf); - } - } - - private void writeCoreference(JCas aJCas, TextCorpus aTextCorpus, - Map aTokensBeginPositionMap) - { - if (!JCasUtil.exists(aJCas, CoreferenceChain.class)) { - // Do nothing if there are no coreference chains in the CAS - getLogger().debug("Layer [" + TextCorpusLayerTag.REFERENCES.getXmlName() + "]: empty"); - return; - } - - String tagSetName = "TueBaDz"; - for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) { - if (tagSet.getLayer().equals(CoreferenceLink.class.getName())) { - tagSetName = tagSet.getName(); - break; - } - } - - ReferencesLayer coreferencesLayer = aTextCorpus.createReferencesLayer(null, tagSetName, - null); - - getLogger().debug("Layer [" + TextCorpusLayerTag.REFERENCES.getXmlName() + "]: created"); - - for (CoreferenceChain chain : select(aJCas, CoreferenceChain.class)) { - CoreferenceLink prevLink = null; - Reference prevRef = null; - List refs = new ArrayList(); - for (CoreferenceLink link : chain.links()) { - // Get covered tokens - List tokens = new ArrayList(); - for (Token token : selectCovered(Token.class, link)) { - tokens.add(aTokensBeginPositionMap.get(token.getBegin())); - } - - // Create current reference - Reference ref = coreferencesLayer.createReference(link.getReferenceType(), tokens, null); - - // Special handling for expletive relations - if (REL_TYPE_EXPLETIVE.equals(link.getReferenceRelation())) { - coreferencesLayer.addRelation(ref, REL_TYPE_EXPLETIVE); - // if the relation is expletive, then there must not be a next element in the - // chain, so we bail out here. - continue; - } - - // Create relation between previous and current reference - if (prevLink != null) { - coreferencesLayer.addRelation(prevRef, prevLink.getReferenceRelation(), ref); - } - - prevLink = link; - prevRef = ref; - refs.add(ref); - } - coreferencesLayer.addReferent(refs); - } - } -} diff --git a/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/package-info.java b/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/package-info.java deleted file mode 100644 index fca8c3afa5..0000000000 --- a/dkpro-core-io-tcf-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/package-info.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Provides Classes for reading/writing TCF data files. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tcf; diff --git a/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/TcfReader.java b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/TcfReader.java new file mode 100644 index 0000000000..5a3cb28b39 --- /dev/null +++ b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/TcfReader.java @@ -0,0 +1,82 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tcf; + +import static org.apache.commons.io.IOUtils.toBufferedInputStream; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.tcf.internal.Tcf2DKPro; + +import eu.clarin.weblicht.wlfxb.io.WLDObjector; +import eu.clarin.weblicht.wlfxb.io.WLFormatException; +import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus; +import eu.clarin.weblicht.wlfxb.xb.WLData; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reader for the WebLicht TCF format. It reads all the available annotation Layers from the TCF + * file and convert it to a CAS annotations. The TCF data do not have begin/end offsets for all of + * its annotations which is required in CAS annotation. Hence, addresses are manually calculated per + * tokens and stored in a map (token_id, token(CAS object)) where later we get can get the offset + * from the token + */ +@ResourceMetaData(name = "CLARIN-DE WebLicht TCF Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_TCF}) +@TypeCapability(outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain", + "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency", + "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}) +public class TcfReader + extends JCasResourceCollectionReader_ImplBase +{ + int j = 0; + + @Override + public void getNext(JCas aJCas) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + try (InputStream is = toBufferedInputStream(res.getInputStream())) { + WLData wLData = WLDObjector.read(is); + TextCorpus aCorpusData = wLData.getTextCorpus(); + new Tcf2DKPro().convert(aCorpusData, aJCas); + } + catch (WLFormatException e) { + throw new CollectionException(e); + } + } +} diff --git a/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/TcfWriter.java b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/TcfWriter.java new file mode 100644 index 0000000000..61f02b4854 --- /dev/null +++ b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/TcfWriter.java @@ -0,0 +1,286 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tcf; + +import static eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag.LEMMAS; +import static eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag.NAMED_ENTITIES; +import static eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag.ORTHOGRAPHY; +import static eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag.PARSING_DEPENDENCY; +import static eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag.POSTAGS; +import static eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag.REFERENCES; +import static eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag.SENTENCES; +import static eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag.TOKENS; +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.uima.fit.util.JCasUtil.exists; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.tcf.internal.DKPro2Tcf; + +import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; +import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamedWithReplaceableLayers; +import eu.clarin.weblicht.wlfxb.io.WLDObjector; +import eu.clarin.weblicht.wlfxb.io.WLFormatException; +import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag; +import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored; +import eu.clarin.weblicht.wlfxb.xb.WLData; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Writer for the WebLicht TCF format. + */ +@ResourceMetaData(name = "CLARIN-DE WebLicht TCF Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_TCF}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain", + "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency", + "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}) +public class TcfWriter + extends JCasFileWriter_ImplBase +{ + /** + * Specify the suffix of output files. Default value .tcf. If the suffix is not + * needed, provide an empty string as value. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".tcf") + private String filenameSuffix; + + /** + * If there are no annotations for a particular layer in the CAS, preserve any potentially + * existing annotations in the original TCF. + */ + public static final String PARAM_PRESERVE_IF_EMPTY = "preserveIfEmpty"; + @ConfigurationParameter(name = PARAM_PRESERVE_IF_EMPTY, mandatory = true, defaultValue = "false") + private boolean preserveIfEmpty; + + /** + * Merge with source TCF file if one is available. + */ + public static final String PARAM_MERGE = "merge"; + @ConfigurationParameter(name = PARAM_MERGE, mandatory = true, defaultValue = "true") + private boolean merge; + + /** + * TCF version. + */ + public static final String PARAM_TCF_VERSION = "tcfVersion"; + @ConfigurationParameter(name = PARAM_TCF_VERSION, mandatory = true, defaultValue = "0.4") + private String tcfVersion; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + // #670 - TcfWriter can currently not properly write to ZIP files because of the "try and + // error" approach that we take to trying to merge with an existing file. In particular, if + // the attempt fails and we go without merging, we cannot delete the broken entry from the + // ZIP file. + if (StringUtils.startsWith(getTargetLocation(), JAR_PREFIX)) { + throw new ResourceInitializationException(new IllegalStateException( + "TcfWriter cannot write to ZIP files.")); + } + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + InputStream docIS = null; + try { + boolean writeWithoutMerging = true; + if (merge) { + NamedOutputStream docOS = null; + try { + docOS = getOutputStream(aJCas, filenameSuffix); + // Get the original TCF file and preserve it + DocumentMetaData documentMetadata = DocumentMetaData.get(aJCas); + URL filePathUrl = new URL(documentMetadata.getDocumentUri()); + try { + docIS = filePathUrl.openStream(); + + try { + getLogger().debug( + "Merging with [" + documentMetadata.getDocumentUri() + "]"); + casToTcfWriter(docIS, aJCas, docOS); + writeWithoutMerging = false; + } + // See https://github.com/weblicht/wlfxb/issues/7 + // catch (WLFormatException ex) { + // getLogger().debug("No source file to merge with: " + ex.getMessage()); + // } + // Workaround: catch all exceptions + catch (Exception ex) { + getLogger().debug("Source file is not TCF: " + ex.getMessage()); + } + } + catch (IOException e) { + getLogger().debug( + "Cannot open source file to merge with: " + e.getMessage()); + } + } + finally { + if (writeWithoutMerging) { + // Have to delete the output file from this try and will try again without + // merging. Deleting is necessary as not to trigger the overwrite safeguard + // in JCasFileWriter_ImplBase + if ((docOS != null) && (docOS.getName() != null)) { + FileUtils.deleteQuietly(new File(docOS.getName())); + } + } + closeQuietly(docOS); + } + } + else { + getLogger().debug("Merging disabled"); + } + + // If merging failed or is disabled, go on without merging + if (writeWithoutMerging) { + OutputStream docOS = null; + try { + docOS = getOutputStream(aJCas, filenameSuffix); + casToTcfWriter(aJCas, docOS); + } + finally { + closeQuietly(docOS); + } + } + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + finally { + closeQuietly(docIS); + } + } + + /** + * Create TCF File from scratch + * + * @param aJCas + * the JCas. + * @param aOs + * the output stream. + * @throws WLFormatException + * if a TCF problem occurs. + */ + public void casToTcfWriter(JCas aJCas, OutputStream aOs) + throws WLFormatException + { + // create TextCorpus object, specifying its language from the aJcas Object + TextCorpusStored textCorpus = new TextCorpusStored(aJCas.getDocumentLanguage()); + + // create text annotation layer and add the string of the text into the layer + textCorpus.createTextLayer().addText(aJCas.getDocumentText()); + + new DKPro2Tcf().convert(aJCas, textCorpus); + + // write the annotated data object into the output stream + WLData wldata = new WLData(textCorpus); + wldata.setVersion(tcfVersion); + WLDObjector.write(wldata, aOs); + } + + /** + * Merge annotations from CAS into an existing TCF file. + * + * @param aIs + * the TCF file with an existing annotation layers + * @param aJCas + * an annotated CAS object + * @param aOs + * the output stream. + * @throws WLFormatException + * if a TCF problem occurs. + */ + public void casToTcfWriter(InputStream aIs, JCas aJCas, OutputStream aOs) + throws WLFormatException + { + // If we have annotations for these layers in the CAS, we rewrite those layers. + List layersToReplaceList = new ArrayList<>(); + if (exists(aJCas, POS.class) || !preserveIfEmpty) { + layersToReplaceList.add(POSTAGS); + } + if (exists(aJCas, Lemma.class) || !preserveIfEmpty) { + layersToReplaceList.add(LEMMAS); + } + if (exists(aJCas, SofaChangeAnnotation.class) || !preserveIfEmpty) { + layersToReplaceList.add(ORTHOGRAPHY); + } + if (exists(aJCas, NamedEntity.class) || !preserveIfEmpty) { + layersToReplaceList.add(NAMED_ENTITIES); + } + if (exists(aJCas, Dependency.class) || !preserveIfEmpty) { + layersToReplaceList.add(PARSING_DEPENDENCY); + } + if (exists(aJCas, CoreferenceChain.class) || !preserveIfEmpty) { + layersToReplaceList.add(REFERENCES); + } + + EnumSet layersToReplaceSet = EnumSet.copyOf(layersToReplaceList); + + // If these layers are present in the TCF file, we use them from there, otherwise + // we generate them + EnumSet layersToReadSet = EnumSet.of(TOKENS, SENTENCES); + + try (TextCorpusStreamedWithReplaceableLayers textCorpus = + new TextCorpusStreamedWithReplaceableLayers(aIs, layersToReadSet, + layersToReplaceSet, aOs)) { + new DKPro2Tcf().convert(aJCas, textCorpus); + } + } +} diff --git a/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/internal/DKPro2Tcf.java b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/internal/DKPro2Tcf.java new file mode 100644 index 0000000000..f287c00aee --- /dev/null +++ b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/internal/DKPro2Tcf.java @@ -0,0 +1,386 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tcf.internal; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; +import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; +import eu.clarin.weblicht.wlfxb.tc.api.CorrectionOperation; +import eu.clarin.weblicht.wlfxb.tc.api.DependencyParsingLayer; +import eu.clarin.weblicht.wlfxb.tc.api.LemmasLayer; +import eu.clarin.weblicht.wlfxb.tc.api.NamedEntitiesLayer; +import eu.clarin.weblicht.wlfxb.tc.api.OrthographyLayer; +import eu.clarin.weblicht.wlfxb.tc.api.PosTagsLayer; +import eu.clarin.weblicht.wlfxb.tc.api.Reference; +import eu.clarin.weblicht.wlfxb.tc.api.ReferencesLayer; +import eu.clarin.weblicht.wlfxb.tc.api.SentencesLayer; +import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus; +import eu.clarin.weblicht.wlfxb.tc.api.TokensLayer; +import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag; + +public class DKPro2Tcf +{ + private final Logger log = LoggerFactory.getLogger(getClass()); + + private static final String REL_TYPE_EXPLETIVE = "expletive"; + + public void convert(JCas aJCas, TextCorpus textCorpus) + { + write(aJCas, textCorpus); + } + + public void write(JCas aJCas, TextCorpus aTextCorpus) + { + Map tokensBeginPositionMap; + tokensBeginPositionMap = writeTokens(aJCas, aTextCorpus); + writeSentence(aJCas, aTextCorpus, tokensBeginPositionMap); + writePosTags(aJCas, aTextCorpus, tokensBeginPositionMap); + writeLemmas(aJCas, aTextCorpus, tokensBeginPositionMap); + writeOrthograph(aJCas, aTextCorpus); + writeDependency(aJCas, aTextCorpus, tokensBeginPositionMap); + writeNamedEntity(aJCas, aTextCorpus, tokensBeginPositionMap); + writeCoreference(aJCas, aTextCorpus, tokensBeginPositionMap); + } + + public Map writeTokens(JCas aJCas, + TextCorpus aTextCorpus) + { + boolean tokensLayerCreated = false; + + // Create tokens layer if it does not exist + TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); + if (tokensLayer == null) { + tokensLayer = aTextCorpus.createTokensLayer(); + tokensLayerCreated = true; + log.debug("Layer [{}]: created", TextCorpusLayerTag.TOKENS.getXmlName()); + } + else { + log.debug("Layer [{}]: found", TextCorpusLayerTag.TOKENS.getXmlName()); + } + + + Map tokensBeginPositionMap = + new HashMap<>(); + + int j = 0; + for (Token token : select(aJCas, Token.class)) { + if (tokensLayerCreated) { + if (token.getId() != null) { + // Assuming all of the tokens have IDs ... + tokensLayer.addToken(token.getCoveredText(), token.getBegin(), token.getEnd(), + token.getId()); + } + else { + // Assuming none of the tokens have IDs ... + tokensLayer.addToken(token.getCoveredText(), token.getBegin(), token.getEnd()); + } + } + + tokensBeginPositionMap.put(token.getBegin(), tokensLayer.getToken(j)); + j++; + } + + return tokensBeginPositionMap; + } + + public void writePosTags(JCas aJCas, TextCorpus aTextCorpus, + Map aTokensBeginPositionMap) + { + if (!JCasUtil.exists(aJCas, POS.class)) { + // Do nothing if there are no part-of-speech tags in the CAS + log.debug("Layer [{}]: empty", TextCorpusLayerTag.POSTAGS.getXmlName()); + return; + } + + // Tokens layer must already exist + TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); + + // create POS tag annotation layer + String posTagSet = "STTS"; + for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) { + if (tagSet.getLayer().equals(POS.class.getName())) { + posTagSet = tagSet.getName(); + break; + } + } + + PosTagsLayer posLayer = aTextCorpus.createPosTagsLayer(posTagSet); + + log.debug("Layer [{}]: created", TextCorpusLayerTag.POSTAGS.getXmlName()); + + int j = 0; + for (Token coveredToken : select(aJCas, Token.class)) { + POS pos = coveredToken.getPos(); + + if (pos != null && posLayer != null ) { + String posValue = coveredToken.getPos().getPosValue(); + posLayer.addTag(posValue, tokensLayer.getToken(j)); + } + + j++; + } + } + + public void writeLemmas(JCas aJCas, TextCorpus aTextCorpus, + Map aTokensBeginPositionMap) + { + if (!JCasUtil.exists(aJCas, Lemma.class)) { + // Do nothing if there are no lemmas in the CAS + log.debug("Layer [{}]: empty", TextCorpusLayerTag.LEMMAS.getXmlName()); + return; + } + + // Tokens layer must already exist + TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); + + // create lemma annotation layer + LemmasLayer lemmasLayer = aTextCorpus.createLemmasLayer(); + + log.debug("Layer [{}]: created", TextCorpusLayerTag.LEMMAS.getXmlName()); + + int j = 0; + for (Token coveredToken : select(aJCas, Token.class)) { + Lemma lemma = coveredToken.getLemma(); + if (lemma != null && lemmasLayer != null) { + String lemmaValue = coveredToken.getLemma().getValue(); + lemmasLayer.addLemma(lemmaValue, tokensLayer.getToken(j)); + } + j++; + } + + } + + public void writeOrthograph(JCas aJCas, TextCorpus aTextCorpus) { + if (!JCasUtil.exists(aJCas, SofaChangeAnnotation.class)) { + // Do nothing if there are no SofaChangeAnnotation layer + // (Which is equivalent to Orthography layer in TCF) in the CAS + log.debug("Layer [{}]: empty", TextCorpusLayerTag.ORTHOGRAPHY.getXmlName()); + return; + } + + // Tokens layer must already exist + TokensLayer tokensLayer = aTextCorpus.getTokensLayer(); + + // create orthographyLayer annotation layer + OrthographyLayer orthographyLayer = aTextCorpus.createOrthographyLayer(); + + log.debug("Layer [{}]: created", TextCorpusLayerTag.ORTHOGRAPHY.getXmlName()); + + int j = 0; + for (Token token : select(aJCas, Token.class)) { + List scas = selectCovered(aJCas, SofaChangeAnnotation.class, + token.getBegin(), token.getEnd()); + if (scas.size() > 0 && orthographyLayer != null) { + SofaChangeAnnotation change = scas.get(0); + + orthographyLayer.addCorrection(scas.get(0).getValue(), tokensLayer.getToken(j), + Optional.ofNullable(change.getOperation()).map(CorrectionOperation::valueOf) + .orElse(null)); + } + j++; + } + + } + + public void writeSentence(JCas aJCas, TextCorpus aTextCorpus, + Map aTokensBeginPositionMap) + { + // if not TCF file, add sentence layer (Sentence is required for BRAT) + SentencesLayer sentencesLayer = aTextCorpus.getSentencesLayer(); + if (sentencesLayer != null) { + log.debug("Layer [{}]: found", TextCorpusLayerTag.SENTENCES.getXmlName()); + return; + } + + sentencesLayer = aTextCorpus.createSentencesLayer(); + + log.debug("Layer [{}]: created", TextCorpusLayerTag.SENTENCES.getXmlName()); + + for (Sentence sentence : select(aJCas, Sentence.class)) { + List tokens = new ArrayList<>(); + for (Token token : selectCovered(Token.class, sentence)) { + tokens.add(aTokensBeginPositionMap.get(token.getBegin())); + } + sentencesLayer.addSentence(tokens); + } + } + + public void writeDependency(JCas aJCas, TextCorpus aTextCorpus, + Map aTokensBeginPositionMap) + { + if (!JCasUtil.exists(aJCas, Dependency.class)) { + // Do nothing if there are no dependencies in the CAS + log.debug("Layer [{}]: empty", TextCorpusLayerTag.PARSING_DEPENDENCY.getXmlName()); + return; + } + + DependencyParsingLayer dependencyParsingLayer = null; + String tagSetName = "tiger"; + for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) { + if (tagSet.getLayer().equals(Dependency.class.getName())) { + tagSetName = tagSet.getName(); + break; + } + } + + Optional hasNonBasic = select(aJCas, Dependency.class).stream() + .filter(dep -> dep.getFlavor() != null && + !DependencyFlavor.BASIC.equals(dep.getFlavor())) + .findAny(); + + dependencyParsingLayer = aTextCorpus.createDependencyParsingLayer(tagSetName, + hasNonBasic.isPresent(), true); + + log.debug("Layer [{}]: created", TextCorpusLayerTag.PARSING_DEPENDENCY.getXmlName()); + + for (Sentence s : select(aJCas, Sentence.class)) { + List deps = new ArrayList<>(); + for (Dependency d : selectCovered(Dependency.class, s)) { + eu.clarin.weblicht.wlfxb.tc.api.Dependency dependency = dependencyParsingLayer + .createDependency(d.getDependencyType(), + aTokensBeginPositionMap.get(d.getDependent().getBegin()), + aTokensBeginPositionMap.get(d.getGovernor().getBegin())); + + deps.add(dependency); + } + if (deps.size() > 0) { + dependencyParsingLayer.addParse(deps); + } + } + } + + public void writeNamedEntity(JCas aJCas, TextCorpus aTextCorpus, + Map aTokensBeginPositionMap) + { + if (!JCasUtil.exists(aJCas, NamedEntity.class)) { + // Do nothing if there are no named entities in the CAS + log.debug("Layer [{}]: empty", TextCorpusLayerTag.NAMED_ENTITIES.getXmlName()); + return; + } + + String tagSetName = "BART"; + for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) { + if (tagSet.getLayer().equals(NamedEntity.class.getName())) { + tagSetName = tagSet.getName(); + break; + } + } + + NamedEntitiesLayer namedEntitiesLayer = aTextCorpus.createNamedEntitiesLayer(tagSetName); + + log.debug("Layer [{}]: created", TextCorpusLayerTag.NAMED_ENTITIES.getXmlName()); + + for (NamedEntity namedEntity : select(aJCas, NamedEntity.class)) { + List tokensInCas = selectCovered(aJCas, Token.class, namedEntity.getBegin(), + namedEntity.getEnd()); + List tokensInTcf = new ArrayList<>(); + for (Token token : tokensInCas) { + tokensInTcf.add(aTokensBeginPositionMap.get(token.getBegin())); + } + namedEntitiesLayer.addEntity(namedEntity.getValue(), tokensInTcf); + } + } + + public void writeCoreference(JCas aJCas, TextCorpus aTextCorpus, + Map aTokensBeginPositionMap) + { + if (!JCasUtil.exists(aJCas, CoreferenceChain.class)) { + // Do nothing if there are no coreference chains in the CAS + log.debug("Layer [{}]: empty", TextCorpusLayerTag.REFERENCES.getXmlName()); + return; + } + + String tagSetName = "TueBaDz"; + for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) { + if (tagSet.getLayer().equals(CoreferenceLink.class.getName())) { + tagSetName = tagSet.getName(); + break; + } + } + + ReferencesLayer coreferencesLayer = aTextCorpus.createReferencesLayer(null, tagSetName, + null); + + log.debug("Layer [{}]: created", TextCorpusLayerTag.REFERENCES.getXmlName()); + + // Sort by begin to provide a more-or-less stable order for the unit tests + List chains = select(aJCas, CoreferenceChain.class) + .stream() + .filter(chain -> chain.getFirst() != null) + .sorted((a, b) -> a.getFirst().getBegin() - b.getFirst().getBegin()) + .collect(Collectors.toList()); + + for (CoreferenceChain chain : chains) { + CoreferenceLink prevLink = null; + Reference prevRef = null; + List refs = new ArrayList<>(); + for (CoreferenceLink link : chain.links()) { + // Get covered tokens + List tokens = new ArrayList<>(); + for (Token token : selectCovered(Token.class, link)) { + tokens.add(aTokensBeginPositionMap.get(token.getBegin())); + } + + // Create current reference + Reference ref = coreferencesLayer.createReference(link.getReferenceType(), tokens, + null); + + // Special handling for expletive relations + if (REL_TYPE_EXPLETIVE.equals(link.getReferenceRelation())) { + coreferencesLayer.addRelation(ref, REL_TYPE_EXPLETIVE); + // if the relation is expletive, then there must not be a next element in the + // chain, so we bail out here. + continue; + } + + // Create relation between previous and current reference + if (prevLink != null) { + coreferencesLayer.addRelation(prevRef, prevLink.getReferenceRelation(), ref); + } + + prevLink = link; + prevRef = ref; + refs.add(ref); + } + coreferencesLayer.addReferent(refs); + } + } +} diff --git a/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/internal/Tcf2DKPro.java b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/internal/Tcf2DKPro.java new file mode 100644 index 0000000000..64ba8e22dd --- /dev/null +++ b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/internal/Tcf2DKPro.java @@ -0,0 +1,496 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tcf.internal; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.TreeMap; + +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; +import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; +import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed; +import eu.clarin.weblicht.wlfxb.tc.api.CorrectionOperation; +import eu.clarin.weblicht.wlfxb.tc.api.DependencyParse; +import eu.clarin.weblicht.wlfxb.tc.api.DependencyParsingLayer; +import eu.clarin.weblicht.wlfxb.tc.api.Reference; +import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus; + +public class Tcf2DKPro +{ + private final Logger log = LoggerFactory.getLogger(getClass()); + + public void convert(TextCorpus aCorpusData, JCas aJCas) + { + convertText(aJCas, aCorpusData); + + Map tokens = convertTokens(aJCas, aCorpusData); + if (tokens.size() > 0) { + + convertPos(aJCas, aCorpusData, tokens); + + convertLemma(aJCas, aCorpusData, tokens); + + convertOrthoGraphy(aJCas, aCorpusData, tokens); + + convertSentences(aJCas, aCorpusData, tokens); + + convertDependencies(aJCas, aCorpusData, tokens); + + convertNamedEntities(aJCas, aCorpusData, tokens); + + convertCoreference(aJCas, aCorpusData, tokens); + } + } + + /** + * This method builds texts from the {@link eu.clarin.weblicht.wlfxb.tc.api.Token} annotation + * layer. The getText Method of {@link TextCorpusStreamed} is not used as some tokens, such as + * special characters represented differently than in the original text. + *

+ * If the CAS already contains a document text, it is kept. + *

+ * If the CAS already contains a document language, it is kept. + * + * @param aJCas + * the JCas. + * @param aCorpusData + * the TCF document. + */ + public void convertText(JCas aJCas, TextCorpus aCorpusData) + { + if (aJCas.getDocumentText() == null) { + StringBuilder text = new StringBuilder(); + + for (int i = 0; i < aCorpusData.getTokensLayer().size(); i++) { + eu.clarin.weblicht.wlfxb.tc.api.Token token = aCorpusData.getTokensLayer() + .getToken(i); + + if (token.getStart() != null && token.getEnd() != null) { + // Assuming all of the tokens have offset information... + while (text.length() < token.getStart()) { + text.append(" "); + } + } + else { + // Assuming none of the tokens has offset information... + if (i > 0) { + text.append(" "); + } + } + + text.append(token.getString()); + } + aJCas.setDocumentText(text.toString()); + } + + aJCas.setDocumentLanguage(aCorpusData.getLanguage()); + } + + /** + * Convert TCF Tokens Layer to CAS Token Annotation. + * + * @param aJCas + * the JCas. + * @param aCorpusData + * the TCF document. + * @return returns {@code Map} of (token_id, Token), for later references + */ + public Map convertTokens(JCas aJCas, TextCorpus aCorpusData) + { + if (aCorpusData.getTokensLayer() == null) { + // No layer to read from. + return new HashMap<>(); + } + + String text = aJCas.getDocumentText(); + + Token outToken; + int tokenBeginPosition = 0; + int tokenEndPosition; + Map tokens = new HashMap<>(); + + for (int i = 0; i < aCorpusData.getTokensLayer().size(); i++) { + + eu.clarin.weblicht.wlfxb.tc.api.Token token = aCorpusData.getTokensLayer().getToken(i); + + if (token.getStart() != null && token.getEnd() != null) { + // Assuming all of the tokens have offset information... + tokenBeginPosition = token.getStart().intValue(); + tokenEndPosition = token.getEnd().intValue(); + } + else { + // Assuming none of the tokens has offset information... + tokenBeginPosition = text.indexOf(token.getString(), tokenBeginPosition); + tokenEndPosition = text.indexOf(token.getString(), tokenBeginPosition) + + token.getString().length(); + } + + outToken = new Token(aJCas, tokenBeginPosition, tokenEndPosition); + if (token.getID() != null) { + outToken.setId(token.getID()); + } + outToken.addToIndexes(); + + tokens.put(token.getID(), outToken); + tokenBeginPosition = tokenEndPosition; + } + return tokens; + } + + public void convertPos(JCas aJCas, TextCorpus aCorpusData, Map aTokens) + { + if (aCorpusData.getPosTagsLayer() == null) { + return; + } + for (int i = 0; i < aCorpusData.getPosTagsLayer().size(); i++) { + eu.clarin.weblicht.wlfxb.tc.api.Token[] posTokens = aCorpusData.getPosTagsLayer() + .getTokens(aCorpusData.getPosTagsLayer().getTag(i)); + String value = aCorpusData.getPosTagsLayer().getTag(i).getString(); + + POS outPos = new POS(aJCas); + + outPos.setBegin(aTokens.get(posTokens[0].getID()).getBegin()); + outPos.setEnd(aTokens.get(posTokens[0].getID()).getEnd()); + outPos.setPosValue(value); + POSUtils.assignCoarseValue(outPos); + outPos.addToIndexes(); + + // Set the POS to the token + aTokens.get(posTokens[0].getID()).setPos(outPos); + } + } + + public void convertLemma(JCas aJCas, TextCorpus aCorpusData, Map aTokens) + { + if (aCorpusData.getLemmasLayer() == null) { + return; + } + for (int i = 0; i < aCorpusData.getLemmasLayer().size(); i++) { + eu.clarin.weblicht.wlfxb.tc.api.Token[] lemmaTokens = aCorpusData.getLemmasLayer() + .getTokens(aCorpusData.getLemmasLayer().getLemma(i)); + String value = aCorpusData.getLemmasLayer().getLemma(i).getString(); + + Lemma outLemma = new Lemma(aJCas); + + outLemma.setBegin(aTokens.get(lemmaTokens[0].getID()).getBegin()); + outLemma.setEnd(aTokens.get(lemmaTokens[0].getID()).getEnd()); + outLemma.setValue(value); + outLemma.addToIndexes(); + + // Set the lemma to the token + aTokens.get(lemmaTokens[0].getID()).setLemma(outLemma); + } + + } + + public void convertOrthoGraphy(JCas aJCas, TextCorpus aCorpusData, Map aTokens) + { + if (aCorpusData.getOrthographyLayer() == null) { + return; + } + + for (int i = 0; i < aCorpusData.getOrthographyLayer().size(); i++) { + eu.clarin.weblicht.wlfxb.tc.api.Token[] orthoTokens = aCorpusData.getOrthographyLayer() + .getTokens(aCorpusData.getOrthographyLayer().getCorrection(i)); + String value = aCorpusData.getOrthographyLayer().getCorrection(i).getString(); + String operation = Optional + .ofNullable(aCorpusData.getOrthographyLayer().getCorrection(i).getOperation()) + .map(CorrectionOperation::name).orElse(null); + + SofaChangeAnnotation ortho = new SofaChangeAnnotation(aJCas); + ortho.setBegin(aTokens.get(orthoTokens[0].getID()).getBegin()); + ortho.setEnd(aTokens.get(orthoTokens[0].getID()).getEnd()); + ortho.setValue(value); + ortho.setOperation(operation); + ortho.addToIndexes(); + } + } + + public void convertSentences(JCas aJCas, TextCorpus aCorpusData, + Map aTokens) + { + if (aCorpusData.getSentencesLayer() == null) { + // No layer to read from. + return; + } + + for (int i = 0; i < aCorpusData.getSentencesLayer().size(); i++) { + eu.clarin.weblicht.wlfxb.tc.api.Token[] sentencesTokens = aCorpusData + .getSentencesLayer().getTokens(aCorpusData.getSentencesLayer().getSentence(i)); + + Sentence outSentence = new Sentence(aJCas); + + outSentence.setBegin(aTokens.get(sentencesTokens[0].getID()).getBegin()); + outSentence.setEnd(aTokens.get(sentencesTokens[sentencesTokens.length - 1].getID()) + .getEnd()); + outSentence.addToIndexes(); + } + } + + public void convertDependencies(JCas aJCas, TextCorpus aCorpusData, + Map aTokens) + { + DependencyParsingLayer depLayer = aCorpusData.getDependencyParsingLayer(); + + if (depLayer == null) { + // No layer to read from. + return; + } + + for (int i = 0; i < depLayer.size(); i++) { + DependencyParse dependencyParse = depLayer.getParse(i); + for (eu.clarin.weblicht.wlfxb.tc.api.Dependency dependency : dependencyParse + .getDependencies()) { + + eu.clarin.weblicht.wlfxb.tc.api.Token[] governorTokens = depLayer + .getGovernorTokens(dependency); + eu.clarin.weblicht.wlfxb.tc.api.Token[] dependentTokens = depLayer + .getDependentTokens(dependency); + + POS dependentPos = aTokens.get(dependentTokens[0].getID()).getPos(); + + // For dependency annotations in the TCF file without POS, add as a default POS -- + if (dependentPos == null) { + log.warn("There is no pos for this token, added [--] as a pos"); + dependentPos = new POS(aJCas); + dependentPos.setBegin(aTokens.get(dependentTokens[0].getID()).getBegin()); + dependentPos.setEnd(aTokens.get(dependentTokens[0].getID()).getEnd()); + dependentPos.setPosValue("--"); + dependentPos.setCoarseValue("--"); + dependentPos.addToIndexes(); + aTokens.get(dependentTokens[0].getID()).setPos(dependentPos); + } + + if (governorTokens != null) { + POS governerPos = aTokens.get(governorTokens[0].getID()).getPos(); + if (governerPos == null) { + if (dependency.getFunction().equals("ROOT")) { + // do nothing + } + else { + log.warn("There is no pos for this token, added [--] as a pos"); + governerPos = new POS(aJCas); + governerPos.setBegin(aTokens.get(governorTokens[0].getID()).getBegin()); + governerPos.setEnd(aTokens.get(governorTokens[0].getID()).getEnd()); + governerPos.setPosValue("--"); + governerPos.addToIndexes(); + aTokens.get(governorTokens[0].getID()).setPos(governerPos); + } + } + } + else { + governorTokens = dependentTokens; + } + + // We set governorTokens = dependentTokens above for root nodes + if (governorTokens == dependentTokens) { + Dependency outDependency = new ROOT(aJCas); + outDependency.setDependencyType(dependency.getFunction()); + outDependency.setGovernor(aTokens.get(dependentTokens[0].getID())); + outDependency.setDependent(aTokens.get(dependentTokens[0].getID())); + outDependency.setBegin(outDependency.getDependent().getBegin()); + outDependency.setEnd(outDependency.getDependent().getEnd()); + outDependency.setFlavor(depLayer.hasMultipleGovernors() + ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); + outDependency.addToIndexes(); + + } + else { + Dependency outDependency = new Dependency(aJCas); + outDependency.setDependencyType(dependency.getFunction()); + outDependency.setGovernor(aTokens.get(governorTokens[0].getID())); + outDependency.setDependent(aTokens.get(dependentTokens[0].getID())); + outDependency.setBegin(outDependency.getDependent().getBegin()); + outDependency.setEnd(outDependency.getDependent().getEnd()); + outDependency.setFlavor(depLayer.hasMultipleGovernors() + ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); + outDependency.addToIndexes(); + } + } + } + } + + public void convertNamedEntities(JCas aJCas, TextCorpus aCorpusData, + Map aTokens) + { + if (aCorpusData.getNamedEntitiesLayer() == null) { + // No layer to read from. + return; + } + + for (int i = 0; i < aCorpusData.getNamedEntitiesLayer().size(); i++) { + // get the named entity + eu.clarin.weblicht.wlfxb.tc.api.NamedEntity entity = aCorpusData + .getNamedEntitiesLayer().getEntity(i); + + eu.clarin.weblicht.wlfxb.tc.api.Token[] namedEntityTokens = aCorpusData + .getNamedEntitiesLayer().getTokens(entity); + + NamedEntity outNamedEntity = new NamedEntity(aJCas); + + outNamedEntity.setBegin(getOffsets(namedEntityTokens, aTokens)[0]); + outNamedEntity.setEnd(getOffsets(namedEntityTokens, aTokens)[1]); + outNamedEntity.setValue(entity.getType()); + outNamedEntity.addToIndexes(); + } + + } + + /** + * Correferences in CAS should be represented {@link CoreferenceChain} and + * {@link CoreferenceLink}. The TCF representation Uses rel and target to build + * chains. Example:
+ * {@literal } + *
+ * {@literal
+ * }

+ * The first phase of conversion is getting all references and targets alongside + * the type and relations in different maps
+ * Second, an iteration is made through all the maps and the {@link CoreferenceChain} and + * {@link CoreferenceLink} annotations are constructed. + * + * @param aJCas + * the JCas. + * @param aCorpusData + * the TCF document. + * @param aTokens + * id/token map. + */ + public void convertCoreference(JCas aJCas, TextCorpus aCorpusData, + Map aTokens) + { + if (aCorpusData.getReferencesLayer() == null) { + // No layer to read from. + return; + } + for (int i = 0; i < aCorpusData.getReferencesLayer().size(); i++) { + eu.clarin.weblicht.wlfxb.tc.api.ReferencedEntity entity = aCorpusData + .getReferencesLayer().getReferencedEntity(i); + + Map referencesMap = new TreeMap(); + storeReferencesAndTargetsInMap(referencesMap, entity, aCorpusData, aTokens, aJCas); + + CoreferenceChain chain = new CoreferenceChain(aJCas); + CoreferenceLink link = null; + for (Integer address : referencesMap.keySet()) { + if (chain.getFirst() == null) { + chain.setFirst(referencesMap.get(address)); + link = chain.getFirst(); + chain.addToIndexes(); + } + else { + link.setNext(referencesMap.get(address)); + if (link.getReferenceRelation() == null) { + link.setReferenceRelation( + referencesMap.get(address).getReferenceRelation()); + } + link = link.getNext(); + link.addToIndexes(); + } + } + } + } + + public void storeReferencesAndTargetsInMap(Map aReferencesMap, + eu.clarin.weblicht.wlfxb.tc.api.ReferencedEntity entity, TextCorpus aCorpusData, + Map aTokens, JCas aJcas) + { + for (Reference reference : entity.getReferences()) { + StringBuilder sbTokens = new StringBuilder(); + for (eu.clarin.weblicht.wlfxb.tc.api.Token token : aCorpusData.getReferencesLayer() + .getTokens(reference)) { + sbTokens.append(token.getID()).append(" "); + } + + String[] referenceTokens = sbTokens.toString().split(" "); + int begin = getOffsets(referenceTokens, aTokens)[0]; + int end = getOffsets(referenceTokens, aTokens)[1]; + + CoreferenceLink link = new CoreferenceLink(aJcas); + link.setBegin(begin); + link.setEnd(end); + String referencesType = reference.getType() == null ? "nam" : reference.getType(); + link.setReferenceType(referencesType); + if (reference.getRelation() != null) { + link.setReferenceRelation(reference.getRelation()); + } + link.addToIndexes(); + aReferencesMap.put(link.getAddress(), link); + } + } + + /** + * Get the start and end offsets of a span annotation + * + * @param aSpanTokens + * list of span {@link eu.clarin.weblicht.wlfxb.tc.api.Token}s + * @param aAllTokens + * all available tokens in the file + * @return the offsets. + */ + public int[] getOffsets(eu.clarin.weblicht.wlfxb.tc.api.Token[] aSpanTokens, + Map aAllTokens) + { + List beginPositions = new ArrayList<>(); + List endPositions = new ArrayList<>(); + for (eu.clarin.weblicht.wlfxb.tc.api.Token token : aSpanTokens) { + beginPositions.add(aAllTokens.get(token.getID()).getBegin()); + endPositions.add(aAllTokens.get(token.getID()).getEnd()); + } + return new int[] { (Collections.min(beginPositions)), (Collections.max(endPositions)) }; + } + + /** + * Get the start and end offsets of a span annotation + * + * @param aSpanTokens + * list of span token ids. [t_3,_t_5, t_1] + * @param aAllTokens + * all available tokens in the file + * @return the offsets. + */ + public int[] getOffsets(String[] aSpanTokens, Map aAllTokens) + { + List beginPositions = new ArrayList<>(); + List endPositions = new ArrayList<>(); + for (String token : aSpanTokens) { + beginPositions.add(aAllTokens.get(token).getBegin()); + endPositions.add(aAllTokens.get(token).getEnd()); + } + return new int[] { (Collections.min(beginPositions)), (Collections.max(endPositions)) }; + } +} diff --git a/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/package-info.java b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/package-info.java new file mode 100644 index 0000000000..c46618a70d --- /dev/null +++ b/dkpro-core-io-tcf-asl/src/main/java/org/dkpro/core/io/tcf/package-info.java @@ -0,0 +1,21 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Provides Classes for reading/writing TCF data files. + */ +package org.dkpro.core.io.tcf; diff --git a/dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReaderWriterTest.java b/dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReaderWriterTest.java deleted file mode 100644 index 6009950c89..0000000000 --- a/dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReaderWriterTest.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tcf; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; -import static org.junit.Assert.assertEquals; - -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.component.CasDumpWriter; -import org.custommonkey.xmlunit.XMLAssert; -import org.junit.Rule; -import org.junit.Test; -import org.xml.sax.InputSource; - -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.EOLUtils; -import eu.clarin.weblicht.wlfxb.io.WLDObjector; -import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus; -import eu.clarin.weblicht.wlfxb.tc.api.TextCorpusLayer; -import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored; -import eu.clarin.weblicht.wlfxb.xb.WLData; - -public class TcfReaderWriterTest -{ - @Test - public void test1() - throws Exception - { - testOneWay("tcf-after.xml", "tcf-after-expected.xml"); - } - - @Test - public void testWithCmdMetadata() - throws Exception - { - testOneWay("tcf04-karin-wl.xml", "tcf04-karin-wl_expected.xml"); - } - - public void testOneWay(String aInputFile, String aExpectedFile) - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TcfReader.class, - TcfReader.PARAM_SOURCE_LOCATION, "src/test/resources/", - TcfReader.PARAM_PATTERNS, aInputFile); - - AnalysisEngineDescription writer = createEngineDescription( - TcfWriter.class, - TcfWriter.PARAM_TARGET_LOCATION, "target/test-output/oneway", - TcfWriter.PARAM_OVERWRITE, true, - TcfWriter.PARAM_FILENAME_EXTENSION, ".xml", - TcfWriter.PARAM_STRIP_EXTENSION, true); - - AnalysisEngineDescription dumper = createEngineDescription(CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/test-output/oneway/dump.txt"); - - runPipeline(reader, writer, dumper); - - InputStream isReference = new FileInputStream(new File("src/test/resources/" - + aExpectedFile)); - - InputStream isActual = new FileInputStream(new File("target/test-output/oneway/" - + aInputFile)); - - WLData wLDataReference = WLDObjector.read(isReference); - TextCorpusStored aCorpusDataReference = wLDataReference.getTextCorpus(); - - WLData wLDataActual = WLDObjector.read(isActual); - TextCorpusStored aCorpusDataActual = wLDataActual.getTextCorpus(); - - // check if layers maintained - assertEquals(aCorpusDataReference.getLayers().size(), aCorpusDataActual.getLayers().size()); - - // Check if every layers have the same number of annotations - for (TextCorpusLayer layer : aCorpusDataReference.getLayers()) { - assertEquals( - "Layer size mismatch in ["+layer.getClass().getName()+"]", - layer.size(), - getLayer(aCorpusDataActual, layer.getClass()).size()); - } - - XMLAssert.assertXMLEqual( - new InputSource("src/test/resources/" + aExpectedFile), - new InputSource(new File("target/test-output/oneway/" + aInputFile).getPath())); - } - - private static TextCorpusLayer getLayer(TextCorpus aCorpus, Class aLayerType) - { - for (TextCorpusLayer layer : aCorpus.getLayers()) { - if (layer.getClass().equals(aLayerType)) { - return layer; - } - } - throw new IllegalArgumentException("No layer of type [" + aLayerType.getName() + "]"); - } - - @Test - public void testRoundtrip() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TcfReader.class, - TcfReader.PARAM_SOURCE_LOCATION, "src/test/resources/", - TcfReader.PARAM_PATTERNS, "wlfxb.xml"); - - AnalysisEngineDescription writer = createEngineDescription( - TcfWriter.class, - TcfWriter.PARAM_TARGET_LOCATION, "target/test-output/roundtrip", - TcfWriter.PARAM_OVERWRITE, true, - TcfWriter.PARAM_FILENAME_EXTENSION, ".xml", - TcfWriter.PARAM_STRIP_EXTENSION, true); - - runPipeline(reader, writer); - - String reference = FileUtils.readFileToString( - new File("src/test/resources/wlfxb.xml"), "UTF-8"); - String actual = FileUtils.readFileToString( - new File("target/test-output/roundtrip/wlfxb.xml"), "UTF-8"); - reference = EOLUtils.normalizeLineEndings(reference); - actual = EOLUtils.normalizeLineEndings(actual); - assertEquals(reference, actual); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReaderTest.java b/dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfReaderTest.java similarity index 93% rename from dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReaderTest.java rename to dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfReaderTest.java index 63d50f4f23..43f2ae8db8 100644 --- a/dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfReaderTest.java +++ b/dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfReaderTest.java @@ -15,10 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tcf; +package org.dkpro.core.io.tcf; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; -import static org.apache.uima.fit.util.JCasUtil.*; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.junit.Assert.assertEquals; import java.util.ArrayList; @@ -27,12 +28,13 @@ import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.tcf.TcfReader; +import org.dkpro.core.testing.AssertAnnotations; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; public class TcfReaderTest { diff --git a/dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfReaderWriterTest.java b/dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfReaderWriterTest.java new file mode 100644 index 0000000000..0f50429a0b --- /dev/null +++ b/dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfReaderWriterTest.java @@ -0,0 +1,112 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tcf; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; + +import java.io.File; +import java.io.IOException; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.jcas.JCas; +import org.custommonkey.xmlunit.XMLAssert; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestOptions; +import org.junit.Rule; +import org.junit.Test; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; + +public class TcfReaderWriterTest +{ + @Test + public void test1() + throws Exception + { + testOneWay( + createReaderDescription(TcfReader.class), + createEngineDescription(TcfWriter.class, + TcfWriter.PARAM_MERGE, false, + TcfWriter.PARAM_FILENAME_EXTENSION, ".xml"), + "tcf-after-expected.xml", + "tcf-after.xml", + new TestOptions().keepDocumentMetadata().resultAssertor(this::assertXmlEquals)); + } + + @Test + public void testWithCmdMetadata() + throws Exception + { + testOneWay( + createReaderDescription(TcfReader.class), + createEngineDescription(TcfWriter.class, + TcfWriter.PARAM_FILENAME_EXTENSION, ".xml"), + "tcf04-karin-wl_expected.xml", + "tcf04-karin-wl.xml", + new TestOptions() + .keepDocumentMetadata() + .resultAssertor(this::assertXmlEquals) + // To spot-check if replaced layers enter into the output, we reverse the + // POS tags. + .processor(createEngineDescription(PosReplacer.class))); + } + + @Test + public void testRoundtrip() + throws Exception + { + testRoundTrip( + createReaderDescription(TcfReader.class), + createEngineDescription(TcfWriter.class, + TcfWriter.PARAM_MERGE, false, + TcfWriter.PARAM_FILENAME_EXTENSION, ".xml"), + "wlfxb_expected.xml", + new TestOptions().keepDocumentMetadata().resultAssertor(this::assertXmlEquals)); + } + + public static class PosReplacer extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + aJCas.select(POS.class).forEach(pos -> pos + .setPosValue(new StringBuilder(pos.getPosValue()).reverse().toString())); + } + } + + private void assertXmlEquals(File expected, File actual) + { + try { + XMLAssert.assertXMLEqual( + new InputSource(expected.getPath()), + new InputSource(actual.getPath())); + } + catch (SAXException | IOException e) { + throw new RuntimeException(e); + } + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfWriterTest.java b/dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfWriterTest.java similarity index 91% rename from dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfWriterTest.java rename to dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfWriterTest.java index 9ea5d406a0..d38db26d49 100644 --- a/dkpro-core-io-tcf-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tcf/TcfWriterTest.java +++ b/dkpro-core-io-tcf-asl/src/test/java/org/dkpro/core/io/tcf/TcfWriterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tcf; +package org.dkpro.core.io.tcf; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -28,12 +28,14 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.tcf.TcfReader; +import org.dkpro.core.io.tcf.TcfWriter; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class TcfWriterTest { @@ -54,7 +56,8 @@ public void testOriginalNotTcf() // Generate a fake metadata that points to a non-TCF file DocumentMetaData meta = DocumentMetaData.create(jcas); meta.setDocumentBaseUri(new File("src/test/resources").toURI().toURL().toString()); - meta.setDocumentUri(new File("src/test/resources/not-a-tcf-file.txt").toURI().toURL().toString()); + meta.setDocumentUri( + new File("src/test/resources/not-a-tcf-file.txt").toURI().toURL().toString()); // Add some content jcas.setDocumentText("okeydokey"); diff --git a/dkpro-core-io-tcf-asl/src/test/resources/log4j.properties b/dkpro-core-io-tcf-asl/src/test/resources/log4j.properties deleted file mode 100644 index e227765ec0..0000000000 --- a/dkpro-core-io-tcf-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -#log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-tcf-asl/src/test/resources/log4j2.xml b/dkpro-core-io-tcf-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-tcf-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-tcf-asl/src/test/resources/tcf-after-expected.xml b/dkpro-core-io-tcf-asl/src/test/resources/tcf-after-expected.xml index f53e39194a..0493ca4d79 100644 --- a/dkpro-core-io-tcf-asl/src/test/resources/tcf-after-expected.xml +++ b/dkpro-core-io-tcf-asl/src/test/resources/tcf-after-expected.xml @@ -1,46 +1,41 @@ - - IMS, Uni Stuttgart - - - Sie sind gegen den Euro, gegen Ausländer und Abtreibungen: Bei - der Parlamentswahl in Finnland haben die "Wahren Finnen" riesige - Gewinne erzielt. - - - Sie - sind - gegen - den - Euro - , - gegen - Ausländer - und - Abtreibungen - : - Bei - der - Parlamentswahl - in - Finnland - haben - die - " - Wahren - Finnen - " - riesige - Gewinne - erzielt - . + + + Sie sind gegen den Euro , gegen Ausländer und Abtreibungen : Bei der Parlamentswahl in Finnland haben die " Wahren Finnen " riesige Gewinne erzielt . + + Sie + sind + gegen + den + Euro + , + gegen + Ausländer + und + Abtreibungen + : + Bei + der + Parlamentswahl + in + Finnland + haben + die + " + Wahren + Finnen + " + riesige + Gewinne + erzielt + . - - + + - + Sie|sie|sie sein gegen @@ -68,7 +63,7 @@ erzielen . - + PPER VAFIN APPR @@ -96,11 +91,11 @@ VVPP $. - + - + \ No newline at end of file diff --git a/dkpro-core-io-tcf-asl/src/test/resources/tcf04-karin-wl_expected.xml b/dkpro-core-io-tcf-asl/src/test/resources/tcf04-karin-wl_expected.xml index e5d12a957a..2378f66e4d 100644 --- a/dkpro-core-io-tcf-asl/src/test/resources/tcf04-karin-wl_expected.xml +++ b/dkpro-core-io-tcf-asl/src/test/resources/tcf04-karin-wl_expected.xml @@ -1,4 +1,4 @@ - + @@ -292,27 +292,24 @@ 2 4 2 - + http://www.geonames.org/ - + ˈuːɐ̯ˌlaʊ̯p - + - - - - - - + + + + + + - - Karina - Karin fliegen @@ -328,18 +325,18 @@ -- - NE - VVFIN - APPR - NE - NE - $. - PPER - VMFIN - ADV + EN + NIFVV + RPPA + EN + EN + .$ + REPP + NIFMV + VDA NN - VVINF - $. + FNIVV + .$ @@ -373,5 +370,8 @@ + + Karina + \ No newline at end of file diff --git a/dkpro-core-io-tcf-asl/src/test/resources/wlfxb_expected.xml b/dkpro-core-io-tcf-asl/src/test/resources/wlfxb_expected.xml new file mode 100644 index 0000000000..0493ca4d79 --- /dev/null +++ b/dkpro-core-io-tcf-asl/src/test/resources/wlfxb_expected.xml @@ -0,0 +1,101 @@ + + + + + + Sie sind gegen den Euro , gegen Ausländer und Abtreibungen : Bei der Parlamentswahl in Finnland haben die " Wahren Finnen " riesige Gewinne erzielt . + + Sie + sind + gegen + den + Euro + , + gegen + Ausländer + und + Abtreibungen + : + Bei + der + Parlamentswahl + in + Finnland + haben + die + " + Wahren + Finnen + " + riesige + Gewinne + erzielt + . + + + + + + Sie|sie|sie + sein + gegen + d + Euro + , + gegen + Ausländer + und + Abtreibung + : + bei + d + Parlamentswahl + in + Finnland + haben + d + " + wahr + Finne + " + riesig + Gewinn + erzielen + . + + + PPER + VAFIN + APPR + ART + NN + $, + APPR + NN + KON + NN + $. + APPR + ART + NN + APPR + NE + VAFIN + ART + $( + ADJA + NN + $( + ADJA + NN + VVPP + $. + + + + + + + + + \ No newline at end of file diff --git a/dkpro-core-io-tei-asl/pom.xml b/dkpro-core-io-tei-asl/pom.xml index 602784de58..2e370eaaae 100644 --- a/dkpro-core-io-tei-asl/pom.xml +++ b/dkpro-core-io-tei-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.tei-asl + dkpro-core-io-tei-asl jar DKPro Core ASL - IO - TEI + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -48,45 +49,44 @@ jaxen - dom4j + org.dom4j dom4j xml-apis xml-apis - 1.3.02 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.ner-asl + org.dkpro.core + dkpro-core-api-ner-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl net.java.dev.stax-utils @@ -100,29 +100,38 @@ + + eu.openminted.share.annotations + omtd-share-annotations-api + junit junit test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.assertj + assertj-core test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.imscwb-asl + org.dkpro.core + dkpro-core-io-text-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl + org.dkpro.core + dkpro-core-io-imscwb-asl + test + + + org.dkpro.core + dkpro-core-opennlp-asl test @@ -154,9 +163,9 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT pom import diff --git a/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReader.java b/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReader.java deleted file mode 100644 index 5c9b1bfd49..0000000000 --- a/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReader.java +++ /dev/null @@ -1,622 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tei; - -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_FUNCTION; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_LEMMA; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_POS; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_TYPE; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_CHARACTER; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_MULTIWORD; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_PARAGRAPH; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_PHRASE; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_RS; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_SUNIT; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_TEI_DOC; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_TEXT; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_TITLE; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_U; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_WORD; -import static java.util.Arrays.asList; -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.commons.lang3.StringUtils.isNotBlank; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Stack; -import java.util.zip.GZIPInputStream; - -import org.apache.commons.io.FilenameUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.cas.Type; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.FSCollectionFactory; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Logger; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Element; -import org.dom4j.io.SAXReader; -import org.dom4j.io.SAXWriter; -import org.jaxen.JaxenException; -import org.jaxen.XPath; -import org.jaxen.dom4j.Dom4jXPath; -import org.xml.sax.Attributes; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; - -/** - * Reader for the TEI XML. - */ -@ResourceMetaData(name="TEI XML Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_TEI_XML}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"}) -public class TeiReader - extends ResourceCollectionReaderBase -{ - /** - * Write token annotations to the CAS. - */ - public static final String PARAM_READ_TOKEN = ComponentParameters.PARAM_READ_TOKEN; - @ConfigurationParameter(name = PARAM_READ_TOKEN, mandatory = true, defaultValue = "true") - private boolean readToken; - - /** - * Write part-of-speech annotations to the CAS. - */ - public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; - @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") - private boolean readPOS; - - /** - * Write lemma annotations to the CAS. - */ - public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; - @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") - private boolean readLemma; - - /** - * Write sentence annotations to the CAS. - */ - public static final String PARAM_READ_SENTENCE = ComponentParameters.PARAM_READ_SENTENCE; - @ConfigurationParameter(name = PARAM_READ_SENTENCE, mandatory = true, defaultValue = "true") - private boolean readSentence; - - /** - * Write constituent annotations to the CAS. - */ - public static final String PARAM_READ_CONSTITUENT = ComponentParameters.PARAM_READ_CONSTITUENT; - @ConfigurationParameter(name = PARAM_READ_CONSTITUENT, mandatory = true, defaultValue = "true") - private boolean readConstituent; - - /** - * Write named entity annotations to the CAS. - */ - public static final String PARAM_READ_NAMED_ENTITY = ComponentParameters.PARAM_READ_NAMED_ENTITY; - @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") - private boolean readNamedEntity; - - /** - * Write paragraphs annotations to the CAS. - */ - public static final String PARAM_READ_PARAGRAPH = "readParagraph"; - @ConfigurationParameter(name = PARAM_READ_PARAGRAPH, mandatory = true, defaultValue = "true") - private boolean readParagraph; - - /** - * Use the xml:id attribute on the TEI elements as document ID. Mind that many TEI files - * may not have this attribute on all TEI elements and you may end up with no document ID - * at all. Also mind that the IDs should be unique. - */ - public static final String PARAM_USE_XML_ID = "useXmlId"; - @ConfigurationParameter(name = PARAM_USE_XML_ID, mandatory = true, defaultValue = "false") - private boolean useXmlId; - - /** - * When not using the XML ID, use only the filename instead of the whole URL as ID. Mind that - * the filenames should be unique in this case. - */ - public static final String PARAM_USE_FILENAME_ID = "useFilenameId"; - @ConfigurationParameter(name = PARAM_USE_FILENAME_ID, mandatory = true, defaultValue = "false") - private boolean useFilenameId; - - /** - * Do not write ignoreable whitespace from the XML file to the CAS. - */ - // REC: This does not seem to work. Maybe because SAXWriter does not generate this event? - public static final String PARAM_OMIT_IGNORABLE_WHITESPACE = "omitIgnorableWhitespace"; - @ConfigurationParameter(name = PARAM_OMIT_IGNORABLE_WHITESPACE, mandatory = true, defaultValue = "false") - private boolean omitIgnorableWhitespace; - - /** - * Location of the mapping file for part-of-speech tags to UIMA types. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String mappingPosLocation; - - /** - * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. This can be useful if a custom model is - * specified which does not have such meta data, or it can be used in readers. - */ - public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; - @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) - protected String posTagset; - - /** - * Interpret utterances "u" as sentenes "s". (EXPERIMENTAL) - */ - public static final String PARAM_UTTERANCES_AS_SENTENCES = "utterancesAsSentences"; - @ConfigurationParameter(name = PARAM_UTTERANCES_AS_SENTENCES, mandatory = true, defaultValue = "false") - private boolean utterancesAsSentences; - - private Iterator teiElementIterator; - private Element currentTeiElement; - private Resource currentResource; - private int currentTeiElementNumber; - - private MappingProvider posMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - if (readPOS && !readToken) { - throw new ResourceInitializationException(new IllegalArgumentException( - "Setting readPOS to 'true' requires writeToken to be 'true' too.")); - } - - try { - // Init with an empty iterator - teiElementIterator = asList(new Element[0]).iterator(); - - // Make sure we know about the first element; - nextTeiElement(); - } - catch (CollectionException | IOException e) { - throw new ResourceInitializationException(e); - } - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation, - posTagset, getLanguage()); - } - - private void nextTeiElement() throws CollectionException, IOException - { - if (teiElementIterator == null) { - currentTeiElement = null; - return; - } - - while (!teiElementIterator.hasNext() && super.hasNext()) { - currentResource = nextFile(); - - InputStream is = null; - try { - is = currentResource.getInputStream(); - - if (currentResource.getPath().endsWith(".gz")) { - is = new GZIPInputStream(is); - } - - InputSource source = new InputSource(is); - source.setPublicId(currentResource.getLocation()); - source.setSystemId(currentResource.getLocation()); - - SAXReader reader = new SAXReader(); - Document xml = reader.read(source); - - final XPath teiPath = new Dom4jXPath("//tei:TEI"); - teiPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0"); - - List teiElements = teiPath.selectNodes(xml); - -// System.out.printf("Found %d TEI elements in %s.%n", teiElements.size(), -// currentResource.getLocation()); - - teiElementIterator = teiElements.iterator(); - currentTeiElementNumber = 0; - } - catch (DocumentException e) { - throw new IOException(e); - } - catch (JaxenException e) { - throw new IOException(e); - } - finally { - closeQuietly(is); - } - } - - currentTeiElement = teiElementIterator.hasNext() ? teiElementIterator.next() : null; - currentTeiElementNumber++; - - if (!super.hasNext() && !teiElementIterator.hasNext()) { - // Mark end of processing. - teiElementIterator = null; - } - } - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - return teiElementIterator != null || currentTeiElement != null; - } - - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - initCas(aCAS, currentResource); - - // Set up language - if (getConfigParameterValue(PARAM_LANGUAGE) != null) { - aCAS.setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE)); - } - - // Configure mapping only now, because now the language is set in the CAS - try { - posMappingProvider.configure(aCAS); - } - catch (AnalysisEngineProcessException e1) { - throw new IOException(e1); - } - - InputStream is = null; - - try { - JCas jcas = aCAS.getJCas(); - - // Create handler - Handler handler = newSaxHandler(); - handler.setJCas(jcas); - handler.setLogger(getLogger()); - - // Parse TEI text - SAXWriter writer = new SAXWriter(handler); - writer.write(currentTeiElement); - handler.endDocument(); - } - catch (CASException e) { - throw new CollectionException(e); - } - catch (SAXException e) { - throw new IOException(e); - } - finally { - closeQuietly(is); - } - - // Move currentTeiElement to the next text - nextTeiElement(); - } - - protected Handler newSaxHandler() - { - return new TeiHandler(); - } - - protected abstract static class Handler - extends DefaultHandler - { - private JCas jcas; - private Logger logger; - - public void setJCas(final JCas aJCas) - { - jcas = aJCas; - } - - protected JCas getJCas() - { - return jcas; - } - - public void setLogger(Logger aLogger) - { - logger = aLogger; - } - - public Logger getLogger() - { - return logger; - } - } - - public class TeiHandler - extends Handler - { - private String documentId = null; - private boolean titleSet = false; - private boolean inTextElement = false; - private boolean captureText = false; - private int paragraphStart = -1; - private int sentenceStart = -1; - private int tokenStart = -1; - private String posTag = null; - private String lemma = null; - private Stack constituents = new Stack<>(); - private Stack namedEntities = new Stack<>(); - - private final StringBuilder buffer = new StringBuilder(); - - @Override - public void endDocument() - throws SAXException - { - getJCas().setDocumentText(buffer.toString()); - } - - protected StringBuilder getBuffer() - { - return buffer; - } - - @Override - public void startElement(String aUri, String aLocalName, String aName, - Attributes aAttributes) - throws SAXException - { -// System.out.printf("%b START %s %n", captureText, aLocalName); - if (!inTextElement && TAG_TEI_DOC.equals(aName)) { - if (useXmlId) { - documentId = aAttributes.getValue("xml:id"); - } - else if (useFilenameId) { - documentId = FilenameUtils.getName(currentResource.getPath()) + "#" - + currentTeiElementNumber; - } - else { - documentId = currentResource.getPath()+"#"+currentTeiElementNumber; - } - } - else if (!inTextElement && TAG_TITLE.equals(aName)) { - captureText = true; - } - else if (TAG_TEXT.equals(aName)) { - captureText = true; - inTextElement = true; - } - else if (inTextElement && (TAG_SUNIT.equals(aName) || - (utterancesAsSentences && TAG_U.equals(aName)))) { - sentenceStart = getBuffer().length(); - } - else if (inTextElement && TAG_PARAGRAPH.equals(aName)) { - paragraphStart = getBuffer().length(); - } - else if (readNamedEntity && inTextElement && TAG_RS.equals(aName)) { - NamedEntity ne = new NamedEntity(getJCas()); - ne.setBegin(getBuffer().length()); - ne.setValue(aAttributes.getValue(ATTR_TYPE)); - namedEntities.push(ne); - } - else if (readConstituent && inTextElement && TAG_PHRASE.equals(aName)) { - if (constituents.isEmpty()) { - ROOT root = new ROOT(getJCas()); - root.setBegin(getBuffer().length()); - root.setConstituentType("ROOT"); - constituents.push(new ConstituentWrapper(root)); - } - - Constituent constituent = new Constituent(getJCas()); - constituent.setBegin(getBuffer().length()); - constituent.setConstituentType(aAttributes.getValue(ATTR_TYPE)); - constituent.setSyntacticFunction(aAttributes.getValue(ATTR_FUNCTION)); - constituents.push(new ConstituentWrapper(constituent)); - } - else if (inTextElement - && (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName) || TAG_MULTIWORD - .equals(aName))) { - tokenStart = getBuffer().length(); - if (StringUtils.isNotEmpty(aAttributes.getValue(ATTR_POS))) { - posTag = aAttributes.getValue(ATTR_POS); - } - else { - posTag = aAttributes.getValue(ATTR_TYPE); - } - lemma = aAttributes.getValue(ATTR_LEMMA); - } - } - - @Override - public void endElement(String aUri, String aLocalName, String aName) - throws SAXException - { -// System.out.printf("%b END %s %n", captureText, aLocalName); - if (!inTextElement && TAG_TITLE.equals(aName)) { - DocumentMetaData meta = DocumentMetaData.get(getJCas()); - // Read only the first title and hope it is the main title - if (!titleSet) { - meta.setDocumentTitle(getBuffer().toString().trim()); - titleSet = true; - } - meta.setDocumentId(documentId); - getBuffer().setLength(0); - captureText = false; - } - else if (TAG_TEXT.equals(aName)) { - captureText = false; - inTextElement = false; - } - else if (inTextElement && (TAG_SUNIT.equals(aName) || - (utterancesAsSentences && TAG_U.equals(aName)))) { - if (readSentence) { - new Sentence(getJCas(), sentenceStart, getBuffer().length()).addToIndexes(); - } - sentenceStart = -1; - } - else if (inTextElement && TAG_PARAGRAPH.equals(aName)) { - if (readParagraph) { - new Paragraph(getJCas(), paragraphStart, getBuffer().length()).addToIndexes(); - } - paragraphStart = -1; - } - else if (readNamedEntity && inTextElement && TAG_RS.equals(aName)) { - NamedEntity ne = namedEntities.pop(); - ne.setEnd(getBuffer().length()); - ne.addToIndexes(); - } - else if (readConstituent && inTextElement && TAG_PHRASE.equals(aName)) { - ConstituentWrapper wrapper = constituents.pop(); - wrapper.constituent.setEnd(getBuffer().length()); - if (!constituents.isEmpty()) { - ConstituentWrapper parent = constituents.peek(); - wrapper.constituent.setParent(parent.constituent); - parent.children.add(wrapper.constituent); - } - wrapper.constituent.setChildren(FSCollectionFactory.createFSArray(getJCas(), - wrapper.children)); - wrapper.constituent.addToIndexes(); - - // Close off the ROOT - if (constituents.peek().constituent instanceof ROOT) { - ConstituentWrapper rootWrapper = constituents.pop(); - rootWrapper.constituent.setEnd(getBuffer().length()); - rootWrapper.constituent.setChildren(FSCollectionFactory.createFSArray( - getJCas(), rootWrapper.children)); - rootWrapper.constituent.addToIndexes(); - } - } - else if (inTextElement - && (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName) || TAG_MULTIWORD - .equals(aName))) { - if (isNotBlank(getBuffer().substring(tokenStart, getBuffer().length()))) { - Token token = new Token(getJCas(), tokenStart, getBuffer().length()); - trim(token); - - if (posTag != null && readPOS) { - Type posTagType = posMappingProvider.getTagType(posTag); - POS pos = (POS) getJCas().getCas().createAnnotation(posTagType, - token.getBegin(), token.getEnd()); - pos.setPosValue(posTag); - POSUtils.assignCoarseValue(pos); - pos.addToIndexes(); - token.setPos(pos); - } - - if (lemma != null && readLemma) { - Lemma l = new Lemma(getJCas(), token.getBegin(), token.getEnd()); - l.setValue(lemma); - l.addToIndexes(); - token.setLemma(l); - } - - // FIXME: if readToken is disabled, the JCas wrapper should not be generated - // at all! - if (readToken) { - if (!constituents.isEmpty()) { - ConstituentWrapper parent = constituents.peek(); - token.setParent(parent.constituent); - parent.children.add(token); - } - - token.addToIndexes(); - } - } - - tokenStart = -1; - } - } - - @Override - public void characters(char[] aCh, int aStart, int aLength) - throws SAXException - { - if (captureText) { - buffer.append(aCh, aStart, aLength); - } - } - - @Override - public void ignorableWhitespace(char[] aCh, int aStart, int aLength) - throws SAXException - { - if (captureText && !omitIgnorableWhitespace) { - buffer.append(aCh, aStart, aLength); - } - } - - private void trim(Annotation aAnnotation) - { - StringBuilder buffer = getBuffer(); - int s = aAnnotation.getBegin(); - int e = aAnnotation.getEnd(); - while (Character.isWhitespace(buffer.charAt(s))) { - s++; - } - while ((e > s+1) && Character.isWhitespace(buffer.charAt(e-1))) { - e--; - } - aAnnotation.setBegin(s); - aAnnotation.setEnd(e); - } - } - - private static class ConstituentWrapper { - public Constituent constituent; - public List children = new ArrayList(); - - public ConstituentWrapper(Constituent aConstituent) - { - constituent = aConstituent; - } - } -} diff --git a/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/package-info.java b/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/package-info.java deleted file mode 100644 index abfd916830..0000000000 --- a/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for TEI XML. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tei; diff --git a/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/TeiReader.java b/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/TeiReader.java new file mode 100644 index 0000000000..10f8353fe6 --- /dev/null +++ b/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/TeiReader.java @@ -0,0 +1,715 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tei; + +import static java.util.Arrays.asList; +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.commons.lang3.StringUtils.isNotBlank; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; +import static org.dkpro.core.io.tei.internal.TeiConstants.ATTR_FUNCTION; +import static org.dkpro.core.io.tei.internal.TeiConstants.ATTR_LEMMA; +import static org.dkpro.core.io.tei.internal.TeiConstants.ATTR_POS; +import static org.dkpro.core.io.tei.internal.TeiConstants.ATTR_TYPE; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_CHARACTER; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_MULTIWORD; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_PARAGRAPH; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_PHRASE; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_RS; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_SUNIT; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_TEI_DOC; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_TEXT; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_TITLE; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_U; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_WORD; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.Stack; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.Type; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.FSCollectionFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Logger; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.MappingProvider; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; +import org.dom4j.io.SAXWriter; +import org.jaxen.JaxenException; +import org.jaxen.XPath; +import org.jaxen.dom4j.Dom4jXPath; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reader for the TEI XML. + *

Supported formats
SERIALIZED_TSI or S+CAS structures are dumped to disc as they are using Java serialization as in form 0, but - * now using the {@link CASCompleteSerializer} which includes CAS metadata like type system and - * index repositories.CAS structures are dumped to disc as they are using Java serialization as in form 0, but now + * using the {@link CASCompleteSerializer} which includes CAS metadata like type system and index + * repositories.is reinitializedyes
BINARY_TSI or 0The same as BINARY_TSI, except that the type system and index configuration - * are also stored in the file. However, lenient loading or reinitalizing the CAS with this - * information is presently not supported.The same as BINARY_TSI, except that the type system and index configuration are + * also stored in the file. However, lenient loading or reinitalizing the CAS with this information + * is presently not supported.must be the sameyes
COMPRESSED or 4 - * UIMA binary serialization saving all feature structures (reachable or not). This format + * UIMA binary serialization saving all feature structures (reachable or not). This format * internally uses gzip compression and a binary representation of the CAS, making it much more * efficient than format 0.must be the same
COMPRESSED_FILTERED or 6 - * UIMA binary serialization as format 4, but saving only reachable feature structures.UIMA binary serialization as format 4, but saving only reachable feature structures.must be the sameno
6+ - * This is a legacy format specific to DKPro Core. Since UIMA 2.9.0, COMPRESSED_FILTERED_TSI - * is supported and should be used instead of this format. UIMA binary serialization as format 6, - * but also contains the type system definition. This allows the {@link BinaryCasReader} to load data - * leniently into a CAS that has been initialized with a different type system.This is a legacy format specific to DKPro Core. Since UIMA 2.9.0, + * COMPRESSED_FILTERED_TSI is supported and should be used instead of this format. UIMA + * binary serialization as format 6, but also contains the type system definition. This allows the + * {@link BinaryCasReader} to load data leniently into a CAS that has been initialized with a + * different type system.lenient loadingno
COMPRESSED_FILTERED_TS - * Same as COMPRESSED_FILTERED, but also contains the type system definition. This + * Same as COMPRESSED_FILTERED, but also contains the type system definition. This * allows the {@link BinaryCasReader} to load data leniently into a CAS that has been initialized * with a different type system.lenient loading
COMPRESSED_FILTERED_TSI - * Default. UIMA binary serialization as format 6, but also contains the type system + * Default. UIMA binary serialization as format 6, but also contains the type system * definition and index definitions. This allows the {@link BinaryCasReader} to load data leniently * into a CAS that has been initialized with a different type system.lenient loading
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
Supported TEI XML elements and attributes
ElementDescriptionDKPro Core typeAttribute mappings
TEIdocument boundarygetNext(...) returns one TEI document at a time
titledocument titleDocumentMetaData
ss-unitSentence
uutteranceSentence
pparagraphParagraph
rsreferencing stringNamedEntitytype -> value
phrphraseConstituenttype -> constituentType, function -> syntacticFunction
wwordToken(pos, type) -> POS.PosValue (pos preferred over + * type)
mwmulti-wordTokensame as for w
ccharacterTokensame as for w
+ */ +@ResourceMetaData(name = "TEI XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_TEI_XML}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"}) +public class TeiReader + extends ResourceCollectionReaderBase +{ + /** + * Write token annotations to the CAS. + */ + public static final String PARAM_READ_TOKEN = ComponentParameters.PARAM_READ_TOKEN; + @ConfigurationParameter(name = PARAM_READ_TOKEN, mandatory = true, defaultValue = "true") + private boolean readToken; + + /** + * Write part-of-speech annotations to the CAS. + */ + public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; + @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") + private boolean readPOS; + + /** + * Write lemma annotations to the CAS. + */ + public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; + @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") + private boolean readLemma; + + /** + * Write sentence annotations to the CAS. + */ + public static final String PARAM_READ_SENTENCE = ComponentParameters.PARAM_READ_SENTENCE; + @ConfigurationParameter(name = PARAM_READ_SENTENCE, mandatory = true, defaultValue = "true") + private boolean readSentence; + + /** + * Write constituent annotations to the CAS. + */ + public static final String PARAM_READ_CONSTITUENT = ComponentParameters.PARAM_READ_CONSTITUENT; + @ConfigurationParameter(name = PARAM_READ_CONSTITUENT, mandatory = true, defaultValue = "true") + private boolean readConstituent; + + /** + * Write named entity annotations to the CAS. + */ + public static final String PARAM_READ_NAMED_ENTITY = + ComponentParameters.PARAM_READ_NAMED_ENTITY; + @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") + private boolean readNamedEntity; + + /** + * Write paragraphs annotations to the CAS. + */ + public static final String PARAM_READ_PARAGRAPH = "readParagraph"; + @ConfigurationParameter(name = PARAM_READ_PARAGRAPH, mandatory = true, defaultValue = "true") + private boolean readParagraph; + + /** + * Use the xml:id attribute on the TEI elements as document ID. Mind that many TEI files + * may not have this attribute on all TEI elements and you may end up with no document ID + * at all. Also mind that the IDs should be unique. + */ + public static final String PARAM_USE_XML_ID = "useXmlId"; + @ConfigurationParameter(name = PARAM_USE_XML_ID, mandatory = true, defaultValue = "false") + private boolean useXmlId; + + /** + * When not using the XML ID, use only the filename instead of the whole URL as ID. Mind that + * the filenames should be unique in this case. + */ + public static final String PARAM_USE_FILENAME_ID = "useFilenameId"; + @ConfigurationParameter(name = PARAM_USE_FILENAME_ID, mandatory = true, defaultValue = "false") + private boolean useFilenameId; + + /** + * Do not write ignoreable whitespace from the XML file to the CAS. + */ + // REC: This does not seem to work. Maybe because SAXWriter does not generate this event? + public static final String PARAM_OMIT_IGNORABLE_WHITESPACE = "omitIgnorableWhitespace"; + @ConfigurationParameter(name = PARAM_OMIT_IGNORABLE_WHITESPACE, mandatory = true, defaultValue = "false") + private boolean omitIgnorableWhitespace; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String mappingPosLocation; + + /** + * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the + * tag set defined as part of the model meta data. This can be useful if a custom model is + * specified which does not have such meta data, or it can be used in readers. + */ + public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; + @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) + protected String posTagset; + + /** + * Interpret utterances "u" as sentenes "s". (EXPERIMENTAL) + */ + public static final String PARAM_UTTERANCES_AS_SENTENCES = "utterancesAsSentences"; + @ConfigurationParameter(name = PARAM_UTTERANCES_AS_SENTENCES, mandatory = true, defaultValue = "false") + private boolean utterancesAsSentences; + + /** + * Trim the given elements (remote leading and trailing whitespace). DKPro Core usually expects + * annotations to start and end at a non-whitespace character. + */ + public static final String PARAM_ELEMENTS_TO_TRIM = "elementsToTrim"; + @ConfigurationParameter(name = PARAM_ELEMENTS_TO_TRIM, mandatory = true, defaultValue = { + TAG_SUNIT, TAG_U, TAG_PARAGRAPH, TAG_RS, TAG_WORD, TAG_CHARACTER, TAG_MULTIWORD}) + private Set elementsToTrim; + + private Iterator teiElementIterator; + private Element currentTeiElement; + private Resource currentResource; + private int currentTeiElementNumber; + + private MappingProvider posMappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + if (readPOS && !readToken) { + throw new ResourceInitializationException(new IllegalArgumentException( + "Setting readPOS to 'true' requires writeToken to be 'true' too.")); + } + + try { + // Init with an empty iterator + teiElementIterator = asList(new Element[0]).iterator(); + + // Make sure we know about the first element; + nextTeiElement(); + } + catch (CollectionException | IOException e) { + throw new ResourceInitializationException(e); + } + + posMappingProvider = createPosMappingProvider(this, mappingPosLocation, posTagset, + getLanguage()); + } + + private void nextTeiElement() throws CollectionException, IOException + { + if (teiElementIterator == null) { + currentTeiElement = null; + return; + } + + while (!teiElementIterator.hasNext() && super.hasNext()) { + currentResource = nextFile(); + + InputStream is = null; + try { + is = currentResource.getInputStream(); + + if (currentResource.getPath().endsWith(".gz")) { + is = new GZIPInputStream(is); + } + + InputSource source = new InputSource(is); + source.setPublicId(currentResource.getLocation()); + source.setSystemId(currentResource.getLocation()); + + SAXReader reader = new SAXReader(); + Document xml = reader.read(source); + + final XPath teiPath = new Dom4jXPath("//tei:TEI"); + teiPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0"); + + List teiElements = teiPath.selectNodes(xml); + +// System.out.printf("Found %d TEI elements in %s.%n", teiElements.size(), +// currentResource.getLocation()); + + teiElementIterator = teiElements.iterator(); + currentTeiElementNumber = 0; + } + catch (DocumentException e) { + throw new IOException(e); + } + catch (JaxenException e) { + throw new IOException(e); + } + finally { + closeQuietly(is); + } + } + + currentTeiElement = teiElementIterator.hasNext() ? teiElementIterator.next() : null; + currentTeiElementNumber++; + + if (!super.hasNext() && !teiElementIterator.hasNext()) { + // Mark end of processing. + teiElementIterator = null; + } + } + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + return teiElementIterator != null || currentTeiElement != null; + } + + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + initCas(aCAS, currentResource); + + // Set up language + if (getConfigParameterValue(PARAM_LANGUAGE) != null) { + aCAS.setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE)); + } + + // Configure mapping only now, because now the language is set in the CAS + try { + posMappingProvider.configure(aCAS); + } + catch (AnalysisEngineProcessException e1) { + throw new IOException(e1); + } + + InputStream is = null; + + try { + JCas jcas = aCAS.getJCas(); + + // Create handler + Handler handler = newSaxHandler(); + handler.setJCas(jcas); + handler.setLogger(getLogger()); + + // Parse TEI text + SAXWriter writer = new SAXWriter(handler); + writer.write(currentTeiElement); + handler.endDocument(); + } + catch (CASException e) { + throw new CollectionException(e); + } + catch (SAXException e) { + throw new IOException(e); + } + finally { + closeQuietly(is); + } + + // Move currentTeiElement to the next text + nextTeiElement(); + } + + protected Handler newSaxHandler() + { + return new TeiHandler(); + } + + protected abstract static class Handler + extends DefaultHandler + { + private JCas jcas; + private Logger logger; + + public void setJCas(final JCas aJCas) + { + jcas = aJCas; + } + + protected JCas getJCas() + { + return jcas; + } + + public void setLogger(Logger aLogger) + { + logger = aLogger; + } + + public Logger getLogger() + { + return logger; + } + } + + public class TeiHandler + extends Handler + { + private String documentId = null; + private boolean titleSet = false; + private boolean inTextElement = false; + private boolean captureText = false; + private int paragraphStart = -1; + private int sentenceStart = -1; + private int tokenStart = -1; + private String posTag = null; + private String lemma = null; + private Stack constituents = new Stack<>(); + private Stack namedEntities = new Stack<>(); + + private final StringBuilder buffer = new StringBuilder(); + + @Override + public void endDocument() + throws SAXException + { + getJCas().setDocumentText(buffer.toString()); + } + + protected StringBuilder getBuffer() + { + return buffer; + } + + @Override + public void startElement(String aUri, String aLocalName, String aName, + Attributes aAttributes) + throws SAXException + { +// System.out.printf("%b START %s %n", captureText, aLocalName); + if (!inTextElement && TAG_TEI_DOC.equals(aName)) { + if (useXmlId) { + documentId = aAttributes.getValue("xml:id"); + } + else if (useFilenameId) { + documentId = FilenameUtils.getName(currentResource.getPath()) + "#" + + currentTeiElementNumber; + } + else { + documentId = currentResource.getPath() + "#" + currentTeiElementNumber; + } + } + else if (!inTextElement && TAG_TITLE.equals(aName)) { + captureText = true; + } + else if (TAG_TEXT.equals(aName)) { + captureText = true; + inTextElement = true; + } + else if (inTextElement && (TAG_SUNIT.equals(aName) || + (utterancesAsSentences && TAG_U.equals(aName)))) { + sentenceStart = getBuffer().length(); + } + else if (inTextElement && TAG_PARAGRAPH.equals(aName)) { + paragraphStart = getBuffer().length(); + } + else if (readNamedEntity && inTextElement && TAG_RS.equals(aName)) { + NamedEntity ne = new NamedEntity(getJCas()); + ne.setBegin(getBuffer().length()); + ne.setValue(aAttributes.getValue(ATTR_TYPE)); + namedEntities.push(ne); + } + else if (readConstituent && inTextElement && TAG_PHRASE.equals(aName)) { + if (constituents.isEmpty()) { + ROOT root = new ROOT(getJCas()); + root.setConstituentType("ROOT"); + constituents.push(new ConstituentWrapper(root)); + } + + Constituent constituent = new Constituent(getJCas()); + constituent.setConstituentType(aAttributes.getValue(ATTR_TYPE)); + constituent.setSyntacticFunction(aAttributes.getValue(ATTR_FUNCTION)); + constituents.push(new ConstituentWrapper(constituent)); + } + else if (inTextElement + && (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName) || TAG_MULTIWORD + .equals(aName))) { + tokenStart = getBuffer().length(); + if (StringUtils.isNotEmpty(aAttributes.getValue(ATTR_POS))) { + posTag = aAttributes.getValue(ATTR_POS); + } + else { + posTag = aAttributes.getValue(ATTR_TYPE); + } + lemma = aAttributes.getValue(ATTR_LEMMA); + } + } + + @Override + public void endElement(String aUri, String aLocalName, String aName) + throws SAXException + { +// System.out.printf("%b END %s %n", captureText, aLocalName); + if (!inTextElement && TAG_TITLE.equals(aName)) { + DocumentMetaData meta = DocumentMetaData.get(getJCas()); + // Read only the first title and hope it is the main title + if (!titleSet) { + meta.setDocumentTitle(getBuffer().toString().trim()); + titleSet = true; + } + meta.setDocumentId(documentId); + getBuffer().setLength(0); + captureText = false; + } + else if (TAG_TEXT.equals(aName)) { + captureText = false; + inTextElement = false; + } + else if (inTextElement && (TAG_SUNIT.equals(aName) || + (utterancesAsSentences && TAG_U.equals(aName)))) { + if (readSentence) { + Sentence s = new Sentence(getJCas(), sentenceStart, getBuffer().length()); + if (elementsToTrim.contains(aName)) { + TrimUtils.trim(getBuffer(), s); + } + s.addToIndexes(); + } + sentenceStart = -1; + } + else if (inTextElement && TAG_PARAGRAPH.equals(aName)) { + if (readParagraph) { + Paragraph para = new Paragraph(getJCas(), paragraphStart, getBuffer().length()); + if (elementsToTrim.contains(aName)) { + TrimUtils.trim(getBuffer(), para); + } + para.addToIndexes(); + } + paragraphStart = -1; + } + else if (readNamedEntity && inTextElement && TAG_RS.equals(aName)) { + NamedEntity ne = namedEntities.pop(); + ne.setEnd(getBuffer().length()); + if (elementsToTrim.contains(aName)) { + TrimUtils.trim(getBuffer(), ne); + } + ne.addToIndexes(); + } + else if (readConstituent && inTextElement && TAG_PHRASE.equals(aName)) { + ConstituentWrapper wrapper = constituents.pop(); + wrapper.constituent.setBegin(wrapper.children.get(0).getBegin()); + wrapper.constituent + .setEnd(wrapper.children.get(wrapper.children.size() - 1).getEnd()); + if (!constituents.isEmpty()) { + ConstituentWrapper parent = constituents.peek(); + wrapper.constituent.setParent(parent.constituent); + parent.children.add(wrapper.constituent); + } + wrapper.constituent.setChildren(FSCollectionFactory.createFSArray(getJCas(), + wrapper.children)); + wrapper.constituent.addToIndexes(); + + // Close off the ROOT + if (constituents.peek().constituent instanceof ROOT) { + ConstituentWrapper rootWrapper = constituents.pop(); + rootWrapper.constituent.setBegin(wrapper.children.get(0).getBegin()); + rootWrapper.constituent + .setEnd(wrapper.children.get(wrapper.children.size() - 1).getEnd()); + rootWrapper.constituent.setChildren(FSCollectionFactory.createFSArray( + getJCas(), rootWrapper.children)); + rootWrapper.constituent.addToIndexes(); + } + } + else if (inTextElement + && (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName) || TAG_MULTIWORD + .equals(aName))) { + if (isNotBlank(getBuffer().substring(tokenStart, getBuffer().length()))) { + Token token = new Token(getJCas(), tokenStart, getBuffer().length()); + + if (elementsToTrim.contains(aName)) { + TrimUtils.trim(getBuffer(), token); + } + + if (posTag != null && readPOS) { + Type posTagType = posMappingProvider.getTagType(posTag); + POS pos = (POS) getJCas().getCas().createAnnotation(posTagType, + token.getBegin(), token.getEnd()); + pos.setPosValue(posTag); + POSUtils.assignCoarseValue(pos); + pos.addToIndexes(); + token.setPos(pos); + } + + if (lemma != null && readLemma) { + Lemma l = new Lemma(getJCas(), token.getBegin(), token.getEnd()); + l.setValue(lemma); + l.addToIndexes(); + token.setLemma(l); + } + + // FIXME: if readToken is disabled, the JCas wrapper should not be generated + // at all! + if (readToken) { + if (!constituents.isEmpty()) { + ConstituentWrapper parent = constituents.peek(); + token.setParent(parent.constituent); + parent.children.add(token); + } + + token.addToIndexes(); + } + } + + tokenStart = -1; + } + } + + @Override + public void characters(char[] aCh, int aStart, int aLength) + throws SAXException + { + if (captureText) { + buffer.append(aCh, aStart, aLength); + } + } + + @Override + public void ignorableWhitespace(char[] aCh, int aStart, int aLength) + throws SAXException + { + if (captureText && !omitIgnorableWhitespace) { + buffer.append(aCh, aStart, aLength); + } + } + } + + private static class ConstituentWrapper { + public Constituent constituent; + public List children = new ArrayList(); + + public ConstituentWrapper(Constituent aConstituent) + { + constituent = aConstituent; + } + } +} diff --git a/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiWriter.java b/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/TeiWriter.java similarity index 78% rename from dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiWriter.java rename to dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/TeiWriter.java index 0057a004ae..ee26fbfebd 100644 --- a/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiWriter.java +++ b/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/TeiWriter.java @@ -15,26 +15,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tei; +package org.dkpro.core.io.tei; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_FUNCTION; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_LEMMA; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_TYPE; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_BODY; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_FILE_DESC; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_HEADER; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TEI; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TEXT; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TITLE; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TITLE_STMT; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_CHARACTER; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_PARAGRAPH; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_PHRASE; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_RS; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_SUNIT; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_WORD; -import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TEI_NS; -import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.io.tei.internal.TeiConstants.ATTR_FUNCTION; +import static org.dkpro.core.io.tei.internal.TeiConstants.ATTR_LEMMA; +import static org.dkpro.core.io.tei.internal.TeiConstants.ATTR_TYPE; +import static org.dkpro.core.io.tei.internal.TeiConstants.E_TEI_BODY; +import static org.dkpro.core.io.tei.internal.TeiConstants.E_TEI_FILE_DESC; +import static org.dkpro.core.io.tei.internal.TeiConstants.E_TEI_HEADER; +import static org.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TEI; +import static org.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TEXT; +import static org.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TITLE; +import static org.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TITLE_STMT; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_CHARACTER; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_PARAGRAPH; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_PHRASE; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_RS; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_SUNIT; +import static org.dkpro.core.io.tei.internal.TeiConstants.TAG_WORD; +import static org.dkpro.core.io.tei.internal.TeiConstants.TEI_NS; import java.io.OutputStream; import java.util.ArrayList; @@ -44,8 +43,6 @@ import java.util.Stack; import java.util.regex.Pattern; -import javanet.staxutils.IndentingXMLEventWriter; - import javax.xml.namespace.QName; import javax.xml.stream.XMLEventFactory; import javax.xml.stream.XMLEventWriter; @@ -61,22 +58,25 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; +import eu.openminted.share.annotations.api.DocumentationResource; +import javanet.staxutils.IndentingXMLEventWriter; /** * UIMA CAS consumer writing the CAS document text in TEI format. */ -@ResourceMetaData(name="TEI XML Writer") +@ResourceMetaData(name = "TEI XML Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_TEI_XML}) @TypeCapability( inputs = { @@ -95,7 +95,8 @@ public class TeiWriter * Specify the suffix of output files. Default value .xml. If the suffix is not * needed, provide an empty string as value. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml") private String filenameSuffix; @@ -110,14 +111,16 @@ public class TeiWriter * Write constituent annotations to the CAS. Disabled by default because it requires type * priorities to be set up (Constituents must have a higher prio than Tokens). */ - public static final String PARAM_WRITE_CONSTITUENT = ComponentParameters.PARAM_WRITE_CONSTITUENT; + public static final String PARAM_WRITE_CONSTITUENT = + ComponentParameters.PARAM_WRITE_CONSTITUENT; @ConfigurationParameter(name = PARAM_WRITE_CONSTITUENT, mandatory = true, defaultValue = "false") private boolean writeConstituent; /** * Write named entity annotations to the CAS. Overlapping named entities are not supported. */ - public static final String PARAM_WRITE_NAMED_ENTITY = ComponentParameters.PARAM_WRITE_NAMED_ENTITY; + public static final String PARAM_WRITE_NAMED_ENTITY = + ComponentParameters.PARAM_WRITE_NAMED_ENTITY; @ConfigurationParameter(name = PARAM_WRITE_NAMED_ENTITY, mandatory = true, defaultValue = "true") private boolean writeNamedEntity; @@ -136,10 +139,8 @@ public void process(JCas aJCas) { String text = aJCas.getDocumentText(); - OutputStream docOS = null; XMLEventWriter xmlEventWriter = null; - try { - docOS = getOutputStream(aJCas, filenameSuffix); + try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance(); xmlOutputFactory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true); @@ -193,8 +194,9 @@ public void process(JCas aJCas) xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, nextAnnot.getBegin()))); // Next annotation - xmlEventWriter.add(xmlef.createStartElement(new QName(TEI_NS, teiElement.get()), - getAttributes(nextAnnot), null)); + xmlEventWriter + .add(xmlef.createStartElement(new QName(TEI_NS, teiElement.get()), + getAttributes(nextAnnot), null)); stack.push(cur); cur = nextAnnot; @@ -210,7 +212,8 @@ public void process(JCas aJCas) else { // Text between current and next annotation xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, cur.getEnd()))); - xmlEventWriter.add(xmlef.createEndElement(new QName(TEI_NS, teiElement.get()), null)); + xmlEventWriter + .add(xmlef.createEndElement(new QName(TEI_NS, teiElement.get()), null)); pos = cur.getEnd(); cur = stack.pop(); @@ -221,7 +224,8 @@ public void process(JCas aJCas) if (cur != null) { xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, cur.getEnd()))); pos = cur.getEnd(); - xmlEventWriter.add(xmlef.createEndElement(new QName(TEI_NS, getTeiTag(cur).get()), null)); + xmlEventWriter + .add(xmlef.createEndElement(new QName(TEI_NS, getTeiTag(cur).get()), null)); while (!stack.isEmpty()) { cur = stack.pop(); @@ -230,7 +234,8 @@ public void process(JCas aJCas) } xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, cur.getEnd()))); pos = cur.getEnd(); - xmlEventWriter.add(xmlef.createEndElement(new QName(TEI_NS, getTeiTag(cur).get()), null)); + xmlEventWriter.add( + xmlef.createEndElement(new QName(TEI_NS, getTeiTag(cur).get()), null)); } } @@ -255,8 +260,6 @@ public void process(JCas aJCas) getLogger().warn("Error closing the XML event writer", e); } } - - closeQuietly(docOS); } } @@ -264,22 +267,21 @@ private Iterator getAttributes(Annotation aAnnotation) { List attributes = new ArrayList(); if (aAnnotation instanceof Token) { Token t = (Token) aAnnotation; - if (t.getPos() != null) { + if (t.getPos() != null && t.getPos().getPosValue() != null) { attributes.add(xmlef.createAttribute(ATTR_TYPE, t.getPos().getPosValue())); } - if (t.getLemma() != null) { + if (t.getLemma() != null && t.getLemma().getValue() != null) { attributes.add(xmlef.createAttribute(ATTR_LEMMA, t.getLemma().getValue())); } } else if (aAnnotation instanceof NamedEntity) { NamedEntity ne = (NamedEntity) aAnnotation; - attributes.add(xmlef.createAttribute(ATTR_TYPE, ne.getValue())); + if (ne.getValue() != null) { + attributes.add(xmlef.createAttribute(ATTR_TYPE, ne.getValue())); + } } else if (aAnnotation instanceof Constituent) { Constituent c = (Constituent) aAnnotation; - if ("ROOT".equals(c.getConstituentType())) { - System.out.println(); - } if (c.getConstituentType() != null) { attributes.add(xmlef.createAttribute(ATTR_TYPE, c.getConstituentType())); } @@ -292,13 +294,6 @@ else if (aAnnotation instanceof Constituent) { private Optional getTeiTag(Annotation aAnnotation) { - if (aAnnotation instanceof Constituent) { - Constituent c = (Constituent) aAnnotation; - if ("ROOT".equals(c.getConstituentType())) { - System.out.println(); - } - } - if (aAnnotation.getTypeIndexID() == Token.type) { if (cTextPattern.matcher(aAnnotation.getCoveredText()).matches()) { return Optional.of(TAG_CHARACTER); diff --git a/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/internal/TeiConstants.java b/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/internal/TeiConstants.java similarity index 98% rename from dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/internal/TeiConstants.java rename to dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/internal/TeiConstants.java index a03ca0099e..18e4483065 100644 --- a/dkpro-core-io-tei-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tei/internal/TeiConstants.java +++ b/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/internal/TeiConstants.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tei.internal; +package org.dkpro.core.io.tei.internal; import javax.xml.namespace.QName; diff --git a/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/package-info.java b/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/package-info.java new file mode 100644 index 0000000000..e66afe591a --- /dev/null +++ b/dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for TEI XML. + */ +package org.dkpro.core.io.tei; diff --git a/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReaderTest.java b/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReaderTest.java deleted file mode 100644 index e41e83924d..0000000000 --- a/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReaderTest.java +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tei; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; - -import java.io.File; -import java.util.LinkedHashMap; -import java.util.Map; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.imscwb.ImsCwbWriter; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextWriter; -import de.tudarmstadt.ukp.dkpro.core.testing.EOLUtils; - -public class TeiReaderTest -{ - @Test - public void digibibTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - TeiReader.class, - TeiReader.PARAM_OMIT_IGNORABLE_WHITESPACE, true, - TeiReader.PARAM_LANGUAGE, "de", - TeiReader.PARAM_SOURCE_LOCATION, "classpath:/digibib", - TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }); - - AnalysisEngine writer = createEngine(TextWriter.class, - TextWriter.PARAM_USE_DOCUMENT_ID, true, - TextWriter.PARAM_OVERWRITE, true, - TextWriter.PARAM_TARGET_LOCATION, "target/digibibTest/"); - - Map actualSizes = new LinkedHashMap(); - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData meta = DocumentMetaData.get(jcas); - String text = jcas.getDocumentText(); - // System.out.printf("%s - %d%n", meta.getDocumentId(), text.length()); - actualSizes.put(meta.getDocumentId(), text.length()); - - writer.process(jcas); - } - - Map expectedSizes = new LinkedHashMap(); - expectedSizes.put("Literatur-Balde,-Jacob.xml#1", 152); - expectedSizes.put("Literatur-Balde,-Jacob.xml#2", 14378); - expectedSizes.put("Literatur-Balde,-Jacob.xml#3", 532); - expectedSizes.put("Literatur-Balde,-Jacob.xml#4", 1322); - expectedSizes.put("Literatur-Balde,-Jacob.xml#5", 26588); - expectedSizes.put("Literatur-Besser,-Johann-von.xml#1", 279); - expectedSizes.put("Literatur-Besser,-Johann-von.xml#2", 3846); - expectedSizes.put("Literatur-Besser,-Johann-von.xml#3", 22363); - expectedSizes.put("Literatur-Besser,-Johann-von.xml#4", 3576); - expectedSizes.put("Literatur-Besser,-Johann-von.xml#5", 3369); - expectedSizes.put("Literatur-Besser,-Johann-von.xml#6", 3903); - expectedSizes.put("Literatur-Besser,-Johann-von.xml#7", 2035); - expectedSizes.put("Literatur-Kobell,-Franz-von.xml#1", 164); - expectedSizes.put("Literatur-Kobell,-Franz-von.xml#2", 2078); - expectedSizes.put("Literatur-Kobell,-Franz-von.xml#3", 50730); - expectedSizes.put("Literatur-Marcel,-Gabriel.xml#1", 52696); - expectedSizes.put("Literatur-Meister,-Johann-Gottlieb.xml#1", 41418); - - assertEquals(expectedSizes, actualSizes); - } - - @Test - public void brownReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - TeiReader.class, - TeiReader.PARAM_LANGUAGE, "en", - TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", - TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }); - - String firstSentence = "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place . "; - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData meta = DocumentMetaData.get(jcas); - String text = jcas.getDocumentText(); - // System.out.printf("%s - %d%n", meta.getDocumentId(), text.length()); - - if (i == 0) { - assertEquals(2242, JCasUtil.select(jcas, Token.class).size()); - assertEquals(2242, JCasUtil.select(jcas, POS.class).size()); - assertEquals(98, JCasUtil.select(jcas, Sentence.class).size()); - - assertEquals(firstSentence, JCasUtil.select(jcas, Sentence.class).iterator().next().getCoveredText()); - } - i++; - } - - assertEquals(3, i); - } - - @Test - public void brownReaderTest2() - throws Exception - { - File referenceFile = new File("src/test/resources/brown_ims.txt"); - File outputFile = new File("target/test-output/brown_ims.txt"); - - CollectionReaderDescription reader = createReaderDescription( - TeiReader.class, - TeiReader.PARAM_LANGUAGE, "en", - TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", - TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }); - - AnalysisEngineDescription writer = createEngineDescription(ImsCwbWriter.class, - ImsCwbWriter.PARAM_TARGET_LOCATION, outputFile, - ImsCwbWriter.PARAM_WRITE_CPOS, true, - ImsCwbWriter.PARAM_SENTENCE_TAG, "sentence"); - - SimplePipeline.runPipeline(reader, writer); - - String reference = FileUtils.readFileToString(referenceFile, "UTF-8"); - String output = FileUtils.readFileToString(outputFile, "UTF-8"); - reference = EOLUtils.normalizeLineEndings(reference); - output = EOLUtils.normalizeLineEndings(output); - assertEquals(reference, output); - } - - @Test - public void brownReaderTest3() - throws Exception - { - File referenceFile = new File("src/test/resources/brown_ims.gz.txt"); - File outputFile = new File("target/test-output/brown_ims.gz.txt"); - - CollectionReaderDescription reader = createReaderDescription( - TeiReader.class, - TeiReader.PARAM_LANGUAGE, "en", - TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei_gzip/", - TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml.gz" }); - - AnalysisEngineDescription writer = createEngineDescription(ImsCwbWriter.class, - ImsCwbWriter.PARAM_TARGET_LOCATION, outputFile, - ImsCwbWriter.PARAM_WRITE_CPOS, true, - ImsCwbWriter.PARAM_SENTENCE_TAG, "sentence"); - - SimplePipeline.runPipeline(reader, writer); - - String reference = FileUtils.readFileToString(referenceFile, "UTF-8"); - String output = FileUtils.readFileToString(outputFile, "UTF-8"); - reference = EOLUtils.normalizeLineEndings(reference); - output = EOLUtils.normalizeLineEndings(output); - assertEquals(reference, output); - } - - - @Test - public void brownReaderTest_noSentences() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - TeiReader.class, - TeiReader.PARAM_LANGUAGE, "en", - TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", - TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }, - TeiReader.PARAM_READ_SENTENCE, false); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData meta = DocumentMetaData.get(jcas); - String text = jcas.getDocumentText(); - // System.out.printf("%s - %d%n", meta.getDocumentId(), text.length()); - - if (i == 0) { - assertEquals(2242, JCasUtil.select(jcas, Token.class).size()); - assertEquals(2242, JCasUtil.select(jcas, POS.class).size()); - assertEquals(0, JCasUtil.select(jcas, Sentence.class).size()); - } - i++; - } - - assertEquals(3, i); - } - - @Test - public void brownReaderTest_noToken_noPOS() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - TeiReader.class, - TeiReader.PARAM_LANGUAGE, "en", - TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", - TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }, - TeiReader.PARAM_READ_TOKEN, false, - TeiReader.PARAM_READ_POS, false - ); - - int i = 0; - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData meta = DocumentMetaData.get(jcas); - String text = jcas.getDocumentText(); - // System.out.printf("%s - %d%n", meta.getDocumentId(), text.length()); - - if (i == 0) { - assertEquals(0, JCasUtil.select(jcas, Token.class).size()); - assertEquals(0, JCasUtil.select(jcas, POS.class).size()); - assertEquals(98, JCasUtil.select(jcas, Sentence.class).size()); - } - i++; - } - - assertEquals(3, i); - } - - @Test(expected=IllegalStateException.class) - public void brownReaderTest_expectedException() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - TeiReader.class, - TeiReader.PARAM_LANGUAGE, "en", - TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", - TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }, - TeiReader.PARAM_READ_POS, true, - TeiReader.PARAM_READ_TOKEN, false); - - for (JCas jcas : new JCasIterable(reader)) { - // should never get here - // System.out.println(jcas.getDocumentText()); - } - } -} diff --git a/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReaderWriterTest.java b/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReaderWriterTest.java deleted file mode 100644 index 7ba3029d35..0000000000 --- a/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReaderWriterTest.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tei; - -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; - -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class TeiReaderWriterTest -{ - @Test - public void test() - throws Exception - { - testRoundTrip(TeiReader.class, TeiWriter.class, "reference/example1.xml"); - } - - @Test - public void test2() - throws Exception - { - testRoundTrip( - createReaderDescription(TeiReader.class), - createEngineDescription(TeiWriter.class, - TeiWriter.PARAM_WRITE_CONSTITUENT, true), - "reference/example2.xml"); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiWriterTest.java b/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiWriterTest.java deleted file mode 100644 index 7bc59f709b..0000000000 --- a/dkpro-core-io-tei-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tei/TeiWriterTest.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tei; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; -import static org.junit.Assert.assertTrue; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpNamedEntityRecognizer; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpParser; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.dumper.CasDumpWriter; - -public class TeiWriterTest -{ - @Test - public void test() - throws Exception - { - File targetFolder = testContext.getTestOutputFolder(); - - CollectionReaderDescription textReader = createReaderDescription( - TextReader.class, - TextReader.PARAM_LANGUAGE, "en", - TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts", - TextReader.PARAM_PATTERNS, "*.txt"); - - AnalysisEngineDescription segmenter = createEngineDescription(OpenNlpSegmenter.class); - - AnalysisEngineDescription posTagger = createEngineDescription(OpenNlpPosTagger.class); - - AnalysisEngineDescription parser = createEngineDescription(OpenNlpParser.class); - - AnalysisEngineDescription ner = createEngineDescription(OpenNlpNamedEntityRecognizer.class); - - AnalysisEngineDescription dump = createEngineDescription(CasDumpWriter.class); - - AnalysisEngineDescription teiWriter = createEngineDescription( - TeiWriter.class, - TeiWriter.PARAM_TARGET_LOCATION, targetFolder, - TeiWriter.PARAM_WRITE_CONSTITUENT, true); - - runPipeline(textReader, segmenter, posTagger, parser, ner, dump, teiWriter); - - File output = new File(targetFolder, "example1.txt.xml"); - assertTrue(output.exists()); - -// Diff myDiff = new Diff( -// new InputSource("src/test/resources/reference/example1.txt.xml"), -// new InputSource(output.getPath())); -// myDiff.overrideElementQualifier(new ElementNameAndAttributeQualifier()); -// XMLAssert.assertXMLEqual(myDiff, true); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiReaderTest.java b/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiReaderTest.java new file mode 100644 index 0000000000..d44412d50d --- /dev/null +++ b/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiReaderTest.java @@ -0,0 +1,224 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tei; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.assertj.core.api.Assertions.tuple; +import static org.assertj.core.util.Files.contentOf; +import static org.junit.Assert.assertEquals; + +import java.io.File; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.assertj.core.api.ListAssert; +import org.dkpro.core.io.imscwb.ImsCwbWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.ReaderAssert; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class TeiReaderTest +{ + @Test + public void digibibTest() + throws Exception + { + ListAssert casList = ReaderAssert.assertThat( + TeiReader.class, + TeiReader.PARAM_OMIT_IGNORABLE_WHITESPACE, true, + TeiReader.PARAM_LANGUAGE, "de", + TeiReader.PARAM_SOURCE_LOCATION, "classpath:/digibib", + TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }) + .asJCasList(); + + casList + .extracting( + jcas -> DocumentMetaData.get(jcas).getDocumentId(), + jcas -> jcas.getDocumentText().length()) + .containsExactly( + tuple("Literatur-Balde,-Jacob.xml#1", 152), + tuple("Literatur-Balde,-Jacob.xml#2", 14378), + tuple("Literatur-Balde,-Jacob.xml#3", 532), + tuple("Literatur-Balde,-Jacob.xml#4", 1322), + tuple("Literatur-Balde,-Jacob.xml#5", 26588), + tuple("Literatur-Besser,-Johann-von.xml#1", 279), + tuple("Literatur-Besser,-Johann-von.xml#2", 3846), + tuple("Literatur-Besser,-Johann-von.xml#3", 22363), + tuple("Literatur-Besser,-Johann-von.xml#4", 3576), + tuple("Literatur-Besser,-Johann-von.xml#5", 3369), + tuple("Literatur-Besser,-Johann-von.xml#6", 3903), + tuple("Literatur-Besser,-Johann-von.xml#7", 2035), + tuple("Literatur-Kobell,-Franz-von.xml#1", 164), + tuple("Literatur-Kobell,-Franz-von.xml#2", 2078), + tuple("Literatur-Kobell,-Franz-von.xml#3", 50730), + tuple("Literatur-Marcel,-Gabriel.xml#1", 52696), + tuple("Literatur-Meister,-Johann-Gottlieb.xml#1", 41418)); + } + + @Test + public void thatBrownCorpusIsReadCorrectly() + throws Exception + { + ReaderAssert.assertThat( + TeiReader.class, + TeiReader.PARAM_LANGUAGE, "en", + TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", + TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }) + .asJCasList() + .hasSize(3) + .element(0) + .extracting( + jcas -> jcas.select(Token.class).count(), + jcas -> jcas.select(POS.class).count(), + jcas -> jcas.select(Sentence.class).count(), + jcas -> jcas.select(Sentence.class).get(0).getCoveredText()) + .containsExactly( + 2242l, + 2242l, + 98l, + "The Fulton County Grand Jury said Friday an investigation of " + + "Atlanta's recent primary election produced `` no evidence '' that any " + + "irregularities took place ."); + } + + @Test + public void thatBrownCorpusTeiCanBeReadFromClasspath() + throws Exception + { + ReaderAssert.assertThat( + TeiReader.class, + TeiReader.PARAM_LANGUAGE, "en", + TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", + TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }) + .usingWriter( + ImsCwbWriter.class, + ImsCwbWriter.PARAM_WRITE_CPOS, true, + ImsCwbWriter.PARAM_SENTENCE_TAG, "sentence") + .writingToSingular("${TARGET}/brown.vrt") + .outputAsString() + .isEqualToNormalizingNewlines(contentOf( + new File("src/test/resources/brown_tei/brown-ref.vrt"), UTF_8)); + } + + @Test + public void thatBrownCorpusTeiCanBeReadFromGZippedFile() + throws Exception + { + ReaderAssert.assertThat( + TeiReader.class, + TeiReader.PARAM_LANGUAGE, "en", + TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei_gzip/", + TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml.gz" }) + .usingWriter( + ImsCwbWriter.class, + ImsCwbWriter.PARAM_WRITE_CPOS, true, + ImsCwbWriter.PARAM_SENTENCE_TAG, "sentence") + .writingToSingular("${TARGET}/brown.vrt") + .outputAsString() + .isEqualToNormalizingNewlines(contentOf( + new File("src/test/resources/brown_tei_gzip/brown-ref.vrt"), UTF_8)); + } + + @Test + public void brownReaderTest_noSentences() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + TeiReader.class, + TeiReader.PARAM_LANGUAGE, "en", + TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", + TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }, + TeiReader.PARAM_READ_SENTENCE, false); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + DocumentMetaData meta = DocumentMetaData.get(jcas); + String text = jcas.getDocumentText(); + // System.out.printf("%s - %d%n", meta.getDocumentId(), text.length()); + + if (i == 0) { + assertEquals(2242, JCasUtil.select(jcas, Token.class).size()); + assertEquals(2242, JCasUtil.select(jcas, POS.class).size()); + assertEquals(0, JCasUtil.select(jcas, Sentence.class).size()); + } + i++; + } + + assertEquals(3, i); + } + + @Test + public void brownReaderTest_noToken_noPOS() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + TeiReader.class, + TeiReader.PARAM_LANGUAGE, "en", + TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", + TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }, + TeiReader.PARAM_READ_TOKEN, false, + TeiReader.PARAM_READ_POS, false + ); + + int i = 0; + for (JCas jcas : new JCasIterable(reader)) { + DocumentMetaData meta = DocumentMetaData.get(jcas); + String text = jcas.getDocumentText(); + // System.out.printf("%s - %d%n", meta.getDocumentId(), text.length()); + + if (i == 0) { + assertEquals(0, JCasUtil.select(jcas, Token.class).size()); + assertEquals(0, JCasUtil.select(jcas, POS.class).size()); + assertEquals(98, JCasUtil.select(jcas, Sentence.class).size()); + } + i++; + } + + assertEquals(3, i); + } + + @Test(expected = IllegalStateException.class) + public void brownReaderTest_expectedException() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + TeiReader.class, + TeiReader.PARAM_LANGUAGE, "en", + TeiReader.PARAM_SOURCE_LOCATION, "classpath:/brown_tei/", + TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" }, + TeiReader.PARAM_READ_POS, true, + TeiReader.PARAM_READ_TOKEN, false); + + for (JCas jcas : new JCasIterable(reader)) { + // should never get here + // System.out.println(jcas.getDocumentText()); + } + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiReaderWriterTest.java b/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiReaderWriterTest.java new file mode 100644 index 0000000000..291e83e128 --- /dev/null +++ b/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiReaderWriterTest.java @@ -0,0 +1,65 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tei; + +import static java.util.Collections.emptyList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; + +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +public class TeiReaderWriterTest +{ + @Test + public void testWithoutTrim() + throws Exception + { + testRoundTrip( + createReaderDescription(TeiReader.class, + TeiReader.PARAM_ELEMENTS_TO_TRIM, emptyList()), + createEngineDescription(TeiWriter.class), + "reference/example1.xml"); + } + + @Test + public void testWithTrimming() + throws Exception + { + testOneWay(TeiReader.class, TeiWriter.class, + "reference/example1_out.xml", + "reference/example1.xml"); + } + + @Test + public void test2() + throws Exception + { + testRoundTrip( + createReaderDescription(TeiReader.class), + createEngineDescription(TeiWriter.class, + TeiWriter.PARAM_WRITE_CONSTITUENT, true), + "reference/example2.xml"); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiWriterTest.java b/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiWriterTest.java new file mode 100644 index 0000000000..d13f6eb8e4 --- /dev/null +++ b/dkpro-core-io-tei-asl/src/test/java/org/dkpro/core/io/tei/TeiWriterTest.java @@ -0,0 +1,83 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tei; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; +import static org.junit.Assert.assertTrue; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.dkpro.core.io.tei.TeiWriter; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.opennlp.OpenNlpNamedEntityRecognizer; +import org.dkpro.core.opennlp.OpenNlpParser; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.opennlp.OpenNlpSegmenter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.dumper.CasDumpWriter; +import org.junit.Rule; +import org.junit.Test; + +public class TeiWriterTest +{ + @Test + public void test() + throws Exception + { + File targetFolder = testContext.getTestOutputFolder(); + + CollectionReaderDescription textReader = createReaderDescription( + TextReader.class, + TextReader.PARAM_LANGUAGE, "en", + TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts", + TextReader.PARAM_PATTERNS, "*.txt"); + + AnalysisEngineDescription segmenter = createEngineDescription(OpenNlpSegmenter.class); + + AnalysisEngineDescription posTagger = createEngineDescription(OpenNlpPosTagger.class); + + AnalysisEngineDescription parser = createEngineDescription(OpenNlpParser.class); + + AnalysisEngineDescription ner = createEngineDescription(OpenNlpNamedEntityRecognizer.class); + + AnalysisEngineDescription dump = createEngineDescription(CasDumpWriter.class); + + AnalysisEngineDescription teiWriter = createEngineDescription( + TeiWriter.class, + TeiWriter.PARAM_TARGET_LOCATION, targetFolder, + TeiWriter.PARAM_WRITE_CONSTITUENT, true); + + runPipeline(textReader, segmenter, posTagger, parser, ner, dump, teiWriter); + + File output = new File(targetFolder, "example1.txt.xml"); + assertTrue(output.exists()); + +// Diff myDiff = new Diff( +// new InputSource("src/test/resources/reference/example1.txt.xml"), +// new InputSource(output.getPath())); +// myDiff.overrideElementQualifier(new ElementNameAndAttributeQualifier()); +// XMLAssert.assertXMLEqual(myDiff, true); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-tei-asl/src/test/resources/brown_ims.txt b/dkpro-core-io-tei-asl/src/test/resources/brown_tei/brown-ref.vrt similarity index 100% rename from dkpro-core-io-tei-asl/src/test/resources/brown_ims.txt rename to dkpro-core-io-tei-asl/src/test/resources/brown_tei/brown-ref.vrt diff --git a/dkpro-core-io-tei-asl/src/test/resources/brown_ims.gz.txt b/dkpro-core-io-tei-asl/src/test/resources/brown_tei_gzip/brown-ref.vrt similarity index 100% rename from dkpro-core-io-tei-asl/src/test/resources/brown_ims.gz.txt rename to dkpro-core-io-tei-asl/src/test/resources/brown_tei_gzip/brown-ref.vrt diff --git a/dkpro-core-io-tei-asl/src/test/resources/log4j.properties b/dkpro-core-io-tei-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-tei-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-tei-asl/src/test/resources/log4j2.xml b/dkpro-core-io-tei-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-tei-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-tei-asl/src/test/resources/reference/example1_out.xml b/dkpro-core-io-tei-asl/src/test/resources/reference/example1_out.xml new file mode 100644 index 0000000000..00f5f61082 --- /dev/null +++ b/dkpro-core-io-tei-asl/src/test/resources/reference/example1_out.xml @@ -0,0 +1,167 @@ +Sample A01 from The Atlanta Constitution +

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .

+ +

The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .

+ +

The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. .

+ +

`` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .

+ +

The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' .

+ +

It recommended that Fulton legislators act `` to have these laws studied and revised to the end of modernizing and improving them '' .

+ +

The grand jury commented on a number of other topics , among them the Atlanta and Fulton County purchasing departments which it said `` are well operated and follow generally accepted practices which inure to the best interest of both governments '' .

+ +

Merger proposed +However , the jury said it believes `` these two offices should be combined to achieve greater efficiency and reduce the cost of administration '' .

+ +

The City Purchasing Department , the jury said , `` is lacking in experienced clerical personnel as a result of city personnel policies '' . +It urged that the city `` take steps to remedy '' this problem .

+ +

Implementation of Georgia's automobile title law was also recommended by the outgoing jury .

+ +

It urged that the next Legislature `` provide enabling funds and re-set the effective date so that an orderly implementation of the law may be effected '' .

+ +

The grand jury took a swipe at the State Welfare Department's handling of federal funds granted for child welfare services in foster homes .

+ +

`` This is one of the major items in the Fulton County general assistance program '' , the jury said , but the State Welfare Department `` has seen fit to distribute these funds through the welfare departments of all the counties in the state with the exception of Fulton County , which receives none of this money .

+ +

The jurors said they realize `` a proportionate distribution of these funds might disable this program in our less populous counties '' .

+ +

Nevertheless , `` we feel that in the future Fulton County should receive some portion of these available funds '' , the jurors said . +`` Failure to do this will continue to place a disproportionate burden '' on Fulton taxpayers .

+ +

The jury also commented on the Fulton ordinary's court which has been under fire for its practices in the appointment of appraisers , guardians and administrators and the awarding of fees and compensation .

+ +

Wards protected +The jury said it found the court `` has incorporated into its operating procedures the recommendations '' of two previous grand juries , the Atlanta Bar Association and an interim citizens committee .

+ +

`` These actions should serve to protect in fact and in effect the court's wards from undue costs and its appointed and elected servants from unmeritorious criticisms '' , the jury said .

+ +

Regarding Atlanta's new multi-million-dollar airport , the jury recommended `` that when the new management takes charge Jan. 1 the airport be operated in a manner that will eliminate political influences '' .

+ +

The jury did not elaborate , but it added that `` there should be periodic surveillance of the pricing practices of the concessionaires for the purpose of keeping the prices reasonable '' .

+ +

Ask jail deputies +On other matters , the jury recommended that : ( 1 ) +Four additional deputies be employed at the Fulton County Jail and `` a doctor , medical intern or extern be employed for night and weekend duty at the jail '' . +( 2 ) +Fulton legislators `` work with city officials to pass enabling legislation that will permit the establishment of a fair and equitable '' pension plan for city employes .

+ +

The jury praised the administration and operation of the Atlanta Police Department , the Fulton Tax Commissioner's Office , the Bellwood and Alpharetta prison farms , Grady Hospital and the Fulton Health Department .

+ +

Mayor William B. Hartsfield filed suit for divorce from his wife , Pearl Williams Hartsfield , in Fulton Superior Court Friday . +His petition charged mental cruelty .

+ +

The couple was married Aug. 2 , 1913 . +They have a son , William Berry Jr. , and a daughter , Mrs. J. M. Cheshire of Griffin .

+ +

Attorneys for the mayor said that an amicable property settlement has been agreed upon .

+ +

The petition listed the mayor's occupation as `` attorney '' and his age as 71 . +It listed his wife's age as 74 and place of birth as Opelika , Ala. .

+ +

The petition said that the couple has not lived together as man and wife for more than a year .

+ +

The Hartsfield home is at 637 E. Pelham Rd. Aj .

+ +

Henry L. Bowden was listed on the petition as the mayor's attorney .

+ +

Hartsfield has been mayor of Atlanta , with exception of one brief interlude , since 1937 . +His political career goes back to his election to city council in 1923 .

+ +

The mayor's present term of office expires Jan. 1 . +He will be succeeded by Ivan Allen Jr. , who became a candidate in the Sept. 13 primary after Mayor Hartsfield announced that he would not run for reelection .

+ +

Georgia Republicans are getting strong encouragement to enter a candidate in the 1962 governor's race , a top official said Wednesday .

+ +

Robert Snodgrass , state GOP chairman , said a meeting held Tuesday night in Blue Ridge brought enthusiastic responses from the audience .

+ +

State Party Chairman James W. Dorsey added that enthusiasm was picking up for a state rally to be held Sept. 8 in Savannah at which newly elected Texas Sen. John Tower will be the featured speaker .

+ +

In the Blue Ridge meeting , the audience was warned that entering a candidate for governor would force it to take petitions out into voting precincts to obtain the signatures of registered voters .

+ +

Despite the warning , there was a unanimous vote to enter a candidate , according to Republicans who attended .

+ +

When the crowd was asked whether it wanted to wait one more term to make the race , it voted no -- and there were no dissents .

+ +

The largest hurdle the Republicans would have to face is a state law which says that before making a first race , one of two alternative courses must be taken : 1 +Five per cent of the voters in each county must sign petitions requesting that the Republicans be allowed to place names of candidates on the general election ballot , or 2 +The Republicans must hold a primary under the county unit system -- a system which the party opposes in its platform .

+ +

Sam Caldwell , State Highway Department public relations director , resigned Tuesday to work for Lt. Gov. Garland Byrd's campaign .

+ +

Caldwell's resignation had been expected for some time . +He will be succeeded by Rob Ledford of Gainesville , who has been an assistant more than three years . +When the gubernatorial campaign starts , Caldwell is expected to become a campaign coordinator for Byrd .

+ +

The Georgia Legislature will wind up its 1961 session Monday and head for home -- where some of the highway bond money it approved will follow shortly .

+ +

Before adjournment Monday afternoon , the Senate is expected to approve a study of the number of legislators allotted to rural and urban areas to determine what adjustments should be made .

+ +

Gov. Vandiver is expected to make the traditional visit to both chambers as they work toward adjournment . +Vandiver likely will mention the $100 million highway bond issue approved earlier in the session as his first priority item .

+ +

Construction bonds +Meanwhile , it was learned the State Highway Department is very near being ready to issue the first $30 million worth of highway reconstruction bonds .

+ +

The bond issue will go to the state courts for a friendly test suit to test the validity of the act , and then the sales will begin and contracts let for repair work on some of Georgia's most heavily traveled highways .

+ +

A Highway Department source said there also is a plan there to issue some $3 million to $4 million worth of Rural Roads Authority bonds for rural road construction work .

+ +

A revolving fund +The department apparently intends to make the Rural Roads Authority a revolving fund under which new bonds would be issued every time a portion of the old ones are paid off by tax authorities .

+ +

Vandiver opened his race for governor in 1958 with a battle in the Legislature against the issuance of $50 million worth of additional rural roads bonds proposed by then Gov. Marvin Griffin .

+ +

The Highway Department source told The Constitution , however , that Vandiver has not been consulted yet about the plans to issue the new rural roads bonds .

+ +

Schley County Rep. B. D. Pelham will offer a resolution Monday in the House to rescind the body's action of Friday in voting itself a $10 per day increase in expense allowances .

+ +

Pelham said Sunday night there was research being done on whether the `` quickie '' vote on the increase can be repealed outright or whether notice would have to first be given that reconsideration of the action would be sought .

+ +

While emphasizing that technical details were not fully worked out , Pelham said his resolution would seek to set aside the privilege resolution which the House voted through 87-31 .

+ +

A similar resolution passed in the Senate by a vote of 29-5 . +As of Sunday night , there was no word of a resolution being offered there to rescind the action .

+ +

Pelham pointed out that Georgia voters last November rejected a constitutional amendment to allow legislators to vote on pay raises for future Legislature sessions .

+ +

A veteran Jackson County legislator will ask the Georgia House Monday to back federal aid to education , something it has consistently opposed in the past .

+ +

Rep. Mac Barber of Commerce is asking the House in a privilege resolution to `` endorse increased federal support for public education , provided that such funds be received and expended '' as state funds .

+ +

Barber , who is in his 13th year as a legislator , said there `` are some members of our congressional delegation in Washington who would like to see it ( the resolution ) passed '' . +But he added that none of Georgia's congressmen specifically asked him to offer the resolution .

+ +

The resolution , which Barber tossed into the House hopper Friday , will be formally read Monday . +It says that `` in the event Congress does provide this increase in federal funds '' , the State Board of Education should be directed to `` give priority '' to teacher pay raises . +Colquitt +-- After a long , hot controversy , Miller County has a new school superintendent , elected , as a policeman put it , in the `` coolest election I ever saw in this county '' .

+ +

The new school superintendent is Harry Davis , a veteran agriculture teacher , who defeated Felix Bush , a school principal and chairman of the Miller County Democratic Executive Committee .

+ +

Davis received 1,119 votes in Saturday's election , and Bush got 402 . +Ordinary Carey Williams , armed with a pistol , stood by at the polls to insure order .

+ +

`` This was the coolest , calmest election I ever saw '' , Colquitt Policeman Tom Williams said . +`` Being at the polls was just like being at church . +I didn't smell a drop of liquor , and we didn't have a bit of trouble '' .

+ +

The campaign leading to the election was not so quiet , however . +It was marked by controversy , anonymous midnight phone calls and veiled threats of violence .

+ +

The former county school superintendent , George P. Callan , shot himself to death March 18 , four days after he resigned his post in a dispute with the county school board .

+ +

During the election campaign , both candidates , Davis and Bush , reportedly received anonymous telephone calls . +Ordinary Williams said he , too , was subjected to anonymous calls soon after he scheduled the election .

+ +

Many local citizens feared that there would be irregularities at the polls , and Williams got himself a permit to carry a gun and promised an orderly election .

+ +

Sheriff Felix Tabb said the ordinary apparently made good his promise .

+ +

`` Everything went real smooth '' , the sheriff said . +`` There wasn't a bit of trouble '' .

+ +

\ No newline at end of file diff --git a/dkpro-core-io-text-asl/pom.xml b/dkpro-core-io-text-asl/pom.xml index a4ff1ba7ca..9b4ee80782 100644 --- a/dkpro-core-io-text-asl/pom.xml +++ b/dkpro-core-io-text-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.text-asl + dkpro-core-io-text-asl jar DKPro Core ASL - IO - Text + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,28 +45,32 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl + org.dkpro.core + dkpro-core-api-featurepath-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -78,8 +83,8 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test diff --git a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/StringReader.java b/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/StringReader.java deleted file mode 100644 index 3adf8ea9b4..0000000000 --- a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/StringReader.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.text; - -import java.io.IOException; - -import org.apache.uima.UimaContext; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; - -/** - * Simple reader that generates a CAS from a String. This can be useful in situations where a reader - * is preferred over manually crafting a CAS using {@link JCasFactory#createJCas()}. - * - */ -@MimeTypeCapability({MimeTypes.TEXT_PLAIN}) -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) - -public class StringReader - extends JCasCollectionReader_ImplBase -{ - /** - * Set this as the language of the produced documents. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) - private String language; - - /** - * The document text. - */ - public static final String PARAM_DOCUMENT_TEXT = "documentText"; - @ConfigurationParameter(name = PARAM_DOCUMENT_TEXT, mandatory = true) - private String documentText; - - /** - * The collection ID to set in the {@link DocumentMetaData}. - */ - public static final String PARAM_COLLECTION_ID = "collectionId"; - @ConfigurationParameter(name = PARAM_COLLECTION_ID, mandatory = true, - defaultValue = "COLLECTION_ID") - private String collectionId; - - /** - * The document ID to set in the {@link DocumentMetaData}. - */ - public static final String PARAM_DOCUMENT_ID = "documentId"; - @ConfigurationParameter(name = PARAM_DOCUMENT_ID, mandatory = true, - defaultValue = "DOCUMENT_ID") - private String documentId; - - /** - * The document base URI to set in the {@link DocumentMetaData}. - */ - public static final String PARAM_DOCUMENT_BASE_URI = "documentBaseUri"; - @ConfigurationParameter(name = PARAM_DOCUMENT_BASE_URI, mandatory = false) - private String documentBaseUri; - - /** - * The document URI to set in the {@link DocumentMetaData}. - */ - public static final String PARAM_DOCUMENT_URI = "documentUri"; - @ConfigurationParameter(name = PARAM_DOCUMENT_URI, mandatory = true, defaultValue = "STRING") - private String documentUri; - - private boolean isDone = false; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - isDone = false; - } - - @Override - public void getNext(JCas sJCas) - throws IOException - { - isDone = true; - - DocumentMetaData meta = DocumentMetaData.create(sJCas); - meta.setCollectionId(collectionId); - meta.setDocumentUri(documentUri); - meta.setDocumentId(documentId); - meta.setDocumentBaseUri(documentBaseUri); - - sJCas.setDocumentLanguage(language); - sJCas.setDocumentText(documentText); - } - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - return !isDone; - } - - @Override - public Progress[] getProgress() - { - return new Progress[] { new ProgressImpl(isDone ? 0 : 1, 1, Progress.ENTITIES) }; - } -} diff --git a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextReader.java b/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextReader.java deleted file mode 100644 index d2726f02f4..0000000000 --- a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextReader.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.text; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; - -import org.apache.commons.io.IOUtils; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import com.ibm.icu.text.CharsetDetector; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; - -/** - * UIMA collection reader for plain text files. - */ -@ResourceMetaData(name="Text Reader") -@MimeTypeCapability(MimeTypes.TEXT_PLAIN) -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) -public class TextReader - extends ResourceCollectionReaderBase -{ - /** - * Automatically detect encoding. - * - * @see CharsetDetector - */ - public static final String ENCODING_AUTO = "auto"; - - /** - * Name of configuration parameter that contains the character encoding used by the input files. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String sourceEncoding; - - @Override - public void getNext(CAS aJCas) - throws IOException, CollectionException - { - Resource res = nextFile(); - initCas(aJCas, res); - - try (InputStream is = new BufferedInputStream( - CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) { - String text; - - if (ENCODING_AUTO.equals(sourceEncoding)) { - CharsetDetector detector = new CharsetDetector(); - text = IOUtils.toString(detector.getReader(is, null)); - } - else { - text = IOUtils.toString(is, sourceEncoding); - } - - aJCas.setDocumentText(text); - } - } -} diff --git a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/package-info.java b/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/package-info.java deleted file mode 100644 index 8d74cc04c8..0000000000 --- a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for plain text files. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.text; diff --git a/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/StringReader.java b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/StringReader.java new file mode 100644 index 0000000000..c53dde50b5 --- /dev/null +++ b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/StringReader.java @@ -0,0 +1,138 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.text; + +import java.io.IOException; + +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Simple reader that generates a CAS from a String. This can be useful in situations where a reader + * is preferred over manually crafting a CAS using {@link JCasFactory#createJCas()}. + */ +@Component(value = OperationType.READER) +@ResourceMetaData(name = "String Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.TEXT_PLAIN}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) +public class StringReader + extends JCasCollectionReader_ImplBase +{ + /** + * Set this as the language of the produced documents. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) + private String language; + + /** + * The document text. + */ + public static final String PARAM_DOCUMENT_TEXT = "documentText"; + @ConfigurationParameter(name = PARAM_DOCUMENT_TEXT, mandatory = true) + private String documentText; + + /** + * The collection ID to set in the {@link DocumentMetaData}. + */ + public static final String PARAM_COLLECTION_ID = "collectionId"; + @ConfigurationParameter(name = PARAM_COLLECTION_ID, mandatory = true, + defaultValue = "COLLECTION_ID") + private String collectionId; + + /** + * The document ID to set in the {@link DocumentMetaData}. + */ + public static final String PARAM_DOCUMENT_ID = "documentId"; + @ConfigurationParameter(name = PARAM_DOCUMENT_ID, mandatory = true, + defaultValue = "DOCUMENT_ID") + private String documentId; + + /** + * The document base URI to set in the {@link DocumentMetaData}. + */ + public static final String PARAM_DOCUMENT_BASE_URI = "documentBaseUri"; + @ConfigurationParameter(name = PARAM_DOCUMENT_BASE_URI, mandatory = false) + private String documentBaseUri; + + /** + * The document URI to set in the {@link DocumentMetaData}. + */ + public static final String PARAM_DOCUMENT_URI = "documentUri"; + @ConfigurationParameter(name = PARAM_DOCUMENT_URI, mandatory = true, defaultValue = "STRING") + private String documentUri; + + private boolean isDone = false; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + isDone = false; + } + + @Override + public void getNext(JCas sJCas) + throws IOException + { + isDone = true; + + DocumentMetaData meta = DocumentMetaData.create(sJCas); + meta.setCollectionId(collectionId); + meta.setDocumentUri(documentUri); + meta.setDocumentId(documentId); + meta.setDocumentBaseUri(documentBaseUri); + + sJCas.setDocumentLanguage(language); + sJCas.setDocumentText(documentText); + } + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + return !isDone; + } + + @Override + public Progress[] getProgress() + { + return new Progress[] { new ProgressImpl(isDone ? 0 : 1, 1, Progress.ENTITIES) }; + } +} diff --git a/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TextReader.java b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TextReader.java new file mode 100644 index 0000000000..37e9b782f4 --- /dev/null +++ b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TextReader.java @@ -0,0 +1,89 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.text; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; + +import com.ibm.icu.text.CharsetDetector; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * UIMA collection reader for plain text files. + */ +@ResourceMetaData(name = "Text Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability(MimeTypes.TEXT_PLAIN) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) +public class TextReader + extends ResourceCollectionReaderBase +{ + /** + * Automatically detect encoding. + * + * @see CharsetDetector + */ + public static final String ENCODING_AUTO = "auto"; + + /** + * Name of configuration parameter that contains the character encoding used by the input files. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + @Override + public void getNext(CAS aJCas) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + try (InputStream is = new BufferedInputStream( + CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) { + String text; + + if (ENCODING_AUTO.equals(sourceEncoding)) { + CharsetDetector detector = new CharsetDetector(); + text = IOUtils.toString(detector.getReader(is, null)); + } + else { + text = IOUtils.toString(is, sourceEncoding); + } + + aJCas.setDocumentText(text); + } + } +} diff --git a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextWriter.java b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TextWriter.java similarity index 79% rename from dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextWriter.java rename to dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TextWriter.java index c08e3630c6..22d4ee46d3 100644 --- a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextWriter.java +++ b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TextWriter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.text; +package org.dkpro.core.io.text; import java.io.OutputStream; @@ -26,18 +26,20 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import eu.openminted.share.annotations.api.DocumentationResource; /** * UIMA CAS consumer writing the CAS document text as plain text file. */ -@ResourceMetaData(name="Text Writer") +@ResourceMetaData(name = "Text Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.TEXT_PLAIN}) @TypeCapability( - inputs={ + inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) public class TextWriter extends JCasFileWriter_ImplBase @@ -46,7 +48,8 @@ public class TextWriter * Specify the suffix of output files. Default value .txt. If the suffix is not * needed, provide an empty string as value. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".txt") private String filenameSuffix; @@ -54,7 +57,8 @@ public class TextWriter * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = "targetEncoding"; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; @Override diff --git a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TokenizedTextWriter.java b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TokenizedTextWriter.java similarity index 78% rename from dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TokenizedTextWriter.java rename to dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TokenizedTextWriter.java index aad155bbfb..6252fb14c6 100644 --- a/dkpro-core-io-text-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/text/TokenizedTextWriter.java +++ b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/TokenizedTextWriter.java @@ -15,14 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.text; +package org.dkpro.core.io.text; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -32,19 +29,32 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; +import org.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import java.io.IOException; -import java.io.OutputStream; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; /** - * This class writes a set of pre-processed documents into a large text file containing one sentence - * per line and tokens split by whitespaces. Optionally, annotations other than tokens (e.g. lemmas) - * are written as specified by {@link #PARAM_FEATURE_PATH}. + * Write texts into into a large file containing one sentence per line and tokens separated by + * whitespace. Optionally, annotations other than tokens (e.g. lemmas) are written as specified by + * {@link #PARAM_FEATURE_PATH}. */ -@ResourceMetaData(name="Tokenized Text Writer") +@ResourceMetaData(name = "Tokenized Text Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + JCasFileWriter_ImplBase.PARAM_TARGET_LOCATION, + JCasFileWriter_ImplBase.PARAM_SINGULAR_TARGET, + JCasFileWriter_ImplBase.PARAM_OVERWRITE, + TokenizedTextWriter.PARAM_STOPWORDS_FILE }) @MimeTypeCapability({MimeTypes.TEXT_PLAIN}) @TypeCapability( - inputs={ + inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) public class TokenizedTextWriter extends JCasFileWriter_ImplBase @@ -63,8 +73,7 @@ public class TokenizedTextWriter /** * The feature path, e.g. - * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} for lemmas. Default: - * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token} (i.e. token texts). + * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} for lemmas. */ public static final String PARAM_FEATURE_PATH = "featurePath"; /** @@ -81,6 +90,9 @@ public class TokenizedTextWriter @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") private String featurePath; + /** + * Regular expression to match numbers. These are written to the output as {@code NUM}. + */ public static final String PARAM_NUMBER_REGEX = "numberRegex"; @ConfigurationParameter(name = PARAM_NUMBER_REGEX, mandatory = true, defaultValue = "") private String numberRegex; @@ -91,23 +103,25 @@ public class TokenizedTextWriter */ public static final String PARAM_STOPWORDS_FILE = "stopwordsFile"; @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = true, defaultValue = "") - private String stopwordsFile; + private File stopwordsFile; /** - * Set the output file extension. Default: {@code .txt}. + * Set the output file extension. */ public static final String PARAM_EXTENSION = "extension"; @ConfigurationParameter(name = PARAM_EXTENSION, mandatory = true, defaultValue = ".txt") private String extension = ".txt"; /** - * In the output file, each unit of the covering type is written into a separate line. The default - * (set in {@link #DEFAULT_COVERING_TYPE}), is sentences so that each sentence is written to a line. + * In the output file, each unit of the covering type is written into a separate line. The + * default (set in {@link #DEFAULT_COVERING_TYPE}), is sentences so that each sentence is + * written to a line. *

- * If no linebreaks within a document is desired, set this value to {@code null}. + * If no line breaks within a document are desired, set this value to {@code null}. */ public static final String PARAM_COVERING_TYPE = "coveringType"; - @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = true, defaultValue = DEFAULT_COVERING_TYPE) + @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = true, + defaultValue = DEFAULT_COVERING_TYPE) private String coveringType; private StringSequenceGenerator sequenceGenerator; diff --git a/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/package-info.java b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/package-info.java new file mode 100644 index 0000000000..a070901e75 --- /dev/null +++ b/dkpro-core-io-text-asl/src/main/java/org/dkpro/core/io/text/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for plain text files. + * + * @since 1.1.0 + */ +package org.dkpro.core.io.text; diff --git a/dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextReaderTest.java b/dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextReaderTest.java deleted file mode 100644 index fd563d12ff..0000000000 --- a/dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextReaderTest.java +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.text; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; -import static org.apache.uima.fit.util.CasUtil.select; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.util.Arrays; -import java.util.List; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.fit.util.CasUtil; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; - -public class TextReaderTest -{ - private static final String FILE1 = "test1.txt"; - private static final String FILE2 = "test2.txt"; - private static final List FILES = Arrays.asList(FILE1, FILE2); - - @Test - public void fileSystemReaderTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts", - ResourceCollectionReaderBase.PARAM_PATTERNS, "[+]*.txt"); - - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData md = DocumentMetaData.get(jcas); - dumpMetaData(md); - - assertEquals(1, CasUtil.select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()) - .size()); - assertTrue(FILES.contains(md.getDocumentId())); - - assertTrue( - !FILE1.equals(md.getDocumentId()) || ( - "This is a test.".equals(jcas.getDocumentText()) && - 15 == md.getEnd())); - - assertTrue( - !FILE2.equals(md.getDocumentId()) - || "This is a second test.".equals(jcas.getDocumentText())); - } - } - - @Test - public void fileSystemReaderAbsolutePathTest() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, new File("src/test/resources/texts").getAbsolutePath(), - ResourceCollectionReaderBase.PARAM_PATTERNS, new String[] { - ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.txt" }); - - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData md = DocumentMetaData.get(jcas); - dumpMetaData(md); - - assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); - - assertTrue(FILES.contains(md.getDocumentId())); - - assertTrue( - !FILE1.equals(md.getDocumentId()) || ( - "This is a test.".equals(jcas.getDocumentText()) && - 15 == md.getEnd())); - - assertTrue( - !FILE2.equals(md.getDocumentId()) - || "This is a second test.".equals(jcas.getDocumentText())); - } - } - - @Test - public void fileSystemReaderTest3() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - createTypeSystemDescription(), - ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/name with space", - ResourceCollectionReaderBase.PARAM_PATTERNS, new String[] { - ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.txt" }); - - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData md = DocumentMetaData.get(jcas); - dumpMetaData(md); - - assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); - - assertTrue(FILES.contains(md.getDocumentId())); - - assertTrue( - !FILE1.equals(md.getDocumentId()) || ( - "This is a test.".equals(jcas.getDocumentText()) && - 15 == md.getEnd())); - - assertTrue( - !FILE2.equals(md.getDocumentId()) - || "This is a second test.".equals(jcas.getDocumentText())); - } - } - - @Test - public void fileSystemReaderTest2() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - createTypeSystemDescription(), - ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts", - ResourceCollectionReaderBase.PARAM_PATTERNS, new String[0]); - - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData md = DocumentMetaData.get(jcas); - dumpMetaData(md); - - assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); - - assertTrue(FILES.contains(md.getDocumentId())); - - assertTrue( - !FILE1.equals(md.getDocumentId()) - || "This is a test.".equals(jcas.getDocumentText())); - - assertTrue( - !FILE2.equals(md.getDocumentId()) - || "This is a second test.".equals(jcas.getDocumentText())); - } - } - - @Test - public void fileSystemReaderTest4() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription(TextReader.class, - createTypeSystemDescription(), - ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "classpath:texts", - ResourceCollectionReaderBase.PARAM_PATTERNS, new String[0]); - - for (JCas jcas : new JCasIterable(reader)) { - DocumentMetaData md = DocumentMetaData.get(jcas); - dumpMetaData(md); - - assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); - - assertTrue(FILES.contains(md.getDocumentId())); - - assertTrue( - !FILE1.equals(md.getDocumentId()) - || "This is a test.".equals(jcas.getDocumentText())); - - assertTrue( - !FILE2.equals(md.getDocumentId()) - || "This is a second test.".equals(jcas.getDocumentText())); - } - } - - private void dumpMetaData(final DocumentMetaData aMetaData) - { - System.out.println("Collection ID: "+aMetaData.getCollectionId()); - System.out.println("ID : "+aMetaData.getDocumentId()); - System.out.println("Base URI : "+aMetaData.getDocumentBaseUri()); - System.out.println("URI : "+aMetaData.getDocumentUri()); - } - - @Rule public TestName name = new TestName(); -} diff --git a/dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TextReaderTest.java b/dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TextReaderTest.java new file mode 100644 index 0000000000..f43b27a53f --- /dev/null +++ b/dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TextReaderTest.java @@ -0,0 +1,194 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.text; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; +import static org.apache.uima.fit.util.CasUtil.select; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import java.util.List; + +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.io.text.TextReader; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class TextReaderTest +{ + private static final String FILE1 = "test1.txt"; + private static final String FILE2 = "test2.txt"; + private static final List FILES = Arrays.asList(FILE1, FILE2); + + @Test + public void fileSystemReaderTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts", + ResourceCollectionReaderBase.PARAM_PATTERNS, "[+]*.txt"); + + for (JCas jcas : new JCasIterable(reader)) { + DocumentMetaData md = DocumentMetaData.get(jcas); + dumpMetaData(md); + + assertEquals(1, CasUtil.select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()) + .size()); + assertTrue(FILES.contains(md.getDocumentId())); + + assertTrue( + !FILE1.equals(md.getDocumentId()) || ( + "This is a test.".equals(jcas.getDocumentText()) && + 15 == md.getEnd())); + + assertTrue( + !FILE2.equals(md.getDocumentId()) + || "This is a second test.".equals(jcas.getDocumentText())); + } + } + + @Test + public void fileSystemReaderAbsolutePathTest() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts", + ResourceCollectionReaderBase.PARAM_PATTERNS, new String[] { + ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.txt" }); + + for (JCas jcas : new JCasIterable(reader)) { + DocumentMetaData md = DocumentMetaData.get(jcas); + dumpMetaData(md); + + assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); + + assertTrue(FILES.contains(md.getDocumentId())); + + assertTrue( + !FILE1.equals(md.getDocumentId()) || ( + "This is a test.".equals(jcas.getDocumentText()) && + 15 == md.getEnd())); + + assertTrue( + !FILE2.equals(md.getDocumentId()) + || "This is a second test.".equals(jcas.getDocumentText())); + } + } + + @Test + public void fileSystemReaderTest3() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + createTypeSystemDescription(), + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/name with space", + ResourceCollectionReaderBase.PARAM_PATTERNS, new String[] { + ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.txt" }); + + for (JCas jcas : new JCasIterable(reader)) { + DocumentMetaData md = DocumentMetaData.get(jcas); + dumpMetaData(md); + + assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); + + assertTrue(FILES.contains(md.getDocumentId())); + + assertTrue( + !FILE1.equals(md.getDocumentId()) || ( + "This is a test.".equals(jcas.getDocumentText()) && + 15 == md.getEnd())); + + assertTrue( + !FILE2.equals(md.getDocumentId()) + || "This is a second test.".equals(jcas.getDocumentText())); + } + } + + @Test + public void fileSystemReaderTest2() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + createTypeSystemDescription(), + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts", + ResourceCollectionReaderBase.PARAM_PATTERNS, new String[0]); + + for (JCas jcas : new JCasIterable(reader)) { + DocumentMetaData md = DocumentMetaData.get(jcas); + dumpMetaData(md); + + assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); + + assertTrue(FILES.contains(md.getDocumentId())); + + assertTrue( + !FILE1.equals(md.getDocumentId()) + || "This is a test.".equals(jcas.getDocumentText())); + + assertTrue( + !FILE2.equals(md.getDocumentId()) + || "This is a second test.".equals(jcas.getDocumentText())); + } + } + + @Test + public void fileSystemReaderTest4() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription(TextReader.class, + createTypeSystemDescription(), + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "classpath:texts", + ResourceCollectionReaderBase.PARAM_PATTERNS, new String[0]); + + for (JCas jcas : new JCasIterable(reader)) { + DocumentMetaData md = DocumentMetaData.get(jcas); + dumpMetaData(md); + + assertEquals(1, select(jcas.getCas(), jcas.getDocumentAnnotationFs().getType()).size()); + + assertTrue(FILES.contains(md.getDocumentId())); + + assertTrue( + !FILE1.equals(md.getDocumentId()) + || "This is a test.".equals(jcas.getDocumentText())); + + assertTrue( + !FILE2.equals(md.getDocumentId()) + || "This is a second test.".equals(jcas.getDocumentText())); + } + } + + private void dumpMetaData(final DocumentMetaData aMetaData) + { + System.out.println("Collection ID: " + aMetaData.getCollectionId()); + System.out.println("ID : " + aMetaData.getDocumentId()); + System.out.println("Base URI : " + aMetaData.getDocumentBaseUri()); + System.out.println("URI : " + aMetaData.getDocumentUri()); + } + + @Rule public TestName name = new TestName(); +} diff --git a/dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextWriterTest.java b/dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TextWriterTest.java similarity index 94% rename from dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextWriterTest.java rename to dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TextWriterTest.java index 98dbcfd28d..706f3915f8 100644 --- a/dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TextWriterTest.java +++ b/dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TextWriterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.text; +package org.dkpro.core.io.text; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; @@ -33,13 +33,14 @@ import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.resources.CompressionMethod; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.io.text.TextWriter; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class TextWriterTest { @@ -132,7 +133,8 @@ public void testCompressed() runPipeline(jcas, writer); File input = new File(outputPath, "dummy.txt.gz"); - InputStream is = CompressionUtils.getInputStream(input.getPath(), new FileInputStream(input)); + InputStream is = CompressionUtils.getInputStream(input.getPath(), + new FileInputStream(input)); assertEquals(text, IOUtils.toString(is)); } diff --git a/dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TokenizedTextWriterTest.java b/dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TokenizedTextWriterTest.java similarity index 98% rename from dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TokenizedTextWriterTest.java rename to dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TokenizedTextWriterTest.java index 09de1244d1..66afdbbd62 100644 --- a/dkpro-core-io-text-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/text/TokenizedTextWriterTest.java +++ b/dkpro-core-io-text-asl/src/test/java/org/dkpro/core/io/text/TokenizedTextWriterTest.java @@ -15,31 +15,33 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.text; +package org.dkpro.core.io.text; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; import org.apache.commons.io.FileUtils; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.TokenizedTextWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.List; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; public class TokenizedTextWriterTest { diff --git a/dkpro-core-io-text-asl/src/test/resources/log4j.properties b/dkpro-core-io-text-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9f0bdd6149..0000000000 --- a/dkpro-core-io-text-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,12 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO - -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasWriter = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase = WARN \ No newline at end of file diff --git a/dkpro-core-io-text-asl/src/test/resources/log4j2.xml b/dkpro-core-io-text-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-text-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-tgrep-gpl/.license-header.txt b/dkpro-core-io-tgrep-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-io-tgrep-gpl/.license-header.txt +++ b/dkpro-core-io-tgrep-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-io-tgrep-gpl/pom.xml b/dkpro-core-io-tgrep-gpl/pom.xml index 68444f5cfb..e8d124b6e0 100644 --- a/dkpro-core-io-tgrep-gpl/pom.xml +++ b/dkpro-core-io-tgrep-gpl/pom.xml @@ -1,6 +1,6 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-gpl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-gpl + 2.3.0-SNAPSHOT ../dkpro-core-gpl - de.tudarmstadt.ukp.dkpro.core.io.tgrep-gpl + dkpro-core-io-tgrep-gpl jar DKPro Core GPL - IO - TGrep2 + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -47,20 +48,34 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + + xml-apis + xml-apis + runtime junit @@ -68,13 +83,13 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl + org.dkpro.core + dkpro-core-stanfordnlp-gpl test @@ -86,9 +101,9 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-stanfordnlp-gpl + 2.3.0-SNAPSHOT import pom @@ -102,6 +117,7 @@ maven-dependency-plugin + xml-apis:xml-apis de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-pcfg diff --git a/dkpro-core-io-tgrep-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/TGrepWriter.java b/dkpro-core-io-tgrep-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/TGrepWriter.java deleted file mode 100644 index c8f66d1b6a..0000000000 --- a/dkpro-core-io-tgrep-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/TGrepWriter.java +++ /dev/null @@ -1,317 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tgrep; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod; -import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; - -/** - * TGrep2 corpus file writer. Requires {@link PennTree}s to be annotated before. - */ -@MimeTypeCapability({MimeTypes.APPLICATION_X_TGREP2}) -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree"}) -public class TGrepWriter - extends JCasAnnotator_ImplBase -{ - /** - * Path to which the output is written. - */ - public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; - @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) - private File outputPath; - - /** - * Set this parameter to true if you want to add a comment to each PennTree which is written to - * the output files. The comment is of the form {@code documentId,beginOffset,endOffset}. - * - * Default: {@code true} - */ - public static final String PARAM_WRITE_COMMENTS = "writeComments"; - @ConfigurationParameter(name = PARAM_WRITE_COMMENTS, mandatory = true, defaultValue="true") - private boolean writeComments; - - /** - * Set this parameter to true if you want to encode directly into the tgrep2 binary format. - * - * Default: {@code true} - */ - public static final String PARAM_WRITE_T2C = "writeT2c"; - @ConfigurationParameter(name = PARAM_WRITE_T2C, mandatory = true, defaultValue = "true") - private boolean writeT2c; - - /** - * Method to compress the tgrep file (only used if PARAM_WRITE_T2C is true). Only NONE, GZIP and - * BZIP2 are supported. - * - * Default: {@link CompressionMethod#NONE} - * - * @see CompressionMethod - */ - public static final String PARAM_COMPRESSION = "compression"; - @ConfigurationParameter(name = PARAM_COMPRESSION, mandatory = true, defaultValue = "NONE") - private CompressionMethod compression; - - /** - * If true, silently drops malformed Penn Trees instead of throwing an exception. - * - * Default: {@code false} - */ - public static final String PARAM_DROP_MALFORMED_TREES = "dropMalformedTrees"; - @ConfigurationParameter(name = PARAM_DROP_MALFORMED_TREES, mandatory = true, defaultValue = "false") - private boolean dropMalformedTrees; - - private static final String EXT_CORPUS = ".txt"; - private static final String EXT_BINARY = ".t2c"; - - private Map writers; - private File tgrep2File; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - if (compression != CompressionMethod.NONE && compression != CompressionMethod.GZIP - && compression != CompressionMethod.BZIP2) { - throw new ResourceInitializationException(new IllegalArgumentException( - "Only gzip and bzip2 compression are supported by TGrep2, but [" + compression - + "] was specified.")); - } - - try { - FileUtils.forceMkdir(outputPath); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - - writers = new HashMap(); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - String filename; - String collectionId; - String documentId; - - try { - DocumentMetaData meta = DocumentMetaData.get(aJCas); - collectionId = meta.getCollectionId(); - documentId = meta.getDocumentId(); - } - catch (IllegalArgumentException e) { - getLogger().warn("No DocumentMetaData found."); - collectionId = "defaultCollectionId"; - documentId = "defaultDocumentId"; - } - - // if the collectionId contains inconvenient characters, remove them for the filename - // filename = collectionId; - filename = collectionId.replaceAll("\\W", ""); - - try { - PrintWriter pw = writers.get(filename); - if (pw == null) { - pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File( - outputPath, filename + EXT_CORPUS)), "UTF-8")); - writers.put(filename, pw); - } - - for (PennTree pt : JCasUtil.select(aJCas, PennTree.class)) { - String tree = StringUtils.normalizeSpace(pt.getPennTree()); - // detect and handle malformed trees - if (!isTermiteFree(tree)) { - if (dropMalformedTrees) { - getLogger().warn("Dropping malformed tree: [" + tree + "]."); - continue; - } - else { - throw new AnalysisEngineProcessException(new IllegalArgumentException( - "Found malformed tree: [" + tree + "].")); - } - } - // write comments and trees - if (writeComments) { - pw.printf("# %s,%d,%d\n", documentId, pt.getBegin(), pt.getEnd()); - } - pw.printf("%s\n", tree); - } - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - - /** - * Check if a given Penn tree will be rejected by TGrep2. - * - * @param aTree - * the Penn tree to check - * @return true if aTree is fit for use with Tgrep2, false otherwise - */ - private boolean isTermiteFree(String aTree) - { - int bracketCount = 0; - boolean justOpened = false; - - if (aTree.isEmpty() || aTree.charAt(0) != '(') { - return false; - } - - for (int idx = 0; idx < aTree.length(); idx++) { - char c = aTree.charAt(idx); - switch (c) { - case '(': - bracketCount++; - if (justOpened) { - // "((" is illegal, also with spaces in between - return false; - } - justOpened = true; - break; - case ' ': - break; - case ')': - bracketCount--; - if (justOpened) { - // "()" is illegal, also with spaces in between - return false; - } - if (bracketCount < 0) { - // more closing than opening brackets at any point are illegal - return false; - } - justOpened = false; - break; - default: - justOpened = false; - break; - } - } - // if not all brackets are closed, the next sentence is thought to be part of this one - // we consider these cases as illegal, as the files are usually built one sentence/line - return bracketCount == 0; - } - - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - for (PrintWriter pw : writers.values()) { - IOUtils.closeQuietly(pw); - } - - if (writeT2c) { - RuntimeProvider runtime = new RuntimeProvider( - "classpath:/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/"); - try { - tgrep2File = runtime.getFile("tgrep2"); - for (String filename : writers.keySet()) { - writeTgrepBinary(filename); - } - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - finally { - runtime.uninstall(); - } - } - } - - /** - * Produces a TGrep2 binary corpus file. - * - * @param aFilename - * the name of the file from which a corpus file shall be created, without extension - * @throws IOException - * if the employed tgrep2 process is interrupted or if it reports an error - */ - private void writeTgrepBinary(String aFilename) - throws IOException - { - List cmd = new ArrayList(); - cmd.add(tgrep2File.getAbsolutePath()); - if (writeComments) { - // enable writing comments - cmd.add("-C"); - } - // specify corpus - cmd.add("-p"); - cmd.add(new File(outputPath, aFilename + EXT_CORPUS).getAbsolutePath()); - cmd.add(new File(outputPath, aFilename + EXT_BINARY + compression.getExtension()) - .getAbsolutePath()); - - getLogger().info("Running tgrep2 command: [" + StringUtils.join(cmd, " ") + "]."); - - Process tgrepProcess = null; - try { - tgrepProcess = new ProcessBuilder(cmd).start(); - tgrepProcess.waitFor(); - } - catch (InterruptedException e) { - throw new IOException(); - } - finally { - if (tgrepProcess != null) { - InputStream stderr = tgrepProcess.getErrorStream(); - if (stderr.available() > 0) { - byte[] data = new byte[stderr.available()]; - stderr.read(data); - String error = new String(data, "UTF-8"); - getLogger().error(error); - throw new IOException(error); - } - } - } - } -} diff --git a/dkpro-core-io-tgrep-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/package-info.java b/dkpro-core-io-tgrep-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/package-info.java deleted file mode 100644 index ba30bd4fc7..0000000000 --- a/dkpro-core-io-tgrep-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -/** - * Integration of the TGrep2 search engine for parse - * trees. - * - * @since 1.5.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.tgrep; \ No newline at end of file diff --git a/dkpro-core-io-tgrep-gpl/src/main/java/org/dkpro/core/io/tgrep/TGrepWriter.java b/dkpro-core-io-tgrep-gpl/src/main/java/org/dkpro/core/io/tgrep/TGrepWriter.java new file mode 100644 index 0000000000..1b104029d9 --- /dev/null +++ b/dkpro-core-io-tgrep-gpl/src/main/java/org/dkpro/core/io/tgrep/TGrepWriter.java @@ -0,0 +1,315 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.io.tgrep; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionMethod; +import org.dkpro.core.api.resources.RuntimeProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; +import eu.openminted.share.annotations.api.Parameters; + +/** + * TGrep2 corpus file writer. Requires {@link PennTree}s to be annotated before. + */ +@ResourceMetaData(name = "TGrep2 Writer") +@MimeTypeCapability({MimeTypes.APPLICATION_X_TGREP2}) +@Parameters( + exclude = { + TGrepWriter.PARAM_TARGET_LOCATION }) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree"}) +public class TGrepWriter + extends JCasAnnotator_ImplBase +{ + /** + * Path to which the output is written. + */ + public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; + @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) + private File outputPath; + + /** + * Set this parameter to true if you want to add a comment to each PennTree which is written to + * the output files. The comment is of the form {@code documentId,beginOffset,endOffset}. + */ + public static final String PARAM_WRITE_COMMENTS = "writeComments"; + @ConfigurationParameter(name = PARAM_WRITE_COMMENTS, mandatory = true, defaultValue = "true") + private boolean writeComments; + + /** + * Set this parameter to true if you want to encode directly into the tgrep2 binary format. + */ + public static final String PARAM_WRITE_T2C = "writeT2c"; + @ConfigurationParameter(name = PARAM_WRITE_T2C, mandatory = true, defaultValue = "true") + private boolean writeT2c; + + /** + * Method to compress the tgrep file (only used if PARAM_WRITE_T2C is true). Only NONE, GZIP and + * BZIP2 are supported. + * + * @see CompressionMethod + */ + public static final String PARAM_COMPRESSION = "compression"; + @ConfigurationParameter(name = PARAM_COMPRESSION, mandatory = true, defaultValue = "NONE") + private CompressionMethod compression; + + /** + * If true, silently drops malformed Penn Trees instead of throwing an exception. + */ + public static final String PARAM_DROP_MALFORMED_TREES = "dropMalformedTrees"; + @ConfigurationParameter(name = PARAM_DROP_MALFORMED_TREES, mandatory = true, defaultValue = "false") + private boolean dropMalformedTrees; + + private static final String EXT_CORPUS = ".txt"; + private static final String EXT_BINARY = ".t2c"; + + private Map writers; + private File tgrep2File; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + if (compression != CompressionMethod.NONE && compression != CompressionMethod.GZIP + && compression != CompressionMethod.BZIP2) { + throw new ResourceInitializationException(new IllegalArgumentException( + "Only gzip and bzip2 compression are supported by TGrep2, but [" + compression + + "] was specified.")); + } + + try { + FileUtils.forceMkdir(outputPath); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + + writers = new HashMap(); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + String filename; + String collectionId; + String documentId; + + try { + DocumentMetaData meta = DocumentMetaData.get(aJCas); + collectionId = meta.getCollectionId(); + documentId = meta.getDocumentId(); + } + catch (IllegalArgumentException e) { + getLogger().warn("No DocumentMetaData found."); + collectionId = "defaultCollectionId"; + documentId = "defaultDocumentId"; + } + + // if the collectionId contains inconvenient characters, remove them for the filename + // filename = collectionId; + filename = collectionId.replaceAll("\\W", ""); + + try { + PrintWriter pw = writers.get(filename); + if (pw == null) { + pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File( + outputPath, filename + EXT_CORPUS)), "UTF-8")); + writers.put(filename, pw); + } + + for (PennTree pt : JCasUtil.select(aJCas, PennTree.class)) { + String tree = StringUtils.normalizeSpace(pt.getPennTree()); + // detect and handle malformed trees + if (!isTermiteFree(tree)) { + if (dropMalformedTrees) { + getLogger().warn("Dropping malformed tree: [" + tree + "]."); + continue; + } + else { + throw new AnalysisEngineProcessException(new IllegalArgumentException( + "Found malformed tree: [" + tree + "].")); + } + } + // write comments and trees + if (writeComments) { + pw.printf("# %s,%d,%d\n", documentId, pt.getBegin(), pt.getEnd()); + } + pw.printf("%s\n", tree); + } + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + /** + * Check if a given Penn tree will be rejected by TGrep2. + * + * @param aTree + * the Penn tree to check + * @return true if aTree is fit for use with Tgrep2, false otherwise + */ + private boolean isTermiteFree(String aTree) + { + int bracketCount = 0; + boolean justOpened = false; + + if (aTree.isEmpty() || aTree.charAt(0) != '(') { + return false; + } + + for (int idx = 0; idx < aTree.length(); idx++) { + char c = aTree.charAt(idx); + switch (c) { + case '(': + bracketCount++; + if (justOpened) { + // "((" is illegal, also with spaces in between + return false; + } + justOpened = true; + break; + case ' ': + break; + case ')': + bracketCount--; + if (justOpened) { + // "()" is illegal, also with spaces in between + return false; + } + if (bracketCount < 0) { + // more closing than opening brackets at any point are illegal + return false; + } + justOpened = false; + break; + default: + justOpened = false; + break; + } + } + // if not all brackets are closed, the next sentence is thought to be part of this one + // we consider these cases as illegal, as the files are usually built one sentence/line + return bracketCount == 0; + } + + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + for (PrintWriter pw : writers.values()) { + IOUtils.closeQuietly(pw); + } + + if (writeT2c) { + RuntimeProvider runtime = new RuntimeProvider( + "classpath:/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/"); + try { + tgrep2File = runtime.getFile("tgrep2"); + for (String filename : writers.keySet()) { + writeTgrepBinary(filename); + } + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + finally { + runtime.uninstall(); + } + } + } + + /** + * Produces a TGrep2 binary corpus file. + * + * @param aFilename + * the name of the file from which a corpus file shall be created, without extension + * @throws IOException + * if the employed tgrep2 process is interrupted or if it reports an error + */ + private void writeTgrepBinary(String aFilename) + throws IOException + { + List cmd = new ArrayList(); + cmd.add(tgrep2File.getAbsolutePath()); + if (writeComments) { + // enable writing comments + cmd.add("-C"); + } + // specify corpus + cmd.add("-p"); + cmd.add(new File(outputPath, aFilename + EXT_CORPUS).getAbsolutePath()); + cmd.add(new File(outputPath, aFilename + EXT_BINARY + compression.getExtension()) + .getAbsolutePath()); + + getLogger().info("Running tgrep2 command: [" + StringUtils.join(cmd, " ") + "]."); + + Process tgrepProcess = null; + try { + tgrepProcess = new ProcessBuilder(cmd).start(); + tgrepProcess.waitFor(); + } + catch (InterruptedException e) { + throw new IOException(); + } + finally { + if (tgrepProcess != null) { + InputStream stderr = tgrepProcess.getErrorStream(); + if (stderr.available() > 0) { + byte[] data = new byte[stderr.available()]; + stderr.read(data); + String error = new String(data, "UTF-8"); + getLogger().error(error); + throw new IOException(error); + } + } + } + } +} diff --git a/dkpro-core-io-tgrep-gpl/src/main/java/org/dkpro/core/io/tgrep/package-info.java b/dkpro-core-io-tgrep-gpl/src/main/java/org/dkpro/core/io/tgrep/package-info.java new file mode 100644 index 0000000000..dd6f20763e --- /dev/null +++ b/dkpro-core-io-tgrep-gpl/src/main/java/org/dkpro/core/io/tgrep/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +/** + * Integration of the TGrep2 search engine for parse + * trees. + * + * @since 1.5.0 + */ +package org.dkpro.core.io.tgrep; diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/LICENSE.txt b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/LICENSE.txt similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/LICENSE.txt rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/LICENSE.txt diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/NOTICE.txt b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/NOTICE.txt similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/NOTICE.txt rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/NOTICE.txt diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_32/manifest.properties b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_32/manifest.properties similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_32/manifest.properties rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_32/manifest.properties diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_32/tgrep2 b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_32/tgrep2 similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_32/tgrep2 rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_32/tgrep2 diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_64/README b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_64/README similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_64/README rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_64/README diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_64/manifest.properties b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_64/manifest.properties similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_64/manifest.properties rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_64/manifest.properties diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_64/tgrep2 b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_64/tgrep2 similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/linux-x86_64/tgrep2 rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/linux-x86_64/tgrep2 diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/osx-x86_32/manifest.properties b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/osx-x86_32/manifest.properties similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/osx-x86_32/manifest.properties rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/osx-x86_32/manifest.properties diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/osx-x86_32/tgrep2 b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/osx-x86_32/tgrep2 similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/osx-x86_32/tgrep2 rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/osx-x86_32/tgrep2 diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/osx-x86_64/manifest.properties b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/osx-x86_64/manifest.properties similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/osx-x86_64/manifest.properties rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/osx-x86_64/manifest.properties diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/osx-x86_64/tgrep2 b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/osx-x86_64/tgrep2 similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/osx-x86_64/tgrep2 rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/osx-x86_64/tgrep2 diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/windows-x86_32/manifest.properties b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/windows-x86_32/manifest.properties similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/windows-x86_32/manifest.properties rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/windows-x86_32/manifest.properties diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/windows-x86_32/tgrep2 b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/windows-x86_32/tgrep2 similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/windows-x86_32/tgrep2 rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/windows-x86_32/tgrep2 diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/windows-x86_64/manifest.properties b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/windows-x86_64/manifest.properties similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/windows-x86_64/manifest.properties rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/windows-x86_64/manifest.properties diff --git a/dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/windows-x86_64/tgrep2 b/dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/windows-x86_64/tgrep2 similarity index 100% rename from dkpro-core-io-tgrep-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/windows-x86_64/tgrep2 rename to dkpro-core-io-tgrep-gpl/src/main/resources/org/dkpro/core/io/tgrep/bin/windows-x86_64/tgrep2 diff --git a/dkpro-core-io-tgrep-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/TGrepWriterTest.java b/dkpro-core-io-tgrep-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/TGrepWriterTest.java deleted file mode 100644 index 6c0df99756..0000000000 --- a/dkpro-core-io-tgrep-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tgrep/TGrepWriterTest.java +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tgrep; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.jcas.JCas; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class TGrepWriterTest -{ - @Test - public void testTxt() - throws Exception - { - File outputPath = testContext.getTestOutputFolder(); - - String language = "en"; - String text = "This is a sample sentence. Followed by another one."; - AnalysisEngineDescription seg = createEngineDescription(StanfordSegmenter.class); - - AnalysisEngineDescription parse = createEngineDescription(StanfordParser.class, - StanfordParser.PARAM_WRITE_PENN_TREE, true, - StanfordParser.PARAM_LANGUAGE, "en", - StanfordParser.PARAM_VARIANT, "pcfg"); - - AnalysisEngineDescription tgrep = createEngineDescription(TGrepWriter.class, - TGrepWriter.PARAM_TARGET_LOCATION, outputPath, - TGrepWriter.PARAM_COMPRESSION, CompressionMethod.GZIP, - TGrepWriter.PARAM_DROP_MALFORMED_TREES, true, - TGrepWriter.PARAM_WRITE_COMMENTS, true, - TGrepWriter.PARAM_WRITE_T2C, false); - - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage(language); - jcas.setDocumentText(text); - DocumentMetaData meta = DocumentMetaData.create(jcas); - meta.setCollectionId("testCollection"); - meta.setDocumentId("testDocument"); - - SimplePipeline.runPipeline(jcas, seg, parse, tgrep); - - List expected = new ArrayList(); - expected.add("# testDocument,0,26"); - expected.add("(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sample) (NN sentence))) (. .)))"); - expected.add("# testDocument,27,51"); - expected.add("(ROOT (S (VP (VBN Followed) (PP (IN by) (NP (DT another) (NN one)))) (. .)))"); - List actual = FileUtils.readLines(new File(outputPath, "testCollection.txt"), "UTF-8"); - - Assert.assertEquals(expected.size(), actual.size()); - - for (int i = 0; i < actual.size(); i++) { - Assert.assertEquals(expected.get(i), actual.get(i)); - } - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} \ No newline at end of file diff --git a/dkpro-core-io-tgrep-gpl/src/test/java/org/dkpro/core/io/tgrep/TGrepWriterTest.java b/dkpro-core-io-tgrep-gpl/src/test/java/org/dkpro/core/io/tgrep/TGrepWriterTest.java new file mode 100644 index 0000000000..5a1073678f --- /dev/null +++ b/dkpro-core-io-tgrep-gpl/src/test/java/org/dkpro/core/io/tgrep/TGrepWriterTest.java @@ -0,0 +1,95 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.io.tgrep; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.resources.CompressionMethod; +import org.dkpro.core.io.tgrep.TGrepWriter; +import org.dkpro.core.stanfordnlp.StanfordParser; +import org.dkpro.core.stanfordnlp.StanfordSegmenter; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +/** + * + */ +public class TGrepWriterTest +{ + @Test + public void testTxt() + throws Exception + { + File outputPath = testContext.getTestOutputFolder(); + + String language = "en"; + String text = "This is a sample sentence. Followed by another one."; + AnalysisEngineDescription seg = createEngineDescription(StanfordSegmenter.class); + + AnalysisEngineDescription parse = createEngineDescription(StanfordParser.class, + StanfordParser.PARAM_WRITE_PENN_TREE, true, + StanfordParser.PARAM_LANGUAGE, "en", + StanfordParser.PARAM_VARIANT, "pcfg"); + + AnalysisEngineDescription tgrep = createEngineDescription(TGrepWriter.class, + TGrepWriter.PARAM_TARGET_LOCATION, outputPath, + TGrepWriter.PARAM_COMPRESSION, CompressionMethod.GZIP, + TGrepWriter.PARAM_DROP_MALFORMED_TREES, true, + TGrepWriter.PARAM_WRITE_COMMENTS, true, + TGrepWriter.PARAM_WRITE_T2C, false); + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage(language); + jcas.setDocumentText(text); + DocumentMetaData meta = DocumentMetaData.create(jcas); + meta.setCollectionId("testCollection"); + meta.setDocumentId("testDocument"); + + SimplePipeline.runPipeline(jcas, seg, parse, tgrep); + + List expected = new ArrayList(); + expected.add("# testDocument,0,26"); + expected.add("(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sample) (NN sentence))) (. .)))"); + expected.add("# testDocument,27,51"); + expected.add("(ROOT (S (VP (VBN Followed) (PP (IN by) (NP (DT another) (NN one)))) (. .)))"); + List actual = FileUtils.readLines(new File(outputPath, "testCollection.txt"), "UTF-8"); + + Assert.assertEquals(expected.size(), actual.size()); + + for (int i = 0; i < actual.size(); i++) { + Assert.assertEquals(expected.get(i), actual.get(i)); + } + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-tiger-asl/pom.xml b/dkpro-core-io-tiger-asl/pom.xml index 89aee92bbc..d73738167f 100644 --- a/dkpro-core-io-tiger-asl/pom.xml +++ b/dkpro-core-io-tiger-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.tiger-asl + dkpro-core-io-tiger-asl jar DKPro Core ASL - IO - TIGER-XML + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -52,45 +53,65 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.semantics-asl + org.dkpro.core + dkpro-core-api-semantics-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.penntree-asl + org.dkpro.core + dkpro-core-io-penntree-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + eu.openminted.share.annotations + omtd-share-annotations-api + + + javax.xml.bind + jaxb-api + + + com.sun.xml.bind + jaxb-core + + + com.sun.xml.bind + jaxb-impl + + + javax.activation + javax.activation-api + junit junit test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test @@ -99,18 +120,18 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl + org.dkpro.core + dkpro-core-opennlp-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.conll-asl + org.dkpro.core + dkpro-core-io-conll-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl test @@ -122,28 +143,33 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT import pom - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-parser-en-chunking - - - - - + + + org.apache.maven.plugins + maven-dependency-plugin + + true + + + de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-parser-en-chunking + + com.sun.xml.bind:jaxb-core + com.sun.xml.bind:jaxb-impl + javax.activation:javax.activation-api + + + + \ No newline at end of file diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/package-info.java b/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/package-info.java deleted file mode 100644 index 7bf6db0eac..0000000000 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for the TIGER-XML format. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger; \ No newline at end of file diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReader.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/TigerXmlReader.java similarity index 88% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReader.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/TigerXmlReader.java index ef88cbd0aa..31b23bcc21 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReader.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/TigerXmlReader.java @@ -15,11 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger; +package org.dkpro.core.io.tiger; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.IOException; import java.io.InputStream; @@ -55,15 +56,31 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.penntree.PennTreeNode; +import org.dkpro.core.io.penntree.PennTreeUtils; +import org.dkpro.core.io.tiger.internal.IllegalAnnotationStructureException; +import org.dkpro.core.io.tiger.internal.model.AnnotationDecl; +import org.dkpro.core.io.tiger.internal.model.Meta; +import org.dkpro.core.io.tiger.internal.model.TigerEdge; +import org.dkpro.core.io.tiger.internal.model.TigerFeNode; +import org.dkpro.core.io.tiger.internal.model.TigerFrame; +import org.dkpro.core.io.tiger.internal.model.TigerFrameElement; +import org.dkpro.core.io.tiger.internal.model.TigerGraph; +import org.dkpro.core.io.tiger.internal.model.TigerNode; +import org.dkpro.core.io.tiger.internal.model.TigerNonTerminal; +import org.dkpro.core.io.tiger.internal.model.TigerPart; +import org.dkpro.core.io.tiger.internal.model.TigerSem; +import org.dkpro.core.io.tiger.internal.model.TigerSentence; +import org.dkpro.core.io.tiger.internal.model.TigerSplitword; +import org.dkpro.core.io.tiger.internal.model.TigerTerminal; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; @@ -73,29 +90,14 @@ import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeNode; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.IllegalAnnotationStructureException; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.AnnotationDecl; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.Meta; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerEdge; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerFeNode; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerFrame; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerFrameElement; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerGraph; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerNode; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerNonTerminal; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerPart; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerSem; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerSentence; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerSplitword; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerTerminal; +import eu.openminted.share.annotations.api.DocumentationResource; /** * UIMA collection reader for TIGER-XML files. Also supports the augmented format used in the * Semeval 2010 task which includes semantic role data. */ -@ResourceMetaData(name="TIGER-XML Reader") +@ResourceMetaData(name = "TIGER-XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_TIGER_XML, MimeTypes.APPLICATION_X_SEMEVAL_2010_XML}) @TypeCapability( outputs = { @@ -110,10 +112,19 @@ public class TigerXmlReader extends JCasResourceCollectionReader_ImplBase { + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String mappingPosLocation; @@ -130,8 +141,6 @@ public class TigerXmlReader * Write Penn Treebank bracketed structure information. Mind this may not work with all tagsets, * in particular not with such that contain "(" or ")" in their tags. The tree is generated * using the original tag set in the corpus, not using the mapped tagset! - * - * Default: {@code false} */ public static final String PARAM_READ_PENN_TREE = ComponentParameters.PARAM_READ_PENN_TREE; @ConfigurationParameter(name = PARAM_READ_PENN_TREE, mandatory = true, defaultValue = "false") @@ -140,8 +149,6 @@ public class TigerXmlReader /** * If a sentence has an illegal structure (e.g. TIGER 2.0 has non-terminal nodes that do not * have child nodes), then just ignore these sentences. - * - * Default: {@code false} */ public static final String PARAM_IGNORE_ILLEGAL_SENTENCES = "ignoreIllegalSentences"; @ConfigurationParameter(name = PARAM_IGNORE_ILLEGAL_SENTENCES, mandatory = true, defaultValue = "false") @@ -155,8 +162,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, mappingPosLocation, posTagset, + getLanguage()); } @Override @@ -261,7 +268,7 @@ protected void readSentence(JCasBuilder aBuilder, TigerSentence aSentence) Type posType = posMappingProvider.getTagType(t.pos); POS posAnno = (POS) aBuilder.getJCas().getCas().createAnnotation(posType, token.getBegin(), token.getEnd()); - posAnno.setPosValue(t.pos.intern()); + posAnno.setPosValue(t.pos != null ? t.pos.intern() : null); POSUtils.assignCoarseValue(posAnno); posAnno.addToIndexes(); token.setPos(posAnno); @@ -288,13 +295,15 @@ protected void readSentence(JCasBuilder aBuilder, TigerSentence aSentence) if (aSentence.sem != null) { if (aSentence.sem.splitwords != null) { // read splitwords as terminals/tokens - readSplit(aBuilder.getJCas(), terminals, aSentence.sem.splitwords, tokenIdToTextMap); + readSplit(aBuilder.getJCas(), terminals, aSentence.sem.splitwords, + tokenIdToTextMap); } readSem(aBuilder.getJCas(), terminals, nonterminals, aSentence.sem, tokenIdToTextMap); } } - private void readSplit(JCas jCas, Map terminals, List splitwords, Map tokenIdToTextMap) + private void readSplit(JCas jCas, Map terminals, List splitwords, + Map tokenIdToTextMap) { for (TigerSplitword split : splitwords) { Token orig = terminals.get(split.idref); @@ -312,7 +321,8 @@ private void readSplit(JCas jCas, Map terminals, List terminals, - Map nonterminals, TigerSem sem, Map tokenIdToTextMap) + Map nonterminals, TigerSem sem, + Map tokenIdToTextMap) { if (sem.frames != null) { for (TigerFrame frame : sem.frames) { @@ -457,9 +467,9 @@ private int[] getBoundaryOfFirstContiguousElement(Set frameTokenSet, String completeFrameTarget = ""; for (String word : tokenList) { String textRepresentation = tokenIdToTextMap.get(word); - if(textRepresentation == null){ + if (textRepresentation == null) { textRepresentation = ""; - for(String part:word.split(" ")) { + for (String part : word.split(" ")) { textRepresentation += tokenIdToTextMap.get(part) + " "; } textRepresentation = textRepresentation.trim(); diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlWriter.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/TigerXmlWriter.java similarity index 83% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlWriter.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/TigerXmlWriter.java index 0a63ad0398..75d24fd9a7 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlWriter.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/TigerXmlWriter.java @@ -15,9 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger; +package org.dkpro.core.io.tiger; import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.uima.fit.util.FSCollectionFactory.create; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -27,8 +28,6 @@ import java.util.List; import java.util.Map; -import javanet.staxutils.IndentingXMLEventWriter; - import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBElement; import javax.xml.bind.Marshaller; @@ -44,26 +43,30 @@ import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.tiger.internal.model.TigerEdge; +import org.dkpro.core.io.tiger.internal.model.TigerGraph; +import org.dkpro.core.io.tiger.internal.model.TigerNode; +import org.dkpro.core.io.tiger.internal.model.TigerNonTerminal; +import org.dkpro.core.io.tiger.internal.model.TigerSentence; +import org.dkpro.core.io.tiger.internal.model.TigerTerminal; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerEdge; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerGraph; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerNode; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerNonTerminal; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerSentence; -import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerTerminal; +import eu.openminted.share.annotations.api.DocumentationResource; +import javanet.staxutils.IndentingXMLEventWriter; /** * UIMA CAS consumer writing the CAS document text in the TIGER-XML format. */ -@ResourceMetaData(name="TIGER-XML Writer") +@ResourceMetaData(name = "TIGER-XML Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_X_TIGER_XML}) @TypeCapability( inputs = { @@ -79,10 +82,19 @@ public class TigerXmlWriter extends JCasFileWriter_ImplBase * Specify the suffix of output files. Default value .xml. If the suffix is not * needed, provide an empty string as value. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml") private String filenameSuffix; + /** + * Character encoding of the output data. + */ + public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String targetEncoding; + @Override public void process(JCas aJCas) throws AnalysisEngineProcessException @@ -94,7 +106,7 @@ public void process(JCas aJCas) XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance(); xmlEventWriter = new IndentingXMLEventWriter( - xmlOutputFactory.createXMLEventWriter(docOS)); + xmlOutputFactory.createXMLEventWriter(docOS, targetEncoding)); JAXBContext context = JAXBContext.newInstance(TigerSentence.class); Marshaller marshaller = context.createMarshaller(); @@ -184,7 +196,7 @@ protected TigerSentence convertSentence(Sentence aSentence, int aSentNum) // Convert the parse tree (pass 2: edges) for (Constituent constituent : constituents) { TigerNode node = nodes.get(constituent); - for (FeatureStructure c : FSCollectionFactory.create(constituent.getChildren())) { + for (FeatureStructure c : create((FSArray) constituent.getChildren())) { if (c instanceof Constituent) { String synFun = ((Constituent) c).getSyntacticFunction(); TigerEdge edge = new TigerEdge(); diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/IllegalAnnotationStructureException.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/IllegalAnnotationStructureException.java similarity index 95% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/IllegalAnnotationStructureException.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/IllegalAnnotationStructureException.java index af106eeb51..494e65f5c3 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/IllegalAnnotationStructureException.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/IllegalAnnotationStructureException.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal; +package org.dkpro.core.io.tiger.internal; /** * Thrown when the annotation structure is illegal. diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/AnnotationDecl.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/AnnotationDecl.java similarity index 93% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/AnnotationDecl.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/AnnotationDecl.java index 16e57fcac3..86558ae0d2 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/AnnotationDecl.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/AnnotationDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -25,8 +25,10 @@ public class AnnotationDecl { @XmlElement(name = "feature") public List features; + @XmlElement(name = "edgelabel") public List edgeLabels; + @XmlElement(name = "secedgelabel") public List secEdgeLabels; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/EdgeLabelDecl.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/EdgeLabelDecl.java similarity index 92% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/EdgeLabelDecl.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/EdgeLabelDecl.java index c32fce3afe..29b9448db7 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/EdgeLabelDecl.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/EdgeLabelDecl.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; public class EdgeLabelDecl { public List values; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/FeatureDecl.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/FeatureDecl.java similarity index 93% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/FeatureDecl.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/FeatureDecl.java index 9035d68bbe..580b9c9194 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/FeatureDecl.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/FeatureDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -26,8 +26,10 @@ public class FeatureDecl { @XmlAttribute public String name; + @XmlAttribute public String domain; + @XmlElement(name = "value") public List values; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/Meta.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/Meta.java similarity index 94% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/Meta.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/Meta.java index 81886f25ec..4738ae8da5 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/Meta.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/Meta.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; public class Meta { @@ -31,4 +31,4 @@ public String toString() return "Meta [name=" + name + ", author=" + author + ", date=" + date + ", description=" + description + ", format=" + format + "]"; } -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerEdge.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerEdge.java similarity index 92% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerEdge.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerEdge.java index 5e5b1eec61..cad3bd6414 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerEdge.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerEdge.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import javax.xml.bind.annotation.XmlAttribute; @@ -23,6 +23,7 @@ public class TigerEdge { @XmlAttribute public String idref; + @XmlAttribute public String label; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFeNode.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFeNode.java similarity index 92% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFeNode.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFeNode.java index 47386eafda..1c381ad01d 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFeNode.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFeNode.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import javax.xml.bind.annotation.XmlAttribute; @@ -23,6 +23,7 @@ public class TigerFeNode { @XmlAttribute public String idref; + @XmlAttribute public Boolean is_split; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFrame.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFrame.java similarity index 94% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFrame.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFrame.java index 6f48c1e5f9..05f1413070 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFrame.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFrame.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -26,10 +26,13 @@ public class TigerFrame { @XmlAttribute public String id; + @XmlAttribute public String name; + @XmlElement(name = "fe") public List fes; + @XmlElement(name = "target") public TigerTarget target; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFrameElement.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFrameElement.java similarity index 93% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFrameElement.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFrameElement.java index 160374020e..333c9e4b36 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerFrameElement.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerFrameElement.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -26,8 +26,10 @@ public class TigerFrameElement { @XmlAttribute public String id; + @XmlAttribute public String name; + @XmlElement(name = "fenode") public List fenodes; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerGraph.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerGraph.java similarity index 95% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerGraph.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerGraph.java index a95932cfb1..85e51efaac 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerGraph.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerGraph.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -27,11 +27,14 @@ public class TigerGraph { @XmlAttribute public String root; + @XmlAttribute public boolean discontinuous; + @XmlElementWrapper(name = "terminals") @XmlElement(name = "t") public List terminals; + @XmlElementWrapper(name = "nonterminals") @XmlElement(name = "nt") public List nonTerminals; @@ -50,4 +53,4 @@ public TigerNode get(String aId) } return null; } -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerNode.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerNode.java similarity index 93% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerNode.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerNode.java index 05465f3309..91f30ca7f2 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerNode.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerNode.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -26,8 +26,10 @@ public class TigerNode { @XmlAttribute public String id; + @XmlElement(name = "edge") public List edges; + @XmlElement(name = "secedge") public List secEdges; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerNonTerminal.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerNonTerminal.java similarity index 92% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerNonTerminal.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerNonTerminal.java index 2d142c01be..53d1de9140 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerNonTerminal.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerNonTerminal.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import javax.xml.bind.annotation.XmlAttribute; @@ -24,4 +24,4 @@ public class TigerNonTerminal { @XmlAttribute public String cat; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerPart.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerPart.java similarity index 92% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerPart.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerPart.java index fb28a596f4..9bd87f6d39 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerPart.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerPart.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import javax.xml.bind.annotation.XmlAttribute; @@ -23,6 +23,7 @@ public class TigerPart { @XmlAttribute public String id; + @XmlAttribute public String word; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSem.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSem.java similarity index 94% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSem.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSem.java index abe3adbeb1..3bebe95f9d 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSem.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSem.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -31,4 +31,4 @@ public class TigerSem @XmlElementWrapper(name = "splitwords") @XmlElement(name = "splitword") public List splitwords; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSentence.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSentence.java similarity index 94% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSentence.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSentence.java index d8ef39386b..0f37b7baed 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSentence.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSentence.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlID; @@ -25,7 +25,9 @@ public class TigerSentence @XmlID @XmlAttribute public String id; + public TigerGraph graph; + public TigerSem sem; public String getText() @@ -39,4 +41,4 @@ public String getText() } return sb.toString(); } -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSplitword.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSplitword.java similarity index 93% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSplitword.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSplitword.java index 7716bf011c..8aadfd0837 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerSplitword.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerSplitword.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -26,6 +26,7 @@ public class TigerSplitword { @XmlAttribute public String idref = null; + @XmlElement(name = "part") public List parts; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerTarget.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerTarget.java similarity index 92% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerTarget.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerTarget.java index ee4da5e0a6..cf6724bea2 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerTarget.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerTarget.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import java.util.List; @@ -25,4 +25,4 @@ public class TigerTarget { @XmlElement(name = "fenode") public List fenodes; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerTerminal.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerTerminal.java similarity index 92% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerTerminal.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerTerminal.java index 969af407fe..9863164cc3 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/TigerTerminal.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/TigerTerminal.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import javax.xml.bind.annotation.XmlAttribute; @@ -24,24 +24,34 @@ public class TigerTerminal { @XmlAttribute public String word; + @XmlAttribute public String lemma; + @XmlAttribute public String pos; + @XmlAttribute public String morph; + @XmlAttribute(name = "case") public String casus; + @XmlAttribute public String number; + @XmlAttribute public String gender; + @XmlAttribute public String person; + @XmlAttribute public String degree; + @XmlAttribute public String tense; + @XmlAttribute public String mood; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/ValueDecl.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/ValueDecl.java similarity index 93% rename from dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/ValueDecl.java rename to dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/ValueDecl.java index 9b2d78b83f..e47eebedff 100644 --- a/dkpro-core-io-tiger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/internal/model/ValueDecl.java +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/internal/model/ValueDecl.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model; +package org.dkpro.core.io.tiger.internal.model; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlValue; @@ -32,4 +32,4 @@ public String toString() { return "ValueDecl [name=" + name + ", value=" + value + "]"; } -} \ No newline at end of file +} diff --git a/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/package-info.java b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/package-info.java new file mode 100644 index 0000000000..2915ba7f78 --- /dev/null +++ b/dkpro-core-io-tiger-asl/src/main/java/org/dkpro/core/io/tiger/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for the TIGER-XML format. + */ +package org.dkpro.core.io.tiger; diff --git a/dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReaderTest.java b/dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlReaderTest.java similarity index 77% rename from dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReaderTest.java rename to dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlReaderTest.java index bcd72ea5c5..a127662d54 100644 --- a/dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReaderTest.java +++ b/dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlReaderTest.java @@ -15,32 +15,34 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger; +package org.dkpro.core.io.tiger; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.*; -import static org.apache.uima.fit.pipeline.SimplePipeline.*; -import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.iteratePipeline; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; import static org.junit.Assert.assertEquals; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.*; - import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReader; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.conll.Conll2012Writer; +import org.dkpro.core.io.tiger.TigerXmlReader; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2012Writer; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class TigerXmlReaderTest { @@ -63,7 +65,7 @@ public void test() AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); } - @Test(expected=IllegalStateException.class) + @Test(expected = IllegalStateException.class) public void test2() throws Exception { @@ -125,15 +127,17 @@ public void testNoncontiguousFrameTarget() TigerXmlReader.PARAM_LANGUAGE, "de", TigerXmlReader.PARAM_READ_PENN_TREE, true); - int[][] frameRanges = new int[][] {{4, 11}, {33, 47}, {71, 74}, {112, 138}, {143, 147}, {246, 255}}; + int[][] frameRanges = new int[][] { { 4, 11 }, { 33, 47 }, { 71, 74 }, { 112, 138 }, + { 143, 147 }, { 246, 255 } }; for (JCas cas : iteratePipeline(reader, new AnalysisEngineDescription[] {})) { - for (Sentence sentence : select(cas, Sentence.class)){ - for(SemPred frame: selectCovered(SemPred.class, sentence)){ - System.out.println("frame boundary " + frame.getBegin() + " : " + frame.getEnd()); + for (Sentence sentence : select(cas, Sentence.class)) { + for (SemPred frame : selectCovered(SemPred.class, sentence)) { + System.out + .println("frame boundary " + frame.getBegin() + " : " + frame.getEnd()); boolean found = false; - for(int[] element:frameRanges){ - if(element[0] == frame.getBegin() && element[1] == frame.getEnd()){ + for (int[] element : frameRanges) { + if (element[0] == frame.getBegin() && element[1] == frame.getEnd()) { found = true; break; } @@ -154,7 +158,8 @@ public void testFrameTargetHavingMultipleChildren() TigerXmlReader.PARAM_LANGUAGE, "de", TigerXmlReader.PARAM_READ_PENN_TREE, true); - int[][] frameRanges = new int[][] {{26, 41}, {54, 61}, {64, 85}, {97, 104}, {120, 130}, {135, 151}, {152, 169}}; + int[][] frameRanges = new int[][] { { 26, 41 }, { 54, 61 }, { 64, 85 }, { 97, 104 }, + { 120, 130 }, { 135, 151 }, { 152, 169 } }; /* Frame targets: * Glaubwürdigkeit * wichtig @@ -166,12 +171,13 @@ public void testFrameTargetHavingMultipleChildren() * **/ for (JCas cas : iteratePipeline(reader, new AnalysisEngineDescription[] {})) { - for (Sentence sentence : select(cas, Sentence.class)){ - for(SemPred frame: selectCovered(SemPred.class, sentence)){ - System.out.println("frame target text [" +frame.getCoveredText() + "], frame boundary " + frame.getBegin() + " : " + frame.getEnd()); + for (Sentence sentence : select(cas, Sentence.class)) { + for (SemPred frame : selectCovered(SemPred.class, sentence)) { + System.out.println("frame target text [" + frame.getCoveredText() + + "], frame boundary " + frame.getBegin() + " : " + frame.getEnd()); boolean found = false; - for(int[] element:frameRanges){ - if(element[0] == frame.getBegin() && element[1] == frame.getEnd()){ + for (int[] element : frameRanges) { + if (element[0] == frame.getBegin() && element[1] == frame.getEnd()) { found = true; break; } @@ -198,15 +204,17 @@ public void testContiguousFrameTarget() * it spans over 2 tokens "schlage" and "mit", so the boundary should be * schlage.begin and mit.end ==> (4, 15) */ - int[][] frameRanges = new int[][] {{4, 15}, {33, 47}, {71, 74}, {112, 138}, {143, 147}, {246, 255}}; + int[][] frameRanges = new int[][] { { 4, 15 }, { 33, 47 }, { 71, 74 }, { 112, 138 }, + { 143, 147 }, { 246, 255 } }; for (JCas cas : iteratePipeline(reader, new AnalysisEngineDescription[] {})) { - for (Sentence sentence : select(cas, Sentence.class)){ - for(SemPred frame: selectCovered(SemPred.class, sentence)){ - System.out.println("frame boundary " + frame.getBegin() + " : " + frame.getEnd()); + for (Sentence sentence : select(cas, Sentence.class)) { + for (SemPred frame : selectCovered(SemPred.class, sentence)) { + System.out + .println("frame boundary " + frame.getBegin() + " : " + frame.getEnd()); boolean found = false; - for(int[] element:frameRanges){ - if(element[0] == frame.getBegin() && element[1] == frame.getEnd()){ + for (int[] element : frameRanges) { + if (element[0] == frame.getBegin() && element[1] == frame.getEnd()) { found = true; break; } diff --git a/dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReaderWriterTest.java b/dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlReaderWriterTest.java similarity index 93% rename from dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReaderWriterTest.java rename to dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlReaderWriterTest.java index 924a4ab35f..b64b973b22 100644 --- a/dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlReaderWriterTest.java +++ b/dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlReaderWriterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger; +package org.dkpro.core.io.tiger; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -29,9 +29,11 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; import org.custommonkey.xmlunit.XMLAssert; +import org.dkpro.core.io.tiger.TigerXmlReader; +import org.dkpro.core.io.tiger.TigerXmlWriter; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class TigerXmlReaderWriterTest { diff --git a/dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlWriterTest.java b/dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlWriterTest.java similarity index 90% rename from dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlWriterTest.java rename to dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlWriterTest.java index 0ca039f15d..43a6ec6f86 100644 --- a/dkpro-core-io-tiger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/tiger/TigerXmlWriterTest.java +++ b/dkpro-core-io-tiger-asl/src/test/java/org/dkpro/core/io/tiger/TigerXmlWriterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tiger; +package org.dkpro.core.io.tiger; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; @@ -27,12 +27,14 @@ import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; import org.custommonkey.xmlunit.XMLAssert; +import org.dkpro.core.io.tiger.TigerXmlWriter; +import org.dkpro.core.opennlp.OpenNlpParser; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpParser; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class TigerXmlWriterTest { diff --git a/dkpro-core-io-tiger-asl/src/test/resources/log4j.properties b/dkpro-core-io-tiger-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-tiger-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-tiger-asl/src/test/resources/log4j2.xml b/dkpro-core-io-tiger-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-tiger-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-tiger-asl/src/test/resources/semeval1010-en-sample.conll b/dkpro-core-io-tiger-asl/src/test/resources/semeval1010-en-sample.conll index 2792a02c97..55365fe81c 100644 --- a/dkpro-core-io-tiger-asl/src/test/resources/semeval1010-en-sample.conll +++ b/dkpro-core-io-tiger-asl/src/test/resources/semeval1010-en-sample.conll @@ -1,35 +1,35 @@ #begin document (semeval1010-en-sample.xml); part 000 -semeval1010-en-sample.xml 0 0 Save VB (S* save - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 1 for IN (VP* for - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 2 this DT (PP(PP* this - - - * * (Current* * * * * * * * * - -semeval1010-en-sample.xml 0 3 one CD (NPB(ADJP* one - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 4 excursion NN * excursion Travel - - * (V*) * (V*) (V*) (V*) (V*) (V*) (V*) (V*) (V*) - -semeval1010-en-sample.xml 0 5 , PUNC, (ADJP* , - - - * * *) * * * * * * * * - -semeval1010-en-sample.xml 0 6 he PRP *)) he Coreference - - * (Traveler) (V*) (Current) (Coreferent) (V*) (V*) (Self_mover) (Interlocutor_1) (V*) (Coreferent) - -semeval1010-en-sample.xml 0 7 spent VBD * spend - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 8 his PRP$ *)) his Coreference - - * (V*) (V*) (V*) (Current) (V*) (V*) (V*) (V*) (V*) (V*) - -semeval1010-en-sample.xml 0 9 days NNS * day Calendric_unit - - * (V*) (V*) (V*) (V*) (Unit) (V*) (V*) (V*) (V*) (V*) - -semeval1010-en-sample.xml 0 10 in IN (PP* in - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 11 long JJ (VP* long Duration - - * (V*) (V*) (V*) (V*) (V*) (V*) (V*) (V*) (V*) (V*) - -semeval1010-en-sample.xml 0 12 and CC (PP* and - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 13 often RB (NP(NPB* often - - - * * * * * * * (Depictive* * * * - -semeval1010-en-sample.xml 0 14 solitary JJ *) solitary - - - * * * * * * * *) * * * - -semeval1010-en-sample.xml 0 15 walks VBZ (PP* walk Self_motion - - * (V*) (V*) (V*) (V*) (V*) (Eventuality) (V*) (V*) (V*) (V*) - -semeval1010-en-sample.xml 0 16 , PUNC, (NP(NPB* , - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 17 or CC *) or - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 18 in IN (SBAR(WHNP* in - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 19 chatting VBG *) chat Chatting - - * (V*) (V*) (V*) (V*) (V*) (V*) (V*) (V*) (V*) (V*) - -semeval1010-en-sample.xml 0 20 with IN (S* with - - - * * * * * * * * (Interlocutor_2* * * - -semeval1010-en-sample.xml 0 21 a DT (VP* a - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 22 number NN (VP* number Quantity - - * (V*) (V*) (V*) (V*) (V*) (V*) (V*) * (Quantity) (V*) - -semeval1010-en-sample.xml 0 23 of IN *))))))))))) of - - - * * * * * * * * * (Individuals* * - -semeval1010-en-sample.xml 0 24 village NN (NPB* village - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 25 gossips NNS *)) gossip - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 26 whose WP$ (VP* whose - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 27 acquaintance NN (PP* acquaintance - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 28 he PRP (NPB* he Coreference - - * (V*) (V*) (V*) (V*) (V*) (V*) (V*) * * (Current) - -semeval1010-en-sample.xml 0 29 had VBD * have - - - * * * * * * * * * * * - -semeval1010-en-sample.xml 0 30 cultivated VBN * cultivate - - - * * (Current) * * * * * * * * - -semeval1010-en-sample.xml 0 31 . PUNC. *)))) . - - - * * * * * * * * *) *) * - +semeval1010-en-sample.xml 0 0 Save VB (S* save - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 1 for IN (VP* for - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 2 this DT (PP(PP* this - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 3 one CD (NPB(ADJP* one - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 4 excursion NN * excursion Travel - - * (V*) * * * * * * * * - +semeval1010-en-sample.xml 0 5 , PUNC, (ADJP* , - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 6 he PRP *)) he Coreference - - * (Traveler) (V*) (Coreferent) * * (Self_mover) (Interlocutor_1) * (Coreferent) - +semeval1010-en-sample.xml 0 7 spent VBD * spend - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 8 his PRP$ *)) his Coreference - - * * * (V*) * * * * * * - +semeval1010-en-sample.xml 0 9 days NNS * day Calendric_unit - - * * * * (V*) * * * * * - +semeval1010-en-sample.xml 0 10 in IN (PP* in - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 11 long JJ (VP* long Duration - - * * * * * (V*) * * * * - +semeval1010-en-sample.xml 0 12 and CC (PP* and - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 13 often RB (NP(NPB* often - - - * * * * * * (Depictive* * * * - +semeval1010-en-sample.xml 0 14 solitary JJ *) solitary - - - * * * * * * *) * * * - +semeval1010-en-sample.xml 0 15 walks VBZ (PP* walk Self_motion - - * * * * * (Eventuality) (V*) * * * - +semeval1010-en-sample.xml 0 16 , PUNC, (NP(NPB* , - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 17 or CC *) or - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 18 in IN (SBAR(WHNP* in - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 19 chatting VBG *) chat Chatting - - * * * * * * * (V*) * * - +semeval1010-en-sample.xml 0 20 with IN (S* with - - - * * * * * * * (Interlocutor_2* * * - +semeval1010-en-sample.xml 0 21 a DT (VP* a - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 22 number NN (VP* number Quantity - - * * * * * * * * (V*) * - +semeval1010-en-sample.xml 0 23 of IN *))))))))))) of - - - * * * * * * * * (Individuals* * - +semeval1010-en-sample.xml 0 24 village NN (NPB* village - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 25 gossips NNS *)) gossip - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 26 whose WP$ (VP* whose - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 27 acquaintance NN (PP* acquaintance - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 28 he PRP (NPB* he Coreference - - * * * * * * * * * (V*) - +semeval1010-en-sample.xml 0 29 had VBD * have - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 30 cultivated VBN * cultivate - - - * * * * * * * * * * - +semeval1010-en-sample.xml 0 31 . PUNC. *)))) . - - - * * * * * * * *) *) * - -#end document +#end document \ No newline at end of file diff --git a/dkpro-core-io-tiger-asl/src/test/resources/semeval1010-sample.xml.dump b/dkpro-core-io-tiger-asl/src/test/resources/semeval1010-sample.xml.dump index c75d573b68..46449797a3 100644 --- a/dkpro-core-io-tiger-asl/src/test/resources/semeval1010-sample.xml.dump +++ b/dkpro-core-io-tiger-asl/src/test/resources/semeval1010-sample.xml.dump @@ -25,7 +25,7 @@ PennTree sofa: _InitialView begin: 0 end: 169 - PennTree: "(S (PRP he) (VP (VBD spent) (PP (PP (IN in) (NPB (ADJP (JJ long) (CC and) (ADJP (RB often) (JJ solitary))) (VBZ walks) (PUNC, ,))) (CC or) (PP (IN in) (VP (VBG chatting) (PP (IN with) (NP (NPB (DT a) (NN number)) (PP (IN of) (NP (NPB (NN village) (NNS gossips)) (SBAR (WHNP (WP$ whose) (NN acquaintance)) (S (PRP he) (VP (VBD had) (VP (VBN cultivated) (PUNC. .)))))))))))) (NPB (PRP$ his) (NNS days))) (VP (VB Save) (PP (IN for) (NPB (DT this) (CD one) (NN excursion) (PUNC, ,)))))" + PennTree: "(S (PRP he) (VP (VBD spent) (PP (PP (IN in) (NPB (ADJP (JJ long) (CC and) (ADJP ..." [Save for this one excursion , he spent his days in long and often solitary walks , or in chatting with a number of village gossips whose acquaintance he had cultivated .] ROOT sofa: _InitialView @@ -88,6 +88,7 @@ Token PosValue: "VB" coarseValue: "VERB" id: "s115_0" + order: 0 [for this one excursion ,] Constituent sofa: _InitialView @@ -155,6 +156,7 @@ Token PosValue: "IN" coarseValue: "ADP" id: "s115_1" + order: 0 [this one excursion ,] SemArg sofa: _InitialView @@ -239,6 +241,7 @@ Token PosValue: "DT" coarseValue: "DET" id: "s115_2" + order: 0 [one] POS_NUM sofa: _InitialView @@ -293,6 +296,7 @@ Token PosValue: "CD" coarseValue: "NUM" id: "s115_3" + order: 0 [excursion] POS_NOUN sofa: _InitialView @@ -347,6 +351,7 @@ Token PosValue: "NN" coarseValue: "NOUN" id: "s115_4" + order: 0 [excursion] SemPred sofa: _InitialView @@ -413,6 +418,7 @@ Token end: 29 PosValue: "PUNC," id: "s115_5" + order: 0 [he] POS_PRON sofa: _InitialView @@ -449,6 +455,7 @@ Token PosValue: "PRP" coarseValue: "PRON" id: "s115_6" + order: 0 [he] SemArg sofa: _InitialView @@ -541,6 +548,7 @@ Token PosValue: "VBD" coarseValue: "VERB" id: "s115_7" + order: 0 [his days] Constituent sofa: _InitialView @@ -608,6 +616,7 @@ Token PosValue: "PRP$" coarseValue: "PRON" id: "s115_8" + order: 0 [his] SemArg sofa: _InitialView @@ -668,6 +677,7 @@ Token PosValue: "NNS" coarseValue: "NOUN" id: "s115_9" + order: 0 [days] SemArg sofa: _InitialView @@ -778,6 +788,7 @@ Token PosValue: "IN" coarseValue: "ADP" id: "s115_10" + order: 0 [long and often solitary walks ,] Constituent sofa: _InitialView @@ -912,6 +923,7 @@ Token PosValue: "JJ" coarseValue: "ADJ" id: "s115_11" + order: 0 [long] SemPred sofa: _InitialView @@ -985,6 +997,7 @@ Token PosValue: "CC" coarseValue: "CONJ" id: "s115_12" + order: 0 [often solitary] SemArg sofa: _InitialView @@ -1105,6 +1118,7 @@ Token PosValue: "RB" coarseValue: "ADV" id: "s115_13" + order: 0 [solitary] POS_ADJ sofa: _InitialView @@ -1177,6 +1191,7 @@ Token PosValue: "JJ" coarseValue: "ADJ" id: "s115_14" + order: 0 [walks] POS_VERB sofa: _InitialView @@ -1237,6 +1252,7 @@ Token PosValue: "VBZ" coarseValue: "VERB" id: "s115_15" + order: 0 [walks] SemArg sofa: _InitialView @@ -1307,6 +1323,7 @@ Token end: 82 PosValue: "PUNC," id: "s115_16" + order: 0 [or] POS_CONJ sofa: _InitialView @@ -1355,6 +1372,7 @@ Token PosValue: "CC" coarseValue: "CONJ" id: "s115_17" + order: 0 [in chatting with a number of village gossips whose acquaintance he had cultivated .] Constituent sofa: _InitialView @@ -1434,6 +1452,7 @@ Token PosValue: "IN" coarseValue: "ADP" id: "s115_18" + order: 0 [chatting with a number of village gossips whose acquaintance he had cultivated .] Constituent sofa: _InitialView @@ -1525,6 +1544,7 @@ Token PosValue: "VBG" coarseValue: "VERB" id: "s115_19" + order: 0 [chatting] SemPred sofa: _InitialView @@ -1640,6 +1660,7 @@ Token PosValue: "IN" coarseValue: "ADP" id: "s115_20" + order: 0 [a number of village gossips whose acquaintance he had cultivated .] Constituent sofa: _InitialView @@ -1810,6 +1831,7 @@ Token PosValue: "DT" coarseValue: "DET" id: "s115_21" + order: 0 [number] POS_NOUN sofa: _InitialView @@ -1888,6 +1910,7 @@ Token PosValue: "NN" coarseValue: "NOUN" id: "s115_22" + order: 0 [number] SemArg sofa: _InitialView @@ -2032,6 +2055,7 @@ Token PosValue: "IN" coarseValue: "ADP" id: "s115_23" + order: 0 [village gossips whose acquaintance he had cultivated .] Constituent sofa: _InitialView @@ -2238,6 +2262,7 @@ Token PosValue: "NN" coarseValue: "NOUN" id: "s115_24" + order: 0 [gossips] POS_NOUN sofa: _InitialView @@ -2328,6 +2353,7 @@ Token PosValue: "NNS" coarseValue: "NOUN" id: "s115_25" + order: 0 [whose acquaintance he had cultivated .] Constituent sofa: _InitialView @@ -2552,6 +2578,7 @@ Token PosValue: "WP$" coarseValue: "PRON" id: "s115_26" + order: 0 [acquaintance] POS_NOUN sofa: _InitialView @@ -2648,6 +2675,7 @@ Token PosValue: "NN" coarseValue: "NOUN" id: "s115_27" + order: 0 [he had cultivated .] Constituent sofa: _InitialView @@ -2811,6 +2839,7 @@ Token PosValue: "PRP" coarseValue: "PRON" id: "s115_28" + order: 0 [he] SemArg sofa: _InitialView @@ -2998,6 +3027,7 @@ Token PosValue: "VBD" coarseValue: "VERB" id: "s115_29" + order: 0 [cultivated .] Constituent sofa: _InitialView @@ -3185,6 +3215,7 @@ Token PosValue: "VBN" coarseValue: "VERB" id: "s115_30" + order: 0 [cultivated] SemArg sofa: _InitialView @@ -3296,6 +3327,7 @@ Token end: 169 PosValue: "PUNC." id: "s115_31" + order: 0 -------- View _InitialView end ---------------------------------- ======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-tiger-asl/src/test/resources/tiger-sample.xml.dump b/dkpro-core-io-tiger-asl/src/test/resources/tiger-sample.xml.dump index d01b1b8897..9c95b053f1 100644 --- a/dkpro-core-io-tiger-asl/src/test/resources/tiger-sample.xml.dump +++ b/dkpro-core-io-tiger-asl/src/test/resources/tiger-sample.xml.dump @@ -25,7 +25,7 @@ PennTree sofa: _InitialView begin: 0 end: 56 - PennTree: "(VROOT ($( ``) (S (PN-SB (NE Ross) (NE Perot)) (VAFIN wäre) (ADV vielleicht) (NP-PD (ART ein) (ADJA prächtiger) (NN Diktator))) ($( ''))" + PennTree: "(VROOT ($( ``) (S (PN-SB (NE Ross) (NE Perot)) (VAFIN wäre) (ADV vielleicht) (NP..." [`` Ross Perot wäre vielleicht ein prächtiger Diktator ''] ROOT sofa: _InitialView @@ -69,6 +69,7 @@ Token PosValue: "$(" coarseValue: "PUNCT" id: "s1_1" + order: 0 [Ross Perot wäre vielleicht ein prächtiger Diktator] Constituent sofa: _InitialView @@ -151,6 +152,7 @@ Token PosValue: "NE" coarseValue: "PROPN" id: "s1_2" + order: 0 [Perot] POS_PROPN sofa: _InitialView @@ -200,6 +202,7 @@ Token PosValue: "NE" coarseValue: "PROPN" id: "s1_3" + order: 0 [wäre] POS_VERB sofa: _InitialView @@ -242,6 +245,7 @@ Token PosValue: "VAFIN" coarseValue: "VERB" id: "s1_4" + order: 0 [vielleicht] POS_ADV sofa: _InitialView @@ -284,6 +288,7 @@ Token PosValue: "ADV" coarseValue: "ADV" id: "s1_5" + order: 0 [ein prächtiger Diktator] Constituent sofa: _InitialView @@ -353,6 +358,7 @@ Token PosValue: "ART" coarseValue: "DET" id: "s1_6" + order: 0 [prächtiger] POS_ADJ sofa: _InitialView @@ -402,6 +408,7 @@ Token PosValue: "ADJA" coarseValue: "ADJ" id: "s1_7" + order: 0 [Diktator] POS_NOUN sofa: _InitialView @@ -451,6 +458,7 @@ Token PosValue: "NN" coarseValue: "NOUN" id: "s1_8" + order: 0 [''] POS_PUNCT sofa: _InitialView @@ -487,6 +495,7 @@ Token PosValue: "$(" coarseValue: "PUNCT" id: "s1_9" + order: 0 -------- View _InitialView end ---------------------------------- ======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-tika-asl/pom.xml b/dkpro-core-io-tika-asl/pom.xml index 281688e00e..3cfb34dc1a 100644 --- a/dkpro-core-io-tika-asl/pom.xml +++ b/dkpro-core-io-tika-asl/pom.xml @@ -18,17 +18,17 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - org.dkpro.core dkpro-core-io-tika-asl jar DKPro Core ASL - IO - Tika (v ${tika.version}) + https://dkpro.github.io/dkpro-core/ - 1.13 + 1.25 @@ -50,14 +50,41 @@ ${tika.version} - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + xml-apis + xml-apis + + + org.dkpro.core + dkpro-core-api-io-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + org.slf4j + slf4j-api junit junit test + + org.slf4j + jul-to-slf4j + test + + + org.slf4j + jcl-over-slf4j + test + + + org.slf4j + slf4j-log4j12 + test + @@ -69,6 +96,13 @@ org.apache.tika:tika-parsers + + org.slf4j:slf4j-api + org.slf4j:jcl-over-slf4j + org.slf4j:slf4j-log4j12 + org.slf4j:jul-to-slf4j diff --git a/dkpro-core-io-tika-asl/src/main/java/org/dkpro/core/io/tika/TikaReader.java b/dkpro-core-io-tika-asl/src/main/java/org/dkpro/core/io/tika/TikaReader.java index d1196a5418..bc4ec119b6 100644 --- a/dkpro-core-io-tika-asl/src/main/java/org/dkpro/core/io/tika/TikaReader.java +++ b/dkpro-core-io-tika-asl/src/main/java/org/dkpro/core/io/tika/TikaReader.java @@ -20,25 +20,52 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; + import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.EmptyParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; +import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; import org.xml.sax.SAXException; -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reader for many file formats based on Apache Tika. */ -@ResourceMetaData(name="Tika Multi-Format Reader") +@ResourceMetaData(name = "Tika Multi-Format Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) public class TikaReader extends ResourceCollectionReaderBase { + /** + * Parse embedded documents in addition to the main document. + */ + public static final String PARAM_PARSE_EMBEDDED_DOCUMENTS = "parseEmbeddedDocuments"; + @ConfigurationParameter(name = PARAM_PARSE_EMBEDDED_DOCUMENTS, mandatory = false, + defaultValue = "false") + private boolean parseEmbeddedDocuments; + + /** + * Internal buffer size. If the buffer size is exceeded, the reader will throw an exception + * (-1 means unlimited size). + */ + public static final String PARAM_BUFFER_SIZE = "bufferSize"; + @ConfigurationParameter(name = PARAM_BUFFER_SIZE, mandatory = false, defaultValue = "-1") + private int bufferSize; + // The Tika parser private AutoDetectParser parser; @@ -64,14 +91,23 @@ public void getNext(CAS cas) initCas(cas, fileResource); // Parse the document, docText stores the parsed text + BodyContentHandler handler = new BodyContentHandler(bufferSize); + // Give hints to NameDetector about the filename + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, new File(fileResource.getPath()).getName()); + + // If we process embedded documents, we use the auto-detect parser recursively + ParseContext parseContext = new ParseContext(); + if (parseEmbeddedDocuments) { + parseContext.set(Parser.class, parser); + } + else { + parseContext.set(Parser.class, new EmptyParser()); + } + String docText = null; try (InputStream in = fileResource.getInputStream()) { - // Give hints to NameDetector about the filename - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, new File(fileResource.getPath()).getName()); - BodyContentHandler handler = new BodyContentHandler(-1); - parser.parse(in, handler, metadata); - + parser.parse(in, handler, metadata, parseContext); docText = handler.toString(); } catch (SAXException | TikaException e) { diff --git a/dkpro-core-io-tika-asl/src/test/java/org/dkpro/core/io/tika/TikaReaderTest.java b/dkpro-core-io-tika-asl/src/test/java/org/dkpro/core/io/tika/TikaReaderTest.java index 26b5446164..6fb62dcb8a 100644 --- a/dkpro-core-io-tika-asl/src/test/java/org/dkpro/core/io/tika/TikaReaderTest.java +++ b/dkpro-core-io-tika-asl/src/test/java/org/dkpro/core/io/tika/TikaReaderTest.java @@ -52,4 +52,17 @@ public void testOdt() assertEquals("This is a test. And here is another one.\n\n", jcas.getDocumentText()); } + + @Test + public void testDoc() + throws Exception + { + CollectionReaderDescription reader = createReaderDescription( + TikaReader.class, + TikaReader.PARAM_SOURCE_LOCATION, "src/test/resources/doc/test.doc"); + + JCas jcas = new JCasIterable(reader).iterator().next(); + + assertEquals("test\n\n", jcas.getDocumentText()); + } } diff --git a/dkpro-core-io-tika-asl/src/test/resources/doc/test.doc b/dkpro-core-io-tika-asl/src/test/resources/doc/test.doc new file mode 100644 index 0000000000..93198c87cb Binary files /dev/null and b/dkpro-core-io-tika-asl/src/test/resources/doc/test.doc differ diff --git a/dkpro-core-io-tika-asl/src/test/resources/log4j2.xml b/dkpro-core-io-tika-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-tika-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-tuebadz-asl/pom.xml b/dkpro-core-io-tuebadz-asl/pom.xml index 8d9dd34f1b..0b4f04bfd5 100644 --- a/dkpro-core-io-tuebadz-asl/pom.xml +++ b/dkpro-core-io-tuebadz-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.tuebadz-asl + dkpro-core-io-tuebadz-asl jar DKPro Core ASL - IO - TUEBADZ + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -44,28 +45,32 @@ commons-lang3 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.syntax-asl + org.dkpro.core + dkpro-core-api-syntax-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api diff --git a/dkpro-core-io-tuebadz-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuebadz/TuebaDZReader.java b/dkpro-core-io-tuebadz-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuebadz/TuebaDZReader.java deleted file mode 100644 index 0284ccebcb..0000000000 --- a/dkpro-core-io-tuebadz-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuebadz/TuebaDZReader.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tuebadz; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.IobDecoder; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; - -/** - * Reads the Tüba-D/Z chunking format. - * - * @see TüBA-D/Z - * Web page - */ -@ResourceMetaData(name="Tüba-D/Z Chunking Format Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_X_TUEBADZ_CHUNK}) -@TypeCapability(outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk"}) -public class TuebaDZReader - extends JCasResourceCollectionReader_ImplBase -{ - private static final int FORM = 0; - private static final int POSTAG = 1; - private static final int IOB = 2; - - private static final String TAB = "\t"; - private static final String EQUAL_SIGN = "="; - private static final String SENTENCE_HEADER = "%% sent no."; - private static final int SENTENCE_HEADER_LEN = SENTENCE_HEADER.length(); - - /** - * Character encoding of the input data. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String sourceEncoding; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spamming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Write part-of-speech information. - * - * Default: {@code true} - */ - public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; - @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") - private boolean posEnabled; - - /** - * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. This can be useful if a custom model is - * specified which does not have such meta data, or it can be used in readers. - */ - public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; - @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) - protected String posTagset; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Read chunk information. - * - * Default: {@code true} - */ - public static final String PARAM_READ_CHUNK = ComponentParameters.PARAM_READ_CHUNK; - @ConfigurationParameter(name = PARAM_READ_CHUNK, mandatory = true, defaultValue = "true") - private boolean chunkEnabled; - - /** - * Read named entity information. - * - * Default: {@code false} - */ - public static final String PARAM_READ_NAMED_ENTITY = ComponentParameters.PARAM_READ_NAMED_ENTITY; - @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "false") - private boolean namedEntityEnabled; - - /** - * Use this chunk tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. This can be useful if a custom model is - * specified which does not have such meta data, or it can be used in readers. - */ - public static final String PARAM_CHUNK_TAG_SET = ComponentParameters.PARAM_CHUNK_TAG_SET; - @ConfigurationParameter(name = PARAM_CHUNK_TAG_SET, mandatory = false) - protected String chunkTagset; - - /** - * Load the chunk tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_CHUNK_MAPPING_LOCATION = ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false) - protected String chunkMappingLocation; - - private MappingProvider posMappingProvider; - private MappingProvider chunkMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - posTagset, getLanguage()); - - chunkMappingProvider = MappingProviderFactory.createChunkMappingProvider(chunkMappingLocation, - chunkTagset, getLanguage()); - } - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - try { - if (posEnabled) { - posMappingProvider.configure(aJCas.getCas()); - } - if (chunkEnabled) { - chunkMappingProvider.configure(aJCas.getCas()); - } - } - catch (AnalysisEngineProcessException e) { - throw new IOException(e); - } - - Resource res = nextFile(); - initCas(aJCas, res); - BufferedReader reader = null; - try { - reader = new BufferedReader(new InputStreamReader(res.getInputStream(), sourceEncoding)); - convert(aJCas, reader); - } - finally { - closeQuietly(reader); - } - } - - private void convert(JCas aJCas, BufferedReader aReader) - throws IOException - { - JCasBuilder doc = new JCasBuilder(aJCas); - - Type chunkType = JCasUtil.getType(aJCas, Chunk.class); - Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue"); - IobDecoder decoder = new IobDecoder(aJCas.getCas(), chunkValue, chunkMappingProvider); - decoder.setInternTags(internTags); - - List words; - while ((words = readSentence(aReader)) != null) { - if (words.isEmpty()) { - continue; - } - - int sentenceBegin = doc.getPosition(); - int sentenceEnd = sentenceBegin; - - List tokens = new ArrayList(); - String[] chunkTags = new String[words.size()]; - - // Tokens, POS - int i = 0; - for (String[] word : words) { - // Read token - Token token = doc.add(word[FORM], Token.class); - sentenceEnd = token.getEnd(); - doc.add(" "); - - if (posEnabled) { - Type posTag = posMappingProvider.getTagType(word[POSTAG]); - POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), - token.getEnd()); - pos.setPosValue(word[POSTAG]); - pos.addToIndexes(); - token.setPos(pos); - } - - tokens.add(token); - - // Chunk tag may be simple (B-PX, I-PX) or compound, like B-NX=ORG or I-NX=PER for named entities - // Currently, the reader uses only the chunk part. In the future, it might also use the - // name entity information. - String[] chunkTag = word[IOB].split(EQUAL_SIGN); - chunkTags[i] = chunkTag[0]; - i++; - } - - if (chunkEnabled) { - decoder.decode(tokens, chunkTags); - } - - // Sentence - Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); - sentence.addToIndexes(); - - // Once sentence per line. - doc.add("\n"); - } - - doc.close(); - } - - /** - * Read a single sentence. - */ - private static List readSentence(BufferedReader aReader) - throws IOException - { - List words = new ArrayList(); - String line; - while ((line = aReader.readLine()) != null) { - if (StringUtils.isBlank(line)) { - break; // End of sentence - } - if (StringUtils.left(line, SENTENCE_HEADER_LEN).equals(SENTENCE_HEADER)) { - break; // Ignore sentence header line - } - String[] fields = line.split(TAB); - if (fields.length != 3) { - throw new IOException( - "Invalid file format. Line needs to have 3 tab-separated fields."); - } - words.add(fields); - } - - if (line == null && words.isEmpty()) { - return null; - } - else { - return words; - } - } -} diff --git a/dkpro-core-io-tuebadz-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuebadz/package-info.java b/dkpro-core-io-tuebadz-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuebadz/package-info.java deleted file mode 100644 index d7a7761df9..0000000000 --- a/dkpro-core-io-tuebadz-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuebadz/package-info.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Provides classes for the conversion of specific TüBA-D/Z file formats. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tuebadz; diff --git a/dkpro-core-io-tuebadz-asl/src/main/java/org/dkpro/core/io/tuebadz/TuebaDZReader.java b/dkpro-core-io-tuebadz-asl/src/main/java/org/dkpro/core/io/tuebadz/TuebaDZReader.java new file mode 100644 index 0000000000..4d3bd17218 --- /dev/null +++ b/dkpro-core-io-tuebadz-asl/src/main/java/org/dkpro/core/io/tuebadz/TuebaDZReader.java @@ -0,0 +1,293 @@ +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.tuebadz; + +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createChunkMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.IobDecoder; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.MappingProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Reads the Tüba-D/Z chunking format. + * + * @see TüBA-D/Z + * Web page + */ +@ResourceMetaData(name = "Tüba-D/Z Chunking Format Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({ MimeTypes.APPLICATION_X_TUEBADZ_CHUNK }) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk" }) +public class TuebaDZReader + extends JCasResourceCollectionReader_ImplBase +{ + private static final int FORM = 0; + private static final int POSTAG = 1; + private static final int IOB = 2; + + private static final String TAB = "\t"; + private static final String EQUAL_SIGN = "="; + private static final String SENTENCE_HEADER = "%% sent no."; + private static final int SENTENCE_HEADER_LEN = SENTENCE_HEADER.length(); + + /** + * Character encoding of the input data. + */ + public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String sourceEncoding; + + /** + * Write part-of-speech information. + */ + public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; + @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") + private boolean posEnabled; + + /** + * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the + * tag set defined as part of the model meta data. This can be useful if a custom model is + * specified which does not have such meta data, or it can be used in readers. + */ + public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; + @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) + protected String posTagset; + + /** + * Read chunk information. + */ + public static final String PARAM_READ_CHUNK = ComponentParameters.PARAM_READ_CHUNK; + @ConfigurationParameter(name = PARAM_READ_CHUNK, mandatory = true, defaultValue = "true") + private boolean chunkEnabled; + + /** + * Read named entity information. + */ + public static final String PARAM_READ_NAMED_ENTITY = + ComponentParameters.PARAM_READ_NAMED_ENTITY; + @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "false") + private boolean namedEntityEnabled; + + /** + * Use this chunk tag set to use to resolve the tag set mapping instead of using the tag set + * defined as part of the model meta data. This can be useful if a custom model is specified + * which does not have such meta data, or it can be used in readers. + */ + public static final String PARAM_CHUNK_TAG_SET = ComponentParameters.PARAM_CHUNK_TAG_SET; + @ConfigurationParameter(name = PARAM_CHUNK_TAG_SET, mandatory = false) + protected String chunkTagset; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating the + * mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Load the chunk tag to UIMA type mapping from this location instead of locating the mapping + * automatically. + */ + public static final String PARAM_CHUNK_MAPPING_LOCATION = + ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false) + protected String chunkMappingLocation; + + private MappingProvider posMappingProvider; + private MappingProvider chunkMappingProvider; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, + getLanguage()); + + chunkMappingProvider = createChunkMappingProvider(this, chunkMappingLocation, chunkTagset, + getLanguage()); + } + + @Override + public void getNext(JCas aJCas) throws IOException, CollectionException + { + try { + if (posEnabled) { + posMappingProvider.configure(aJCas.getCas()); + } + if (chunkEnabled) { + chunkMappingProvider.configure(aJCas.getCas()); + } + } + catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } + + Resource res = nextFile(); + initCas(aJCas, res); + BufferedReader reader = null; + try { + reader = new BufferedReader( + new InputStreamReader(res.getInputStream(), sourceEncoding)); + convert(aJCas, reader); + } + finally { + closeQuietly(reader); + } + } + + private void convert(JCas aJCas, BufferedReader aReader) throws IOException + { + JCasBuilder doc = new JCasBuilder(aJCas); + + Type chunkType = JCasUtil.getType(aJCas, Chunk.class); + Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue"); + IobDecoder decoder = new IobDecoder(aJCas.getCas(), chunkValue, chunkMappingProvider); + + List words; + while ((words = readSentence(aReader)) != null) { + if (words.isEmpty()) { + continue; + } + + int sentenceBegin = doc.getPosition(); + int sentenceEnd = sentenceBegin; + + List tokens = new ArrayList(); + String[] chunkTags = new String[words.size()]; + + // Tokens, POS + int i = 0; + for (String[] word : words) { + // Read token + Token token = doc.add(word[FORM], Token.class); + sentenceEnd = token.getEnd(); + doc.add(" "); + + if (posEnabled) { + Type posTag = posMappingProvider.getTagType(word[POSTAG]); + POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), + token.getEnd()); + pos.setPosValue(word[POSTAG]); + pos.addToIndexes(); + token.setPos(pos); + } + + tokens.add(token); + + // Chunk tag may be simple (B-PX, I-PX) or compound, like B-NX=ORG or I-NX=PER for + // named entities + // Currently, the reader uses only the chunk part. In the future, it might also use + // the + // name entity information. + String[] chunkTag = word[IOB].split(EQUAL_SIGN); + chunkTags[i] = chunkTag[0]; + i++; + } + + if (chunkEnabled) { + decoder.decode(tokens, chunkTags); + } + + // Sentence + Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); + sentence.addToIndexes(); + + // Once sentence per line. + doc.add("\n"); + } + + doc.close(); + } + + /** + * Read a single sentence. + */ + private static List readSentence(BufferedReader aReader) throws IOException + { + List words = new ArrayList(); + String line; + while ((line = aReader.readLine()) != null) { + if (StringUtils.isBlank(line)) { + break; // End of sentence + } + if (StringUtils.left(line, SENTENCE_HEADER_LEN).equals(SENTENCE_HEADER)) { + break; // Ignore sentence header line + } + String[] fields = line.split(TAB); + if (fields.length != 3) { + throw new IOException( + "Invalid file format. Line needs to have 3 tab-separated fields."); + } + words.add(fields); + } + + if (line == null && words.isEmpty()) { + return null; + } + else { + return words; + } + } +} diff --git a/dkpro-core-io-tuebadz-asl/src/main/java/org/dkpro/core/io/tuebadz/package-info.java b/dkpro-core-io-tuebadz-asl/src/main/java/org/dkpro/core/io/tuebadz/package-info.java new file mode 100644 index 0000000000..795aad83fc --- /dev/null +++ b/dkpro-core-io-tuebadz-asl/src/main/java/org/dkpro/core/io/tuebadz/package-info.java @@ -0,0 +1,21 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Provides classes for the conversion of specific TüBA-D/Z file formats. + */ +package org.dkpro.core.io.tuebadz; diff --git a/dkpro-core-io-tuepp-asl/pom.xml b/dkpro-core-io-tuepp-asl/pom.xml index ea33a61749..d9329d5c80 100644 --- a/dkpro-core-io-tuepp-asl/pom.xml +++ b/dkpro-core-io-tuepp-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.tuepp-asl + dkpro-core-io-tuepp-asl jar DKPro Core ASL - IO - TüPP-D/Z + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -40,28 +41,63 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + javax.xml.bind + jaxb-api - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + com.sun.xml.bind + jaxb-core - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + com.sun.xml.bind + jaxb-impl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + javax.activation + javax.activation-api + + + org.dkpro.core + dkpro-core-api-resources-asl + + + org.dkpro.core + dkpro-core-api-metadata-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - \ No newline at end of file + + + + org.apache.maven.plugins + maven-dependency-plugin + + true + + + com.sun.xml.bind:jaxb-core + com.sun.xml.bind:jaxb-impl + javax.activation:javax.activation-api + + + + + + diff --git a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/package-info.java b/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/package-info.java deleted file mode 100644 index 1b0b14a9cb..0000000000 --- a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for the Tübingen Partially Parsed Corpus of Written German (TüPP-D/Z) XML format. - */ -package de.tudarmstadt.ukp.dkpro.core.io.tuepp; \ No newline at end of file diff --git a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/TueppReader.java b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/TueppReader.java similarity index 88% rename from dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/TueppReader.java rename to dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/TueppReader.java index d93be14a14..58dac1ee52 100644 --- a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/TueppReader.java +++ b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/TueppReader.java @@ -15,7 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tuepp; +package org.dkpro.core.io.tuepp; + +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.IOException; import java.io.InputStream; @@ -41,22 +43,21 @@ import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.io.tuepp.internal.model.TueppBaseform; +import org.dkpro.core.io.tuepp.internal.model.TueppPos; +import org.dkpro.core.io.tuepp.internal.model.TueppToken; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model.TueppBaseform; -import de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model.TueppPos; -import de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model.TueppToken; /** * UIMA collection reader for Tübingen Partially Parsed Corpus of Written German (TüPP-D/Z) XML @@ -171,10 +172,19 @@ public class TueppReader */ private static final String TAG_TOKEN = "t"; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String mappingPosLocation; @@ -191,7 +201,8 @@ public class TueppReader * Character encoding of the input data. */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String sourceEncoding; private MappingProvider posMappingProvider; @@ -212,8 +223,8 @@ public void initialize(UimaContext aContext) { super.initialize(aContext); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation, - posTagset, getLanguage()); + posMappingProvider = createPosMappingProvider(this, mappingPosLocation, posTagset, + getLanguage()); // Set up XML deserialization try { @@ -268,11 +279,13 @@ private void step() throws IOException while (true) { try { if (res == null) { - // Call to super here because we want to know about the resources, not the articles + // Call to super here because we want to know about the resources, not the + // articles if (getResourceIterator().hasNext()) { // There are still resources left to read res = nextFile(); - is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()); + is = CompressionUtils.getInputStream(res.getLocation(), + res.getInputStream()); xmlEventReader = xmlInputFactory.createXMLEventReader(is, sourceEncoding); } else { @@ -352,13 +365,13 @@ else if (isEndElement(e, TAG_ART)) { catch (XMLStreamException ex1) { throw new IOException(ex1); } - catch(JAXBException ex2){ + catch (JAXBException ex2) { throw new IOException(ex2); } - catch(AnalysisEngineProcessException ex3){ + catch (AnalysisEngineProcessException ex3) { throw new IOException(ex3); } - + // Seek next article so we know what to return on hasNext() step(); } @@ -373,7 +386,7 @@ protected void readToken(JCasBuilder aBuilder, TueppToken aToken) Type posType = posMappingProvider.getTagType(pos.tag); POS posAnno = (POS) aBuilder.getJCas().getCas() .createAnnotation(posType, token.getBegin(), token.getEnd()); - posAnno.setPosValue(pos.tag.intern()); + posAnno.setPosValue(pos.tag != null ? pos.tag.intern() : null); POSUtils.assignCoarseValue(posAnno); posAnno.addToIndexes(); token.setPos(posAnno); diff --git a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppBaseform.java b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppBaseform.java similarity index 89% rename from dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppBaseform.java rename to dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppBaseform.java index e5b9ae5efe..fcf6543e4c 100644 --- a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppBaseform.java +++ b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppBaseform.java @@ -15,12 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model; +package org.dkpro.core.io.tuepp.internal.model; import javax.xml.bind.annotation.XmlAttribute; public class TueppBaseform { - @XmlAttribute(name="f") + @XmlAttribute(name = "f") public String form; -} \ No newline at end of file +} diff --git a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppPos.java b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppPos.java similarity index 89% rename from dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppPos.java rename to dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppPos.java index 8dc13f6754..60f5df2452 100644 --- a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppPos.java +++ b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppPos.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model; +package org.dkpro.core.io.tuepp.internal.model; import java.util.ArrayList; import java.util.List; @@ -25,18 +25,18 @@ public class TueppPos { - @XmlAttribute(name="t") + @XmlAttribute(name = "t") public String tag; - @XmlAttribute(name="r") + @XmlAttribute(name = "r") public int rank; - @XmlAttribute(name="c") + @XmlAttribute(name = "c") public float certainty; - + @XmlElement(name = "b") public List baseforms = new ArrayList(); - + /** * A part-of-speech tag can have multiple baseforms. This method returns the first baseform. */ @@ -49,4 +49,4 @@ public TueppBaseform getPrimaryBaseForm() return null; } } -} \ No newline at end of file +} diff --git a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppToken.java b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppToken.java similarity index 90% rename from dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppToken.java rename to dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppToken.java index 1633ee5f27..25f12356c1 100644 --- a/dkpro-core-io-tuepp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/tuepp/internal/model/TueppToken.java +++ b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/internal/model/TueppToken.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model; +package org.dkpro.core.io.tuepp.internal.model; import java.util.ArrayList; import java.util.List; @@ -25,22 +25,22 @@ public class TueppToken { - @XmlAttribute(name="f") + @XmlAttribute(name = "f") public String form; - + @XmlElement(name = "P") public List posTags = new ArrayList(); - + public TueppPos getPrimaryTag() { TueppPos tag = null; - + for (TueppPos t : posTags) { if (tag == null || tag.rank > t.rank) { tag = t; } } - + return tag; } -} \ No newline at end of file +} diff --git a/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/package-info.java b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/package-info.java new file mode 100644 index 0000000000..2cd2034fa9 --- /dev/null +++ b/dkpro-core-io-tuepp-asl/src/main/java/org/dkpro/core/io/tuepp/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for the Tübingen Partially Parsed Corpus of Written German (TüPP-D/Z) XML format. + */ +package org.dkpro.core.io.tuepp; diff --git a/dkpro-core-io-web1t-asl/pom.xml b/dkpro-core-io-web1t-asl/pom.xml index e0c4ec9867..065edeaec9 100644 --- a/dkpro-core-io-web1t-asl/pom.xml +++ b/dkpro-core-io-web1t-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.web1t-asl + dkpro-core-io-web1t-asl jar DKPro Core ASL - IO - Web1T n-grams + https://dkpro.github.io/dkpro-core/ commons-logging @@ -48,28 +49,32 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.frequency-asl + org.dkpro.core + dkpro-core-api-frequency-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.ngrams-asl + org.dkpro.core + dkpro-core-ngrams-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl + org.dkpro.core + dkpro-core-api-featurepath-asl com.googlecode.jweb1t com.googlecode.jweb1t - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit @@ -77,38 +82,38 @@ test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.frequency-asl + org.dkpro.core + dkpro-core-frequency-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.tei-asl + org.dkpro.core + dkpro-core-io-tei-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl + org.dkpro.core + dkpro-core-opennlp-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl + org.dkpro.core + dkpro-core-io-text-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.tokit-asl + org.dkpro.core + dkpro-core-tokit-asl test @@ -116,9 +121,14 @@ de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent test + + org.dkpro.core + dkpro-core-clearnlp-asl + test + de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.clearnlp-asl + de.tudarmstadt.ukp.dkpro.core.clearnlp-model-lemma-en-default test @@ -130,16 +140,16 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.clearnlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-clearnlp-asl + 2.3.0-SNAPSHOT pom import - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.opennlp-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-opennlp-asl + 2.3.0-SNAPSHOT pom import @@ -156,6 +166,7 @@ de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.clearnlp-model-dictionary-en-default + de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.clearnlp-model-lemma-en-default @@ -183,17 +194,8 @@ check - - - release.properties - CHANGES - NOTICE - README - src/main/resources/**/* - src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/ExternalSort.java - src/test/resources/**/* - - src/main/java/**/type/**/* + + src/main/java/org/dkpro/core/io/web1t/util/ExternalSort.java diff --git a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TWriter.java b/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TWriter.java deleted file mode 100644 index 20f93707d0..0000000000 --- a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TWriter.java +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t; - -import java.io.IOException; -import java.util.Set; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.io.web1t.util.Web1TConverter; - -/** - * Web1T n-gram index format writer. - */ -@ResourceMetaData(name="Web1T N-Gram Index Writer") -@MimeTypeCapability({MimeTypes.TEXT_X_NGRAM}) -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"}) -public class Web1TWriter - extends JCasAnnotator_ImplBase -{ - /** - * Types to generate n-grams from. - * - * Example: {@code Token.class.getName() + "/pos/PosValue"} for part-of-speech n-grams - */ - public static final String PARAM_INPUT_TYPES = "inputTypes"; - @ConfigurationParameter(name = PARAM_INPUT_TYPES, mandatory = true) - private Set inputPaths; - - /** - * Location to which the output is written. - */ - public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; - @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) - private String outputPath; - - /** - * Character encoding of the output data. - */ - public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = false, defaultValue = "UTF-8") - private String outputEncoding; - - /** - * Minimum n-gram length. - * - * Default: {@code 1} - */ - public static final String PARAM_MIN_NGRAM_LENGTH = "minNgramLength"; - @ConfigurationParameter(name = PARAM_MIN_NGRAM_LENGTH, mandatory = false, defaultValue = "1") - private int minNgramLength; - - /** - * Maximum n-gram length. - * - * Default: {@code 3} - */ - public static final String PARAM_MAX_NGRAM_LENGTH = "maxNgramLength"; - @ConfigurationParameter(name = PARAM_MAX_NGRAM_LENGTH, mandatory = false, defaultValue = "3") - private int maxNgramLength; - - /** - * Create a lower case index. - */ - public static final String PARAM_LOWERCASE = "lowercase"; - @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = false, defaultValue = "false") - private boolean lowercase; - - /** - * Create the indexes that jWeb1T needs to operate. (default: true) - */ - public static final String PARAM_CREATE_INDEXES = "createIndexes"; - @ConfigurationParameter(name = PARAM_CREATE_INDEXES, mandatory = false, defaultValue = "true") - private boolean createIndexes; - - /** - * Specifies the minimum frequency a NGram must have to be written to the - * final index. The specified value is interpreted as inclusive value, the - * default is 1. Thus, all NGrams with a frequency of at least 1 or higher - * will be written. - */ - public static final String PARAM_MIN_FREQUENCY = "minFreq"; - @ConfigurationParameter(name = PARAM_MIN_FREQUENCY, mandatory = false, defaultValue = "1") - private int minFreq; - - /** - * The input file(s) is/are split into smaller files for quick access. An - * own file is created if the first two starting letters (or the starting - * letter if the word has a length of 1 character) account for at least x% - * of all starting letters in the input file(s). The default value for - * splitting a file is 1.0%. Every word that has starting characters which - * does not suffice the threshold is written with other words that also did - * not meet the threshold into an own file for miscellaneous words. A high - * threshold will lead to only a few, but large files and a most likely very - * large misc. file. A low threshold results in many small files. Use a zero or a negative - * value to write everything to one file. - */ - public static final String PARAM_SPLIT_TRESHOLD = "splitFileTreshold"; - @ConfigurationParameter(name = PARAM_SPLIT_TRESHOLD, mandatory = false, defaultValue = "1.0") - private float splitThreshold; - - /** - * The type being used for segments - */ - public static final String PARAM_CONTEXT_TYPE = "contextType"; - @ConfigurationParameter(name = PARAM_CONTEXT_TYPE, mandatory = true, defaultValue="de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") - protected String contextType; - - - private Web1TConverter converter; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - try { - this.converter = new Web1TConverter(outputPath, minNgramLength, maxNgramLength); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - - converter.setWriteIndexes(createIndexes); - converter.setSplitThreshold(splitThreshold); - converter.setMinFrequency(minFreq); - converter.setToLowercase(lowercase); - converter.setOutputEncoding(outputEncoding); - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - - try { - converter.add(jcas, inputPaths, jcas.getCas().getTypeSystem().getType(contextType)); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - - /** - * The input files for each ngram level is read, splitted according to the - * frequency of the words starting letter in the files and the split files - * are individually sorted and consolidated. - */ - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - super.collectionProcessComplete(); - - try { - converter.createIndex(); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } -} \ No newline at end of file diff --git a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/package-info.java b/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/package-info.java deleted file mode 100644 index 9842726bfd..0000000000 --- a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for Google's Web1T n-gram format. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t; diff --git a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/ExternalSort.java b/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/ExternalSort.java deleted file mode 100644 index 794a7527b9..0000000000 --- a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/ExternalSort.java +++ /dev/null @@ -1,279 +0,0 @@ -package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.EOFException; -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.PriorityQueue; - -/** -* Goal: offer a generic external-memory sorting program in Java. -* -* It must be : -* - hackable (easy to adapt) -* - scalable to large files -* - sensibly efficient. -* -* This software is in the public domain. -* -* Usage: -* java com/google/code/externalsorting/ExternalSort somefile.txt out.txt -* -* You can change the default maximal number of temporary files with the -t flag: -* java com/google/code/externalsorting/ExternalSort somefile.txt out.txt -t 3 -* -* For very large files, you might want to use an appropriate flag to allocate -* more memory to the Java VM: -* java -Xms2G com/google/code/externalsorting/ExternalSort somefile.txt out.txt -* -* By (in alphabetical order) -* Philippe Beaudoin, Jon Elsas, Christan Grant, Daniel Haran, Daniel Lemire, -* April 2010 -* originally posted at -* http://www.daniel-lemire.com/blog/archives/2010/04/01/external-memory-sorting-in-java/ -*/ -public class ExternalSort { - - static int DEFAULTMAXTEMPFILES = 1024; - - // we divide the file into small blocks. If the blocks - // are too small, we shall create too many temporary files. - // If they are too big, we shall be using too much memory. - public static long estimateBestSizeOfBlocks(File filetobesorted, int maxtmpfiles) { - long sizeoffile = filetobesorted.length() * 2; - /** - * We multiply by two because later on someone insisted on counting the memory - * usage as 2 bytes per character. By this model, loading a file with 1 character - * will use 2 bytes. - */ - // we don't want to open up much more than maxtmpfiles temporary files, better run - // out of memory first. - long blocksize = sizeoffile / maxtmpfiles + (sizeoffile % maxtmpfiles == 0 ? 0 : 1) ; - - // on the other hand, we don't want to create many temporary files - // for naught. If blocksize is smaller than half the free memory, grow it. - long freemem = Runtime.getRuntime().freeMemory(); - if( blocksize < freemem/2) { - blocksize = freemem/2; - } - return blocksize; - } - - /** - * This will simply load the file by blocks of x rows, then sort them in-memory, and write the - * result to temporary files that have to be merged later. - * - * @param file - * some flat file - * @param cmp - * string comparator - * @return a list of temporary flat files - * @throws IOException - * if an I/O problem occurs. - */ - public static List sortInBatch(File file, Comparator cmp) throws IOException { return sortInBatch(file, cmp,DEFAULTMAXTEMPFILES); } - - - /** - * This will simply load the file by blocks of x rows, then sort them in-memory, and write the - * result to temporary files that have to be merged later. You can specify a bound on the number - * of temporary files that will be created. - * - * @param file - * some flat file - * @param cmp - * string comparator - * @param maxtmpfiles - * maximum number of temporary files - * @return a list of temporary flat files - * @throws IOException - * if an I/O problem occurs. - */ - public static List sortInBatch(File file, Comparator cmp, int maxtmpfiles) throws IOException { - List files = new ArrayList(); - BufferedReader fbr = new BufferedReader(new FileReader(file)); - long blocksize = estimateBestSizeOfBlocks(file,maxtmpfiles);// in bytes - - try{ - List tmplist = new ArrayList(); - String line = ""; - try { - while(line != null) { - long currentblocksize = 0;// in bytes - while((currentblocksize < blocksize) - &&( (line = fbr.readLine()) != null) ){ // as long as you have enough memory - tmplist.add(line); - currentblocksize += line.length() * 2; // java uses 16 bits per character? - } - files.add(sortAndSave(tmplist,cmp)); - tmplist.clear(); - } - } catch(EOFException oef) { - if(tmplist.size()>0) { - files.add(sortAndSave(tmplist,cmp)); - tmplist.clear(); - } - } - } finally { - fbr.close(); - } - return files; - } - - - public static File sortAndSave(List tmplist, Comparator cmp) throws IOException { - Collections.sort(tmplist,cmp); - File newtmpfile = File.createTempFile("sortInBatch", "flatfile"); - newtmpfile.deleteOnExit(); - BufferedWriter fbw = new BufferedWriter(new FileWriter(newtmpfile)); - try { - for(String r : tmplist) { - fbw.write(r); - fbw.newLine(); - } - } finally { - fbw.close(); - } - return newtmpfile; - } - - /** - * This merges a bunch of temporary flat files - * - * @param files - * the files to merge. - * @param outputfile - * the target file. - * @param cmp - * the comprarator. - * @return The number of lines sorted. (P. Beaudoin) - * @throws IOException - * if an I/O problem occurs. - */ - public static int mergeSortedFiles(List files, File outputfile, final Comparator cmp) throws IOException { - PriorityQueue pq = new PriorityQueue(11, - new Comparator() { - @Override - public int compare(BinaryFileBuffer i, BinaryFileBuffer j) { - return cmp.compare(i.peek(), j.peek()); - } - } - ); - for (File f : files) { - BinaryFileBuffer bfb = new BinaryFileBuffer(f); - pq.add(bfb); - } - BufferedWriter fbw = new BufferedWriter(new FileWriter(outputfile)); - int rowcounter = 0; - try { - while(pq.size()>0) { - BinaryFileBuffer bfb = pq.poll(); - String r = bfb.pop(); - fbw.write(r); - fbw.newLine(); - ++rowcounter; - if(bfb.empty()) { - bfb.fbr.close(); - bfb.originalfile.delete();// we don't need you anymore - } else { - pq.add(bfb); // add it back - } - } - } finally { - fbw.close(); - for(BinaryFileBuffer bfb : pq ) bfb.close(); - } - return rowcounter; - } - - public static void main(String[] args) throws IOException { - - boolean verbose = false; - int maxtmpfiles = DEFAULTMAXTEMPFILES; - String inputfile=null, outputfile=null; - for(int param = 0; paramparam+1) { - param++; - maxtmpfiles = Integer.parseInt(args[param]); - } else { - if(inputfile == null) - inputfile = args[param]; - else if (outputfile == null) - outputfile = args[param]; - else System.out.println("Unparsed: "+args[param]); - } - } - if(outputfile == null) { - System.out.println("please provide input and output file names"); - return; - } - Comparator comparator = new Comparator() { - @Override - public int compare(String r1, String r2){ - return r1.compareTo(r2);}}; - List l = sortInBatch(new File(inputfile), comparator, maxtmpfiles) ; - if(verbose) System.out.println("created "+l.size()+" tmp files"); - mergeSortedFiles(l, new File(outputfile), comparator); - } -} - - -class BinaryFileBuffer { - public static int BUFFERSIZE = 2048; - public BufferedReader fbr; - public File originalfile; - private String cache; - private boolean empty; - - public BinaryFileBuffer(File f) throws IOException { - originalfile = f; - fbr = new BufferedReader(new FileReader(f), BUFFERSIZE); - reload(); - } - - public boolean empty() { - return empty; - } - - private void reload() throws IOException { - try { - if((this.cache = fbr.readLine()) == null){ - empty = true; - cache = null; - } - else{ - empty = false; - } - } catch(EOFException oef) { - empty = true; - cache = null; - } - } - - public void close() throws IOException { - fbr.close(); - } - - - public String peek() { - if(empty()) return null; - return cache.toString(); - } - public String pop() throws IOException { - String answer = peek(); - reload(); - return answer; - } - - - -} \ No newline at end of file diff --git a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileConsolidator.java b/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileConsolidator.java deleted file mode 100644 index e5b2d72611..0000000000 --- a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileConsolidator.java +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.util.Comparator; -import java.util.LinkedList; -import java.util.List; - - -public class Web1TFileConsolidator -{ - - private final List inputFiles; - private final Comparator comparator; - private LinkedList consolidatedFiles = new LinkedList(); - private final String fileEncoding; - private final int minFreq; - - private final String TAB = "\t"; - private final String LF = "\n"; - - public Web1TFileConsolidator(List sortedInputFiles, - Comparator comparator, String fileEncoding, int minFreq) - { - this.inputFiles = sortedInputFiles; - this.comparator = comparator; - this.fileEncoding = fileEncoding; - this.minFreq = minFreq; - } - - public void consolidate() - throws IOException - { - - consolidatedFiles = new LinkedList(); - // new temporary files for storing the sorted and consolidated data - for (File file : inputFiles) { - consolidatedFiles.add(new File(Web1TUtil - .cutOffUnderscoredSuffixFromFileName(file) + "_cons")); - } - - for (int i = 0; i < inputFiles.size(); i++) { - - File file_in = inputFiles.get(i); - File file_out = consolidatedFiles.get(i); - - BufferedReader sortedSplitFileReader = new BufferedReader( - new InputStreamReader(new FileInputStream(file_in), - fileEncoding)); - - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( - new FileOutputStream(file_out), fileEncoding)); - - String prevEntry = null; - String entry = null; - Integer prevEntryFreq = null; - Integer entryFreq = null; - - while ((entry = sortedSplitFileReader.readLine()) != null) { - - int tabPos = entry.indexOf(TAB); - - if (hasLineInvalidFormat(tabPos)) { - System.err.println("Wrong file format in line: " + entry); - continue; - } - - String entryWithoutFreq = extractTextValue(entry, tabPos); - entryFreq = extractFreqValue(entry, tabPos); - - if (isFirstIteration(prevEntry, prevEntryFreq)) { - prevEntry = entryWithoutFreq; - prevEntryFreq = entryFreq; - } - else { - - // Entries are equal, add up frequency - if (arePrevEntryAndCurrentEntryEqual(prevEntry, - entryWithoutFreq, comparator)) { - prevEntryFreq += entryFreq; - } - else { // Entry changed, write aggregated entry - - writeAggregatedEntryToFile(writer, prevEntry, - prevEntryFreq); - - // Prepare next iteration - prevEntry = entryWithoutFreq; - prevEntryFreq = entryFreq; - } - } - - } - writeAggregatedEntryToFile(writer, prevEntry, prevEntryFreq); - writer.close(); - - sortedSplitFileReader.close(); - } - } - - private void writeAggregatedEntryToFile(BufferedWriter writer, - String entry, Integer entryFrequency) - throws IOException - { - - if (entryFrequency < minFreq) { - return; - } - - writer.write(entry + TAB + entryFrequency + LF); - } - - private boolean arePrevEntryAndCurrentEntryEqual(String prevEntry, - String entryWithoutFreq, Comparator comparator) - { - return comparator.compare(prevEntry, entryWithoutFreq) == 0; - } - - private boolean isFirstIteration(String prevEntry, Integer prevEntryFreq) - { - return prevEntry == null || prevEntryFreq == null; - } - - private boolean hasLineInvalidFormat(int tabPos) - { - return (tabPos < 0); - } - - private Integer extractFreqValue(String entry, int tabPos) - { - String freqOfEntryAsString = entry.substring(tabPos + 1); - Integer freqOfEntryAsInt = Integer.parseInt(freqOfEntryAsString); - return freqOfEntryAsInt; - } - - private String extractTextValue(String entry, int tabPos) - { - - return entry.substring(0, tabPos); - } - - public LinkedList getConsolidatedFiles() - { - return new LinkedList(consolidatedFiles); - } - - public void cleanUp() - { - for (File file : consolidatedFiles) { - file.delete(); - } - consolidatedFiles = new LinkedList(); - } -} diff --git a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileSorter.java b/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileSorter.java deleted file mode 100644 index d7678a8096..0000000000 --- a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileSorter.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; - -import java.io.File; -import java.io.IOException; -import java.util.Comparator; -import java.util.LinkedList; -import java.util.List; - - - -public class Web1TFileSorter -{ - - private final List inputFiles; - private List sortedFiles = new LinkedList(); - private final Comparator comparator; - - public Web1TFileSorter(List unsortedFiles, - Comparator comparator) - { - this.inputFiles = unsortedFiles; - this.comparator = comparator; - } - - public void sort() - throws IOException - { - for (File file : inputFiles) { - - List l = ExternalSort.sortInBatch(file, comparator); - - File sortedSplitFile = new File( - Web1TUtil.cutOffUnderscoredSuffixFromFileName(file) - + "_sorted"); - sortedFiles.add(sortedSplitFile); - ExternalSort.mergeSortedFiles(l, sortedSplitFile, comparator); - } - } - - public LinkedList getSortedFiles() - { - return new LinkedList(sortedFiles); - } - - public void cleanUp() - { - for (File file : sortedFiles) { - file.delete(); - } - sortedFiles = new LinkedList(); - } - -} diff --git a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileSplitter.java b/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileSplitter.java deleted file mode 100644 index bfa0510c5d..0000000000 --- a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TFileSplitter.java +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.UnsupportedEncodingException; -import java.io.Writer; -import java.util.Collections; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; - -public class Web1TFileSplitter -{ - private final Log log = LogFactory.getLog(getClass()); - - private final File inputFile; - private final File outputFolder; - private final String fileEncoding; - private final FrequencyDistribution letterFD; - private final double threshold; - private int fileNumber; - - private List splittedFiles = new LinkedList(); - - public Web1TFileSplitter(File aInputFile, File aOutputFolder, - String aFileEncoding, FrequencyDistribution aLetterFD, - double aThreshold, int aStartingFileNumber) - { - inputFile = aInputFile; - outputFolder = aOutputFolder; - fileEncoding = aFileEncoding; - letterFD = aLetterFD; - threshold = aThreshold; - fileNumber = aStartingFileNumber; - } - - public List getFiles() - { - return new LinkedList(splittedFiles); - } - - public void split() - throws IOException - { - Map letterToFileNameMap = mapStartingLettersToFilenames(); - Map fileMap = mapFileNamesToFileHandels(letterToFileNameMap); - Map fileHandleToBufferdWriterMap = mapFileHandelsToWriterHandels(fileMap); - Map writerMap = mapFileNamesToWriterHandels(fileMap, - fileHandleToBufferdWriterMap); - - splittedFiles = generateListOfUniqueFiles(fileMap); - - BufferedReader reader = null; - try { - reader = new BufferedReader(new InputStreamReader( - new FileInputStream(inputFile), fileEncoding)); - - String TAB = "\t"; - String LF = "\n"; - - String readLine = null; - while ((readLine = reader.readLine()) != null) { - - int indexOfTab = readLine.indexOf(TAB); - - if (indexOfTab == -1) { - log.warn("No tab found in line: " + readLine); - continue; - } - - String key = Web1TUtil.getStartingLetters(readLine, indexOfTab); - - Writer writer = writerMap.get(key); - if (writer == null) { - log.warn("No writer found for key: " + key); - key = key.substring(0, 1); - writer = writerMap.get(key); - if (writer == null) { - log.warn("No writer for key: " + key); - continue; - } - } - - writer.write(readLine); - writer.write(LF); - writer.flush(); - } - } - finally { - // Close reader - IOUtils.closeQuietly(reader); - // Close all writers - for (Writer writer : writerMap.values()) { - IOUtils.closeQuietly(writer); - } - } - } - - private Map mapFileHandelsToWriterHandels( - Map fileMap) - throws UnsupportedEncodingException, FileNotFoundException - { - - Map fileHandleToBufferdWriterMap = new HashMap(); - - for (String key : fileMap.keySet()) { - File file = fileMap.get(key); - if (fileHandleToBufferdWriterMap.get(file) == null) { - fileHandleToBufferdWriterMap.put(file, new BufferedWriter( - new OutputStreamWriter(new FileOutputStream(file), fileEncoding))); - } - - } - return fileHandleToBufferdWriterMap; - } - - private Map mapFileNamesToFileHandels( - Map letterToFileNameMap) - { - Map fileMap = new HashMap(); - - for (String key : letterToFileNameMap.keySet()) { - fileMap.put(key, new File(outputFolder + "/" + letterToFileNameMap.get(key) - + "_unsorted")); - } - return fileMap; - } - - public int getNextUnusedFileNumber() - { - return fileNumber; - } - - private Map mapStartingLettersToFilenames() - { - - Map letterToFileNameMap = new HashMap(); - - List keyList = new LinkedList(letterFD.getKeys()); - Collections.sort(keyList); - for (String key : keyList) { - - Long freq = letterFD.getCount(key); - Long total = letterFD.getN(); - - double percentage = (double) freq / total * 100; - if ((threshold > 0.0) && (percentage >= threshold)) { - String filename = String.format("%08d", fileNumber++); - letterToFileNameMap.put(key, filename); - } - else { - letterToFileNameMap.put(key, "99999999"); - } - } - - return letterToFileNameMap; - } - - private Map mapFileNamesToWriterHandels( - Map fileMap, - Map fileHandleToBufferdWriterMap) - throws UnsupportedEncodingException, FileNotFoundException - { - Map nameToWriterMap = new HashMap(); - for (String key : fileMap.keySet()) { - File file = fileMap.get(key); - BufferedWriter writer = fileHandleToBufferdWriterMap.get(file); - nameToWriterMap.put(key, writer); - } - - return nameToWriterMap; - } - - private List generateListOfUniqueFiles(Map fileMap) - { - // Generate unique Filelist - Map uniqeFiles = new HashMap(); - for (File file : fileMap.values()) { - String absPath = file.getAbsolutePath(); - if (uniqeFiles.get(absPath) == null) { - uniqeFiles.put(absPath, ""); - } - } - - LinkedList listOfUniqueFiles = new LinkedList(); - for (String path : uniqeFiles.keySet()) { - listOfUniqueFiles.add(new File(path)); - } - return listOfUniqueFiles; - } - - public void cleanUp() - { - for (File file : splittedFiles) { - file.delete(); - } - splittedFiles = new LinkedList(); - } -} diff --git a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TUtil.java b/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TUtil.java deleted file mode 100644 index c54d362427..0000000000 --- a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TUtil.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; - -import java.io.File; - -public class Web1TUtil -{ - - public static String cutOffUnderscoredSuffixFromFileName(File file) - { - - String path = file.getAbsolutePath(); - - return path.substring(0, path.lastIndexOf("_")); - } - - public static String getStartingLetters(String readLine, int indexOfTab) - { - String line = readLine.substring(0, indexOfTab); - - String key = null; - if (line.length() > 1) { - key = readLine.substring(0, 2); - } - else { - key = readLine.substring(0, 1); - } - key = key.toLowerCase(); - return key; - } -} diff --git a/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/Web1TWriter.java b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/Web1TWriter.java new file mode 100644 index 0000000000..0557e00e16 --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/Web1TWriter.java @@ -0,0 +1,193 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t; + +import java.io.IOException; +import java.util.Set; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.web1t.util.Web1TConverter; + +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; + +/** + * Web1T n-gram index format writer. + */ +@ResourceMetaData(name = "Web1T N-Gram Index Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + Web1TWriter.PARAM_TARGET_LOCATION }) +@MimeTypeCapability({MimeTypes.TEXT_X_NGRAM}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"}) +public class Web1TWriter + extends JCasAnnotator_ImplBase +{ + /** + * Types to generate n-grams from. + * + * Example: {@code Token.class.getName() + "/pos/PosValue"} for part-of-speech n-grams + */ + public static final String PARAM_INPUT_TYPES = "inputTypes"; + @ConfigurationParameter(name = PARAM_INPUT_TYPES, mandatory = true) + private Set inputPaths; + + /** + * Location to which the output is written. + */ + public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; + @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) + private String outputPath; + + /** + * Character encoding of the output data. + */ + public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = false, defaultValue = "UTF-8") + private String outputEncoding; + + /** + * Minimum n-gram length. + */ + public static final String PARAM_MIN_NGRAM_LENGTH = "minNgramLength"; + @ConfigurationParameter(name = PARAM_MIN_NGRAM_LENGTH, mandatory = false, defaultValue = "1") + private int minNgramLength; + + /** + * Maximum n-gram length. + */ + public static final String PARAM_MAX_NGRAM_LENGTH = "maxNgramLength"; + @ConfigurationParameter(name = PARAM_MAX_NGRAM_LENGTH, mandatory = false, defaultValue = "3") + private int maxNgramLength; + + /** + * Create a lower case index. + */ + public static final String PARAM_LOWERCASE = "lowercase"; + @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = false, defaultValue = "false") + private boolean lowercase; + + /** + * Create the indexes that jWeb1T needs to operate. (default: true) + */ + public static final String PARAM_CREATE_INDEXES = "createIndexes"; + @ConfigurationParameter(name = PARAM_CREATE_INDEXES, mandatory = false, defaultValue = "true") + private boolean createIndexes; + + /** + * Specifies the minimum frequency a NGram must have to be written to the + * final index. The specified value is interpreted as inclusive value, the + * default is 1. Thus, all NGrams with a frequency of at least 1 or higher + * will be written. + */ + public static final String PARAM_MIN_FREQUENCY = "minFreq"; + @ConfigurationParameter(name = PARAM_MIN_FREQUENCY, mandatory = false, defaultValue = "1") + private int minFreq; + + /** + * The input file(s) is/are split into smaller files for quick access. An + * own file is created if the first two starting letters (or the starting + * letter if the word has a length of 1 character) account for at least x% + * of all starting letters in the input file(s). The default value for + * splitting a file is 1.0%. Every word that has starting characters which + * does not suffice the threshold is written with other words that also did + * not meet the threshold into an own file for miscellaneous words. A high + * threshold will lead to only a few, but large files and a most likely very + * large misc. file. A low threshold results in many small files. Use a zero or a negative + * value to write everything to one file. + */ + public static final String PARAM_SPLIT_TRESHOLD = "splitFileTreshold"; + @ConfigurationParameter(name = PARAM_SPLIT_TRESHOLD, mandatory = false, defaultValue = "1.0") + private float splitThreshold; + + /** + * The type being used for segments + */ + public static final String PARAM_CONTEXT_TYPE = "contextType"; + @ConfigurationParameter(name = PARAM_CONTEXT_TYPE, mandatory = true, + defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + protected String contextType; + + + private Web1TConverter converter; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + try { + this.converter = new Web1TConverter(outputPath, minNgramLength, maxNgramLength); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + + converter.setWriteIndexes(createIndexes); + converter.setSplitThreshold(splitThreshold); + converter.setMinFrequency(minFreq); + converter.setToLowercase(lowercase); + converter.setOutputEncoding(outputEncoding); + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + + try { + converter.add(jcas, inputPaths, jcas.getCas().getTypeSystem().getType(contextType)); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + /** + * The input files for each ngram level is read, splitted according to the + * frequency of the words starting letter in the files and the split files + * are individually sorted and consolidated. + */ + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + super.collectionProcessComplete(); + + try { + converter.createIndex(); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/package-info.java b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/package-info.java new file mode 100644 index 0000000000..b6ad6fa2d6 --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for Google's Web1T n-gram format. + */ +package org.dkpro.core.io.web1t; diff --git a/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/ExternalSort.java b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/ExternalSort.java new file mode 100644 index 0000000000..7018869914 --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/ExternalSort.java @@ -0,0 +1,310 @@ +package org.dkpro.core.io.web1t.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.EOFException; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.PriorityQueue; + +/** + * Goal: offer a generic external-memory sorting program in Java. + * + * It must be : - hackable (easy to adapt) - scalable to large files - sensibly efficient. + * + * This software is in the public domain. + * + * Usage: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt + * + * You can change the default maximal number of temporary files with the -t flag: java + * com/google/code/externalsorting/ExternalSort somefile.txt out.txt -t 3 + * + * For very large files, you might want to use an appropriate flag to allocate more memory to the + * Java VM: java -Xms2G com/google/code/externalsorting/ExternalSort somefile.txt out.txt + * + * By (in alphabetical order) Philippe Beaudoin, Jon Elsas, Christan Grant, Daniel Haran, Daniel + * Lemire, April 2010 originally posted at + * http://www.daniel-lemire.com/blog/archives/2010/04/01/external-memory-sorting-in-java/ + */ +public class ExternalSort +{ + + static int DEFAULTMAXTEMPFILES = 1024; + + // we divide the file into small blocks. If the blocks + // are too small, we shall create too many temporary files. + // If they are too big, we shall be using too much memory. + public static long estimateBestSizeOfBlocks(File filetobesorted, int maxtmpfiles) + { + long sizeoffile = filetobesorted.length() * 2; + /** + * We multiply by two because later on someone insisted on counting the memory usage as 2 + * bytes per character. By this model, loading a file with 1 character will use 2 bytes. + */ + // we don't want to open up much more than maxtmpfiles temporary files, better run + // out of memory first. + long blocksize = sizeoffile / maxtmpfiles + (sizeoffile % maxtmpfiles == 0 ? 0 : 1); + + // on the other hand, we don't want to create many temporary files + // for naught. If blocksize is smaller than half the free memory, grow it. + long freemem = Runtime.getRuntime().freeMemory(); + if (blocksize < freemem / 2) { + blocksize = freemem / 2; + } + return blocksize; + } + + /** + * This will simply load the file by blocks of x rows, then sort them in-memory, and write the + * result to temporary files that have to be merged later. + * + * @param file + * some flat file + * @param cmp + * string comparator + * @return a list of temporary flat files + * @throws IOException + * if an I/O problem occurs. + */ + public static List sortInBatch(File file, Comparator cmp) throws IOException + { + return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES); + } + + /** + * This will simply load the file by blocks of x rows, then sort them in-memory, and write the + * result to temporary files that have to be merged later. You can specify a bound on the number + * of temporary files that will be created. + * + * @param file + * some flat file + * @param cmp + * string comparator + * @param maxtmpfiles + * maximum number of temporary files + * @return a list of temporary flat files + * @throws IOException + * if an I/O problem occurs. + */ + public static List sortInBatch(File file, Comparator cmp, int maxtmpfiles) + throws IOException + { + List files = new ArrayList(); + BufferedReader fbr = new BufferedReader(new FileReader(file)); + long blocksize = estimateBestSizeOfBlocks(file, maxtmpfiles);// in bytes + + try { + List tmplist = new ArrayList(); + String line = ""; + try { + while (line != null) { + // in bytes + long currentblocksize = 0; + // as long as you have enough memory + while ((currentblocksize < blocksize) && ((line = fbr.readLine()) != null)) { + tmplist.add(line); + // java uses 16 bits per character? + currentblocksize += line.length() * 2; + } + files.add(sortAndSave(tmplist, cmp)); + tmplist.clear(); + } + } + catch (EOFException oef) { + if (tmplist.size() > 0) { + files.add(sortAndSave(tmplist, cmp)); + tmplist.clear(); + } + } + } + finally { + fbr.close(); + } + return files; + } + + public static File sortAndSave(List tmplist, Comparator cmp) throws IOException + { + Collections.sort(tmplist, cmp); + File newtmpfile = File.createTempFile("sortInBatch", "flatfile"); + newtmpfile.deleteOnExit(); + BufferedWriter fbw = new BufferedWriter(new FileWriter(newtmpfile)); + try { + for (String r : tmplist) { + fbw.write(r); + fbw.newLine(); + } + } + finally { + fbw.close(); + } + return newtmpfile; + } + + /** + * This merges a bunch of temporary flat files + * + * @param files + * the files to merge. + * @param outputfile + * the target file. + * @param cmp + * the comprarator. + * @return The number of lines sorted. (P. Beaudoin) + * @throws IOException + * if an I/O problem occurs. + */ + public static int mergeSortedFiles(List files, File outputfile, + final Comparator cmp) + throws IOException + { + PriorityQueue pq = new PriorityQueue(11, + new Comparator() + { + @Override + public int compare(BinaryFileBuffer i, BinaryFileBuffer j) + { + return cmp.compare(i.peek(), j.peek()); + } + }); + for (File f : files) { + BinaryFileBuffer bfb = new BinaryFileBuffer(f); + pq.add(bfb); + } + BufferedWriter fbw = new BufferedWriter(new FileWriter(outputfile)); + int rowcounter = 0; + try { + while (pq.size() > 0) { + BinaryFileBuffer bfb = pq.poll(); + String r = bfb.pop(); + fbw.write(r); + fbw.newLine(); + ++rowcounter; + if (bfb.empty()) { + bfb.fbr.close(); + bfb.originalfile.delete();// we don't need you anymore + } + else { + pq.add(bfb); // add it back + } + } + } + finally { + fbw.close(); + for (BinaryFileBuffer bfb : pq) { + bfb.close(); + } + } + return rowcounter; + } + + public static void main(String[] args) throws IOException + { + + boolean verbose = false; + int maxtmpfiles = DEFAULTMAXTEMPFILES; + String inputfile = null, outputfile = null; + for (int param = 0; param < args.length; ++param) { + if (args[param].equals("-v") || args[param].equals("--verbose")) { + verbose = true; + } + else if ((args[param].equals("-t") || args[param].equals("--maxtmpfiles")) + && args.length > param + 1) { + param++; + maxtmpfiles = Integer.parseInt(args[param]); + } + else { + if (inputfile == null) { + inputfile = args[param]; + } + else if (outputfile == null) { + outputfile = args[param]; + } + else { + System.out.println("Unparsed: " + args[param]); + } + } + } + if (outputfile == null) { + System.out.println("please provide input and output file names"); + return; + } + Comparator comparator = new Comparator() + { + @Override + public int compare(String r1, String r2) + { + return r1.compareTo(r2); + } + }; + List l = sortInBatch(new File(inputfile), comparator, maxtmpfiles); + if (verbose) { + System.out.println("created " + l.size() + " tmp files"); + } + mergeSortedFiles(l, new File(outputfile), comparator); + } + + private static class BinaryFileBuffer + { + public static int BUFFERSIZE = 2048; + public BufferedReader fbr; + public File originalfile; + private String cache; + private boolean empty; + + public BinaryFileBuffer(File f) throws IOException + { + originalfile = f; + fbr = new BufferedReader(new FileReader(f), BUFFERSIZE); + reload(); + } + + public boolean empty() + { + return empty; + } + + private void reload() throws IOException + { + try { + if ((this.cache = fbr.readLine()) == null) { + empty = true; + cache = null; + } + else { + empty = false; + } + } + catch (EOFException oef) { + empty = true; + cache = null; + } + } + + public void close() throws IOException + { + fbr.close(); + } + + public String peek() + { + if (empty()) { + return null; + } + return cache.toString(); + } + + public String pop() throws IOException + { + String answer = peek(); + reload(); + return answer; + } + } +} diff --git a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TConverter.java b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TConverter.java similarity index 92% rename from dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TConverter.java rename to dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TConverter.java index 4d11cf07ec..2bf3ffc98d 100644 --- a/dkpro-core-io-web1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/util/Web1TConverter.java +++ b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TConverter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; +package org.dkpro.core.io.web1t.util; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -42,15 +42,14 @@ import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.featurepath.FeaturePathInfo; +import org.dkpro.core.api.frequency.util.ConditionalFrequencyDistribution; +import org.dkpro.core.api.frequency.util.FrequencyDistribution; +import org.dkpro.core.ngrams.util.NGramStringIterable; import com.googlecode.jweb1t.JWeb1TIndexer; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathInfo; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.ConditionalFrequencyDistribution; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; -import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringIterable; - public class Web1TConverter { @@ -80,15 +79,16 @@ public Web1TConverter(String outputPath) } public Web1TConverter(String outputPath, int aMinNGramLength, int aMaxNGramLength) - throws IOException - { - super(); - this.minNgramLength = aMinNGramLength; - this.maxNgramLength = aMaxNGramLength; - init(outputPath); - } + throws IOException + { + super(); + this.minNgramLength = aMinNGramLength; + this.maxNgramLength = aMaxNGramLength; + init(outputPath); + } - private void init(String aOutputPath) throws IOException{ + private void init(String aOutputPath) throws IOException + { this.outputPath = aOutputPath; ngramWriters = initializeWriters(minNgramLength, maxNgramLength); @@ -102,7 +102,8 @@ private void init(String aOutputPath) throws IOException{ public void add(JCas jcas, Set inputPaths, Type sentenceType) throws IOException { - ConditionalFrequencyDistribution cfd = new ConditionalFrequencyDistribution(); + ConditionalFrequencyDistribution cfd = + new ConditionalFrequencyDistribution(); CAS cas = jcas.getCas(); @@ -125,9 +126,9 @@ public void add(JCas jcas, Set inputPaths, Type sentenceType) throw new IOException(e); } - for (int ngramLength = minNgramLength; ngramLength <= maxNgramLength; ngramLength++) { - cfd.incAll(ngramLength, new NGramStringIterable(tokenStrings, ngramLength, - ngramLength)); + for (int ngramLen = minNgramLength; ngramLen <= maxNgramLength; ngramLen++) { + cfd.incAll(ngramLen, + new NGramStringIterable(tokenStrings, ngramLen, ngramLen)); } } } @@ -410,8 +411,7 @@ private String getStartingLetters(String readLine, int indexOfTab) private Map> initializeLetterFDs(int min, int max) { - - Map> fdistMap = new HashMap>(); + Map> fdistMap = new HashMap<>(); for (int i = min; i <= max; i++) { FrequencyDistribution fdist = new FrequencyDistribution(); @@ -424,7 +424,7 @@ private Map> initializeLetterFDs(int min, private Map initializeWriters(int min, int max) throws IOException { - Map writers = new HashMap(); + Map writers = new HashMap<>(); for (int level = min; level <= max; level++) { File outputFile = new File(outputPath, level + ".txt"); @@ -520,5 +520,4 @@ public void setToLowercase(boolean toLowercase) { this.toLowercase = toLowercase; } - -} \ No newline at end of file +} diff --git a/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileConsolidator.java b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileConsolidator.java new file mode 100644 index 0000000000..2ee06cee5f --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileConsolidator.java @@ -0,0 +1,175 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; + +public class Web1TFileConsolidator +{ + private final List inputFiles; + private final Comparator comparator; + private LinkedList consolidatedFiles = new LinkedList(); + private final String fileEncoding; + private final int minFreq; + + private final String TAB = "\t"; + private final String LF = "\n"; + + public Web1TFileConsolidator(List sortedInputFiles, + Comparator comparator, String fileEncoding, int minFreq) + { + this.inputFiles = sortedInputFiles; + this.comparator = comparator; + this.fileEncoding = fileEncoding; + this.minFreq = minFreq; + } + + public void consolidate() + throws IOException + { + + consolidatedFiles = new LinkedList(); + // new temporary files for storing the sorted and consolidated data + for (File file : inputFiles) { + consolidatedFiles.add(new File(Web1TUtil + .cutOffUnderscoredSuffixFromFileName(file) + "_cons")); + } + + for (int i = 0; i < inputFiles.size(); i++) { + + File file_in = inputFiles.get(i); + File file_out = consolidatedFiles.get(i); + + BufferedReader sortedSplitFileReader = new BufferedReader( + new InputStreamReader(new FileInputStream(file_in), + fileEncoding)); + + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(file_out), fileEncoding)); + + String prevEntry = null; + String entry = null; + Integer prevEntryFreq = null; + Integer entryFreq = null; + + while ((entry = sortedSplitFileReader.readLine()) != null) { + + int tabPos = entry.indexOf(TAB); + + if (hasLineInvalidFormat(tabPos)) { + System.err.println("Wrong file format in line: " + entry); + continue; + } + + String entryWithoutFreq = extractTextValue(entry, tabPos); + entryFreq = extractFreqValue(entry, tabPos); + + if (isFirstIteration(prevEntry, prevEntryFreq)) { + prevEntry = entryWithoutFreq; + prevEntryFreq = entryFreq; + } + else { + + // Entries are equal, add up frequency + if (arePrevEntryAndCurrentEntryEqual(prevEntry, + entryWithoutFreq, comparator)) { + prevEntryFreq += entryFreq; + } + else { // Entry changed, write aggregated entry + + writeAggregatedEntryToFile(writer, prevEntry, + prevEntryFreq); + + // Prepare next iteration + prevEntry = entryWithoutFreq; + prevEntryFreq = entryFreq; + } + } + + } + writeAggregatedEntryToFile(writer, prevEntry, prevEntryFreq); + writer.close(); + + sortedSplitFileReader.close(); + } + } + + private void writeAggregatedEntryToFile(BufferedWriter writer, + String entry, Integer entryFrequency) + throws IOException + { + + if (entryFrequency < minFreq) { + return; + } + + writer.write(entry + TAB + entryFrequency + LF); + } + + private boolean arePrevEntryAndCurrentEntryEqual(String prevEntry, + String entryWithoutFreq, Comparator comparator) + { + return comparator.compare(prevEntry, entryWithoutFreq) == 0; + } + + private boolean isFirstIteration(String prevEntry, Integer prevEntryFreq) + { + return prevEntry == null || prevEntryFreq == null; + } + + private boolean hasLineInvalidFormat(int tabPos) + { + return (tabPos < 0); + } + + private Integer extractFreqValue(String entry, int tabPos) + { + String freqOfEntryAsString = entry.substring(tabPos + 1); + Integer freqOfEntryAsInt = Integer.parseInt(freqOfEntryAsString); + return freqOfEntryAsInt; + } + + private String extractTextValue(String entry, int tabPos) + { + + return entry.substring(0, tabPos); + } + + public LinkedList getConsolidatedFiles() + { + return new LinkedList(consolidatedFiles); + } + + public void cleanUp() + { + for (File file : consolidatedFiles) { + file.delete(); + } + consolidatedFiles = new LinkedList(); + } +} diff --git a/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileSorter.java b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileSorter.java new file mode 100644 index 0000000000..4e0c0c7d74 --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileSorter.java @@ -0,0 +1,66 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t.util; + +import java.io.File; +import java.io.IOException; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; + +public class Web1TFileSorter +{ + private final List inputFiles; + private List sortedFiles = new LinkedList(); + private final Comparator comparator; + + public Web1TFileSorter(List unsortedFiles, + Comparator comparator) + { + this.inputFiles = unsortedFiles; + this.comparator = comparator; + } + + public void sort() + throws IOException + { + for (File file : inputFiles) { + + List l = ExternalSort.sortInBatch(file, comparator); + + File sortedSplitFile = new File( + Web1TUtil.cutOffUnderscoredSuffixFromFileName(file) + + "_sorted"); + sortedFiles.add(sortedSplitFile); + ExternalSort.mergeSortedFiles(l, sortedSplitFile, comparator); + } + } + + public LinkedList getSortedFiles() + { + return new LinkedList(sortedFiles); + } + + public void cleanUp() + { + for (File file : sortedFiles) { + file.delete(); + } + sortedFiles = new LinkedList(); + } +} diff --git a/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileSplitter.java b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileSplitter.java new file mode 100644 index 0000000000..2ef0dd8d7e --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TFileSplitter.java @@ -0,0 +1,229 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.io.Writer; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dkpro.core.api.frequency.util.FrequencyDistribution; + +public class Web1TFileSplitter +{ + private final Log log = LogFactory.getLog(getClass()); + + private final File inputFile; + private final File outputFolder; + private final String fileEncoding; + private final FrequencyDistribution letterFD; + private final double threshold; + private int fileNumber; + + private List splittedFiles = new LinkedList(); + + public Web1TFileSplitter(File aInputFile, File aOutputFolder, + String aFileEncoding, FrequencyDistribution aLetterFD, + double aThreshold, int aStartingFileNumber) + { + inputFile = aInputFile; + outputFolder = aOutputFolder; + fileEncoding = aFileEncoding; + letterFD = aLetterFD; + threshold = aThreshold; + fileNumber = aStartingFileNumber; + } + + public List getFiles() + { + return new LinkedList(splittedFiles); + } + + public void split() + throws IOException + { + Map letterToFileNameMap = mapStartingLettersToFilenames(); + Map fileMap = mapFileNamesToFileHandels(letterToFileNameMap); + Map fileHandleToBufferdWriterMap = + mapFileHandelsToWriterHandels(fileMap); + Map writerMap = mapFileNamesToWriterHandels(fileMap, + fileHandleToBufferdWriterMap); + + splittedFiles = generateListOfUniqueFiles(fileMap); + + BufferedReader reader = null; + try { + reader = new BufferedReader(new InputStreamReader( + new FileInputStream(inputFile), fileEncoding)); + + String TAB = "\t"; + String LF = "\n"; + + String readLine = null; + while ((readLine = reader.readLine()) != null) { + + int indexOfTab = readLine.indexOf(TAB); + + if (indexOfTab == -1) { + log.warn("No tab found in line: " + readLine); + continue; + } + + String key = Web1TUtil.getStartingLetters(readLine, indexOfTab); + + Writer writer = writerMap.get(key); + if (writer == null) { + log.warn("No writer found for key: " + key); + key = key.substring(0, 1); + writer = writerMap.get(key); + if (writer == null) { + log.warn("No writer for key: " + key); + continue; + } + } + + writer.write(readLine); + writer.write(LF); + writer.flush(); + } + } + finally { + // Close reader + IOUtils.closeQuietly(reader); + // Close all writers + for (Writer writer : writerMap.values()) { + IOUtils.closeQuietly(writer); + } + } + } + + private Map mapFileHandelsToWriterHandels( + Map fileMap) + throws UnsupportedEncodingException, FileNotFoundException + { + Map fileHandleToBufferdWriterMap = new HashMap<>(); + + for (String key : fileMap.keySet()) { + File file = fileMap.get(key); + if (fileHandleToBufferdWriterMap.get(file) == null) { + fileHandleToBufferdWriterMap.put(file, new BufferedWriter( + new OutputStreamWriter(new FileOutputStream(file), fileEncoding))); + } + + } + return fileHandleToBufferdWriterMap; + } + + private Map mapFileNamesToFileHandels( + Map letterToFileNameMap) + { + Map fileMap = new HashMap(); + + for (String key : letterToFileNameMap.keySet()) { + fileMap.put(key, new File(outputFolder + "/" + letterToFileNameMap.get(key) + + "_unsorted")); + } + return fileMap; + } + + public int getNextUnusedFileNumber() + { + return fileNumber; + } + + private Map mapStartingLettersToFilenames() + { + + Map letterToFileNameMap = new HashMap(); + + List keyList = new LinkedList(letterFD.getKeys()); + Collections.sort(keyList); + for (String key : keyList) { + + Long freq = letterFD.getCount(key); + Long total = letterFD.getN(); + + double percentage = (double) freq / total * 100; + if ((threshold > 0.0) && (percentage >= threshold)) { + String filename = String.format("%08d", fileNumber++); + letterToFileNameMap.put(key, filename); + } + else { + letterToFileNameMap.put(key, "99999999"); + } + } + + return letterToFileNameMap; + } + + private Map mapFileNamesToWriterHandels( + Map fileMap, + Map fileHandleToBufferdWriterMap) + throws UnsupportedEncodingException, FileNotFoundException + { + Map nameToWriterMap = new HashMap(); + for (String key : fileMap.keySet()) { + File file = fileMap.get(key); + BufferedWriter writer = fileHandleToBufferdWriterMap.get(file); + nameToWriterMap.put(key, writer); + } + + return nameToWriterMap; + } + + private List generateListOfUniqueFiles(Map fileMap) + { + // Generate unique Filelist + Map uniqeFiles = new HashMap(); + for (File file : fileMap.values()) { + String absPath = file.getAbsolutePath(); + if (uniqeFiles.get(absPath) == null) { + uniqeFiles.put(absPath, ""); + } + } + + LinkedList listOfUniqueFiles = new LinkedList(); + for (String path : uniqeFiles.keySet()) { + listOfUniqueFiles.add(new File(path)); + } + return listOfUniqueFiles; + } + + public void cleanUp() + { + for (File file : splittedFiles) { + file.delete(); + } + splittedFiles = new LinkedList(); + } +} diff --git a/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TUtil.java b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TUtil.java new file mode 100644 index 0000000000..8721d81bbf --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/main/java/org/dkpro/core/io/web1t/util/Web1TUtil.java @@ -0,0 +1,45 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t.util; + +import java.io.File; + +public class Web1TUtil +{ + public static String cutOffUnderscoredSuffixFromFileName(File file) + { + String path = file.getAbsolutePath(); + + return path.substring(0, path.lastIndexOf("_")); + } + + public static String getStartingLetters(String readLine, int indexOfTab) + { + String line = readLine.substring(0, indexOfTab); + + String key = null; + if (line.length() > 1) { + key = readLine.substring(0, 2); + } + else { + key = readLine.substring(0, 1); + } + key = key.toLowerCase(); + return key; + } +} diff --git a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/TestCreatedIndex.java b/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/TestCreatedIndex.java deleted file mode 100644 index a59cc809f0..0000000000 --- a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/TestCreatedIndex.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - -import java.io.File; - -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.frequency.Web1TFileAccessProvider; - -public class TestCreatedIndex -{ - - @Ignore - @Test - // Assumes index created with data from amisch wikipedia for 1 to 3 grams - public void testCreatedIndex() - throws Exception - { - Web1TFileAccessProvider web = new Web1TFileAccessProvider("en", new File( - "target/Index/"), 1, 3); - - assertEquals(200162, web.getNrOfNgrams(1)); - assertEquals(200162, web.getNrOfTokens()); - - assertGreater(-1, web.getNrOfNgrams(1)); - assertGreater(-1, web.getNrOfNgrams(2)); - assertGreater(-1, web.getNrOfNgrams(3)); - assertEquals(-1, web.getNrOfNgrams(4)); - - double l = web.getProbability("Amisch"); - assertEquals(0.002582907, l, 0.00000001); - } - - private void assertGreater(long i, long nrOfNgrams) - { - - if (nrOfNgrams <= i) - fail("Value is not greater"); - } - -} diff --git a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TConsolidatorTest.java b/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TConsolidatorTest.java deleted file mode 100644 index 1f29d092c7..0000000000 --- a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TConsolidatorTest.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t; - -import static org.junit.Assert.assertEquals; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.Comparator; -import java.util.LinkedList; - -import org.junit.Before; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.web1t.util.Web1TFileConsolidator; - -public class Web1TConsolidatorTest -{ - LinkedList filesToSort; - Comparator comparator; - - @Test - public void testConsolidation() - throws IOException - { - Web1TFileConsolidator consolidator = new Web1TFileConsolidator( - filesToSort, comparator, "UTF-8", 1); - - consolidator.consolidate(); - - LinkedList consolidatedFiles = consolidator - .getConsolidatedFiles(); - - String TAB = "\t"; - - assertEquals(2, consolidatedFiles.size()); - - // File #1 - File file = consolidatedFiles.pop(); - String[] lines = getLines(file); - - assertEquals(3, lines.length); - assertEquals("a" + TAB + "5", lines[0]); - assertEquals("although" + TAB + "16", lines[1]); - assertEquals("annoying" + TAB + "5", lines[2]); - - // File #2 - file = consolidatedFiles.pop(); - lines = getLines(file); - - assertEquals(4, lines.length); - assertEquals("often" + TAB + "5", lines[0]); - assertEquals("oil" + TAB + "32", lines[1]); - assertEquals("out" + TAB + "2", lines[2]); - assertEquals("out-of-order" + TAB + "9", lines[3]); - // Clean up calls - consolidator.cleanUp(); - consolidatedFiles = consolidator.getConsolidatedFiles(); - assertEquals(0, consolidatedFiles.size()); - - } - - @Test - public void testConsolidationWithFreq10() - throws IOException - { - Web1TFileConsolidator consolidator = new Web1TFileConsolidator( - filesToSort, comparator, "UTF-8", 10); - - consolidator.consolidate(); - - LinkedList consolidatedFiles = consolidator - .getConsolidatedFiles(); - - String TAB = "\t"; - - assertEquals(2, consolidatedFiles.size()); - - // File #1 - File file = consolidatedFiles.pop(); - String[] lines = getLines(file); - - assertEquals(1, lines.length); - assertEquals("although" + TAB + "16", lines[0]); - - // File #2 - file = consolidatedFiles.pop(); - lines = getLines(file); - - assertEquals(1, lines.length); - assertEquals("oil" + TAB + "32", lines[0]); - // Clean up calls - consolidator.cleanUp(); - consolidatedFiles = consolidator.getConsolidatedFiles(); - assertEquals(0, consolidatedFiles.size()); - - } - - @Before - public void setUp() - { - setUpFileList(); - setUpComparator(); - - } - - private void setUpComparator() - { - comparator = new Comparator() - { - @Override - public int compare(String r1, String r2) - { - return r1.compareTo(r2); - } - }; - } - - private void setUpFileList() - { - filesToSort = new LinkedList(); - - File file_1 = new File( - "src/test/resources/Web1TConsolidator/00000000_sorted"); - File file_2 = new File( - "src/test/resources/Web1TConsolidator/00000001_sorted"); - - filesToSort.add(file_1); - filesToSort.add(file_2); - - } - - private String[] getLines(File file) - throws IOException - { - BufferedReader reader = new BufferedReader(new InputStreamReader( - new FileInputStream(file), "UTF-8")); - - LinkedList lines = new LinkedList(); - - String line = ""; - while ((line = reader.readLine()) != null) { - lines.add(line); - } - reader.close(); - - return lines.toArray(new String[0]); - } -} diff --git a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TSorterTest.java b/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TSorterTest.java deleted file mode 100644 index 01d79bcf4f..0000000000 --- a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TSorterTest.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t; - -import static org.junit.Assert.assertEquals; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.Comparator; -import java.util.LinkedList; - -import org.junit.Before; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.web1t.util.Web1TFileSorter; - -public class Web1TSorterTest -{ - - LinkedList filesToSort; - Comparator comparator; - Web1TFileSorter sorter; - - @Test - public void testSorter() - throws IOException - { - Web1TFileSorter sorter = new Web1TFileSorter(filesToSort, comparator); - sorter.sort(); - LinkedList sortedFiles = sorter.getSortedFiles(); - - assertEquals(2, sortedFiles.size()); - - String TAB = "\t"; - - // File #1 - File file = sortedFiles.pop(); - String[] lines = getLines(file); - - assertEquals(3, lines.length); - assertEquals("a" + TAB + "3", lines[0]); - assertEquals("although" + TAB + "4", lines[1]); - assertEquals("annoying" + TAB + "5", lines[2]); - - //File #2 - file = sortedFiles.pop(); - lines = getLines(file); - - assertEquals(4, lines.length); - assertEquals("often" + TAB + "3", lines[0]); - assertEquals("oil" + TAB + "30", lines[1]); - assertEquals("out" + TAB + "2", lines[2]); - assertEquals("out-of-order" + TAB + "5", lines[3]); - - //Clean up calls - sorter.cleanUp(); - sortedFiles = sorter.getSortedFiles(); - assertEquals(0, sortedFiles.size()); - - } - - private String[] getLines(File file) - throws IOException - { - BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), - "UTF-8")); - - LinkedList lines = new LinkedList(); - - String line = ""; - while ((line = reader.readLine()) != null) { - lines.add(line); - } - reader.close(); - - return lines.toArray(new String[0]); - } - - @Before - public void setUp() - { - setUpFileList(); - setUpComparator(); - - } - - private void setUpComparator() - { - comparator = new Comparator() - { - @Override - public int compare(String r1, String r2) - { - return r1.compareTo(r2); - } - }; - } - - private void setUpFileList() - { - filesToSort = new LinkedList(); - - File file_1 = new File("src/test/resources/Web1TSorter/00000000_unsorted"); - File file_2 = new File("src/test/resources/Web1TSorter/00000001_unsorted"); - - filesToSort.add(file_1); - filesToSort.add(file_2); - - } -} diff --git a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TSplitterTest.java b/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TSplitterTest.java deleted file mode 100644 index 4f245ae426..0000000000 --- a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TSplitterTest.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t; - -import static org.junit.Assert.assertEquals; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.util.List; - -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; -import de.tudarmstadt.ukp.dkpro.core.io.web1t.util.Web1TFileSplitter; - -public class Web1TSplitterTest -{ - - FrequencyDistribution fdist; - File input; - File output; - - @Test - public void testSplitter() - throws IOException - { - Web1TFileSplitter splitter = new Web1TFileSplitter(input, output, "UTF-8", fdist, 0.1, 0); - splitter.split(); - List splits = splitter.getFiles(); - - assertEquals(4, splitter.getNextUnusedFileNumber()); - - // - assertEquals(4, splits.size()); - assertEquals(12, countWordsInSplitFiles(splits)); - // - splitter.cleanUp(); - splits = splitter.getFiles(); - assertEquals(0, splits.size()); - } - - private int countWordsInSplitFiles(List splits) - throws IOException - { - - int words = 0; - for (File file : splits) { - - BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream( - file), "UTF-8")); - - while (reader.readLine() != null) { - words++; - } - - reader.close(); - } - return words; - } - - @Before - public void setUp() - throws IOException - { - fdist = createTestInputFile(); - output = new File("src/test/resources/tmp." + this.getClass().getName()); - output.mkdir(); - } - - private FrequencyDistribution createTestInputFile() - throws IOException - { - input = new File("input.txt"); - - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( - new FileOutputStream(input), "UTF-8")); - - String[] words = new String[] { "Can", "you", "can", "a", "can", "as", - "a", "canner", "can", "can", "a", "can" }; - - FrequencyDistribution fdist = new FrequencyDistribution(); - for (String word : words) { - writer.write(word + "\t" + "1" + "\n"); - - if (word.length() > 1) { - String subsKey = word.substring(0, 2); - String subsKeyLowered = subsKey.toLowerCase(); - fdist.addSample(subsKeyLowered, 1); - } - else { - String subsKey = word.substring(0, 1); - String subsKeyLowered = subsKey.toLowerCase(); - fdist.addSample(subsKeyLowered, 1); - } - - } - - writer.close(); - - return fdist; - } - - @After - public void tearDown() - { - input.delete(); - - File[] files = output.listFiles(); - - for (File file : files) { - file.delete(); - } - - output.delete(); - } - -} diff --git a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/PipelineExample.java b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/PipelineExample.java similarity index 91% rename from dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/PipelineExample.java rename to dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/PipelineExample.java index 76d698306a..dd1b41a0ec 100644 --- a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/PipelineExample.java +++ b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/PipelineExample.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t; +package org.dkpro.core.io.web1t; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; @@ -23,15 +23,15 @@ import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.api.resources.DkproContext; +import org.dkpro.core.io.tei.TeiReader; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Ignore; import org.junit.Test; import com.googlecode.jweb1t.JWeb1TIndexer; -import de.tudarmstadt.ukp.dkpro.core.api.resources.DkproContext; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; public class PipelineExample { @@ -69,4 +69,4 @@ public void pipelineTest() throws Exception JWeb1TIndexer indexCreator = new JWeb1TIndexer("target/web1t/", 3); indexCreator.create(); } -} \ No newline at end of file +} diff --git a/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/TestCreatedIndex.java b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/TestCreatedIndex.java new file mode 100644 index 0000000000..99e8c1fc1f --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/TestCreatedIndex.java @@ -0,0 +1,60 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.io.File; + +import org.dkpro.core.frequency.Web1TFileAccessProvider; +import org.junit.Ignore; +import org.junit.Test; + +public class TestCreatedIndex +{ + + @Ignore + @Test + // Assumes index created with data from amisch wikipedia for 1 to 3 grams + public void testCreatedIndex() throws Exception + { + Web1TFileAccessProvider web = new Web1TFileAccessProvider("en", new File("target/Index/"), + 1, 3); + + assertEquals(200162, web.getNrOfNgrams(1)); + assertEquals(200162, web.getNrOfTokens()); + + assertGreater(-1, web.getNrOfNgrams(1)); + assertGreater(-1, web.getNrOfNgrams(2)); + assertGreater(-1, web.getNrOfNgrams(3)); + assertEquals(-1, web.getNrOfNgrams(4)); + + double l = web.getProbability("Amisch"); + assertEquals(0.002582907, l, 0.00000001); + } + + private void assertGreater(long i, long nrOfNgrams) + { + + if (nrOfNgrams <= i) { + fail("Value is not greater"); + } + } + +} diff --git a/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TConsolidatorTest.java b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TConsolidatorTest.java new file mode 100644 index 0000000000..96e9a652ec --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TConsolidatorTest.java @@ -0,0 +1,166 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t; + +import static org.junit.Assert.assertEquals; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.Comparator; +import java.util.LinkedList; + +import org.dkpro.core.io.web1t.util.Web1TFileConsolidator; +import org.junit.Before; +import org.junit.Test; + +public class Web1TConsolidatorTest +{ + LinkedList filesToSort; + Comparator comparator; + + @Test + public void testConsolidation() + throws IOException + { + Web1TFileConsolidator consolidator = new Web1TFileConsolidator( + filesToSort, comparator, "UTF-8", 1); + + consolidator.consolidate(); + + LinkedList consolidatedFiles = consolidator + .getConsolidatedFiles(); + + String TAB = "\t"; + + assertEquals(2, consolidatedFiles.size()); + + // File #1 + File file = consolidatedFiles.pop(); + String[] lines = getLines(file); + + assertEquals(3, lines.length); + assertEquals("a" + TAB + "5", lines[0]); + assertEquals("although" + TAB + "16", lines[1]); + assertEquals("annoying" + TAB + "5", lines[2]); + + // File #2 + file = consolidatedFiles.pop(); + lines = getLines(file); + + assertEquals(4, lines.length); + assertEquals("often" + TAB + "5", lines[0]); + assertEquals("oil" + TAB + "32", lines[1]); + assertEquals("out" + TAB + "2", lines[2]); + assertEquals("out-of-order" + TAB + "9", lines[3]); + // Clean up calls + consolidator.cleanUp(); + consolidatedFiles = consolidator.getConsolidatedFiles(); + assertEquals(0, consolidatedFiles.size()); + + } + + @Test + public void testConsolidationWithFreq10() + throws IOException + { + Web1TFileConsolidator consolidator = new Web1TFileConsolidator( + filesToSort, comparator, "UTF-8", 10); + + consolidator.consolidate(); + + LinkedList consolidatedFiles = consolidator + .getConsolidatedFiles(); + + String TAB = "\t"; + + assertEquals(2, consolidatedFiles.size()); + + // File #1 + File file = consolidatedFiles.pop(); + String[] lines = getLines(file); + + assertEquals(1, lines.length); + assertEquals("although" + TAB + "16", lines[0]); + + // File #2 + file = consolidatedFiles.pop(); + lines = getLines(file); + + assertEquals(1, lines.length); + assertEquals("oil" + TAB + "32", lines[0]); + // Clean up calls + consolidator.cleanUp(); + consolidatedFiles = consolidator.getConsolidatedFiles(); + assertEquals(0, consolidatedFiles.size()); + + } + + @Before + public void setUp() + { + setUpFileList(); + setUpComparator(); + + } + + private void setUpComparator() + { + comparator = new Comparator() + { + @Override + public int compare(String r1, String r2) + { + return r1.compareTo(r2); + } + }; + } + + private void setUpFileList() + { + filesToSort = new LinkedList(); + + File file_1 = new File( + "src/test/resources/Web1TConsolidator/00000000_sorted"); + File file_2 = new File( + "src/test/resources/Web1TConsolidator/00000001_sorted"); + + filesToSort.add(file_1); + filesToSort.add(file_2); + + } + + private String[] getLines(File file) + throws IOException + { + BufferedReader reader = new BufferedReader(new InputStreamReader( + new FileInputStream(file), "UTF-8")); + + LinkedList lines = new LinkedList(); + + String line = ""; + while ((line = reader.readLine()) != null) { + lines.add(line); + } + reader.close(); + + return lines.toArray(new String[0]); + } +} diff --git a/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TSorterTest.java b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TSorterTest.java new file mode 100644 index 0000000000..8fe7661808 --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TSorterTest.java @@ -0,0 +1,127 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t; + +import static org.junit.Assert.assertEquals; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.Comparator; +import java.util.LinkedList; + +import org.dkpro.core.io.web1t.util.Web1TFileSorter; +import org.junit.Before; +import org.junit.Test; + +public class Web1TSorterTest +{ + + LinkedList filesToSort; + Comparator comparator; + Web1TFileSorter sorter; + + @Test + public void testSorter() + throws IOException + { + Web1TFileSorter sorter = new Web1TFileSorter(filesToSort, comparator); + sorter.sort(); + LinkedList sortedFiles = sorter.getSortedFiles(); + + assertEquals(2, sortedFiles.size()); + + String TAB = "\t"; + + // File #1 + File file = sortedFiles.pop(); + String[] lines = getLines(file); + + assertEquals(3, lines.length); + assertEquals("a" + TAB + "3", lines[0]); + assertEquals("although" + TAB + "4", lines[1]); + assertEquals("annoying" + TAB + "5", lines[2]); + + //File #2 + file = sortedFiles.pop(); + lines = getLines(file); + + assertEquals(4, lines.length); + assertEquals("often" + TAB + "3", lines[0]); + assertEquals("oil" + TAB + "30", lines[1]); + assertEquals("out" + TAB + "2", lines[2]); + assertEquals("out-of-order" + TAB + "5", lines[3]); + + //Clean up calls + sorter.cleanUp(); + sortedFiles = sorter.getSortedFiles(); + assertEquals(0, sortedFiles.size()); + + } + + private String[] getLines(File file) + throws IOException + { + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), + "UTF-8")); + + LinkedList lines = new LinkedList(); + + String line = ""; + while ((line = reader.readLine()) != null) { + lines.add(line); + } + reader.close(); + + return lines.toArray(new String[0]); + } + + @Before + public void setUp() + { + setUpFileList(); + setUpComparator(); + + } + + private void setUpComparator() + { + comparator = new Comparator() + { + @Override + public int compare(String r1, String r2) + { + return r1.compareTo(r2); + } + }; + } + + private void setUpFileList() + { + filesToSort = new LinkedList(); + + File file_1 = new File("src/test/resources/Web1TSorter/00000000_unsorted"); + File file_2 = new File("src/test/resources/Web1TSorter/00000001_unsorted"); + + filesToSort.add(file_1); + filesToSort.add(file_2); + + } +} diff --git a/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TSplitterTest.java b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TSplitterTest.java new file mode 100644 index 0000000000..6f8619680a --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TSplitterTest.java @@ -0,0 +1,139 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t; + +import static org.junit.Assert.assertEquals; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.util.List; + +import org.dkpro.core.api.frequency.util.FrequencyDistribution; +import org.dkpro.core.io.web1t.util.Web1TFileSplitter; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class Web1TSplitterTest +{ + + FrequencyDistribution fdist; + File input; + File output; + + @Test + public void testSplitter() + throws IOException + { + Web1TFileSplitter splitter = new Web1TFileSplitter(input, output, "UTF-8", fdist, 0.1, 0); + splitter.split(); + List splits = splitter.getFiles(); + + assertEquals(4, splitter.getNextUnusedFileNumber()); + + // + assertEquals(4, splits.size()); + assertEquals(12, countWordsInSplitFiles(splits)); + // + splitter.cleanUp(); + splits = splitter.getFiles(); + assertEquals(0, splits.size()); + } + + private int countWordsInSplitFiles(List splits) + throws IOException + { + + int words = 0; + for (File file : splits) { + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream( + file), "UTF-8")); + + while (reader.readLine() != null) { + words++; + } + + reader.close(); + } + return words; + } + + @Before + public void setUp() + throws IOException + { + fdist = createTestInputFile(); + output = new File("src/test/resources/tmp." + this.getClass().getName()); + output.mkdir(); + } + + private FrequencyDistribution createTestInputFile() + throws IOException + { + input = new File("input.txt"); + + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(input), "UTF-8")); + + String[] words = new String[] { "Can", "you", "can", "a", "can", "as", + "a", "canner", "can", "can", "a", "can" }; + + FrequencyDistribution fdist = new FrequencyDistribution(); + for (String word : words) { + writer.write(word + "\t" + "1" + "\n"); + + if (word.length() > 1) { + String subsKey = word.substring(0, 2); + String subsKeyLowered = subsKey.toLowerCase(); + fdist.addSample(subsKeyLowered, 1); + } + else { + String subsKey = word.substring(0, 1); + String subsKeyLowered = subsKey.toLowerCase(); + fdist.addSample(subsKeyLowered, 1); + } + + } + + writer.close(); + + return fdist; + } + + @After + public void tearDown() + { + input.delete(); + + File[] files = output.listFiles(); + + for (File file : files) { + file.delete(); + } + + output.delete(); + } + +} diff --git a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TWriterTest.java b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TWriterTest.java similarity index 91% rename from dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TWriterTest.java rename to dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TWriterTest.java index 05c6dfd785..3d27a783d1 100644 --- a/dkpro-core-io-web1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/web1t/Web1TWriterTest.java +++ b/dkpro-core-io-web1t-asl/src/test/java/org/dkpro/core/io/web1t/Web1TWriterTest.java @@ -1,173 +1,173 @@ -/* - * Copyright 2011 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.web1t; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; -import static org.junit.Assert.assertEquals; - -import java.io.File; -import java.io.IOException; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.resource.ResourceInitializationException; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpLemmatizer; -import de.tudarmstadt.ukp.dkpro.core.frequency.Web1TFileAccessProvider; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -public class Web1TWriterTest -{ - private final int MIN_NGRAM = 1; - private final int MAX_NGRAM = 3; - - @Test - public void web1TFormatTestWithTwoMultiSlashedTypesAsFeaturePath() - throws Exception - { - File folder = testContext.getTestOutputFolder(); - - Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, new String[] { - Token.class.getName() + "/pos/PosValue", Token.class.getName() + "/lemma/value" }); - - assertEquals(1, web1tProvider.getFrequency("TO")); // "to" - assertEquals(1, web1tProvider.getFrequency("NNS")); // "sentences" - assertEquals(1, web1tProvider.getFrequency("EX")); // "there" - - assertEquals(1, web1tProvider.getFrequency("write")); - assertEquals(0, web1tProvider.getFrequency("written")); - - } - - @Test - public void web1TFormatTestWithMultiSlashedTypesAsFeaturePath() - throws Exception - { - File folder = testContext.getTestOutputFolder(); - - Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, - new String[] { Token.class.getName() + "/lemma/value" }); - - assertEquals(1, web1tProvider.getFrequency("write")); - assertEquals(0, web1tProvider.getFrequency("written")); - assertEquals(4, web1tProvider.getFrequency("sentence")); - - } - - @Test - public void web1TFormatTest_randomFrequencies() - throws Exception - { - File folder = testContext.getTestOutputFolder(); - - Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, - new String[] { Token.class.getName() }); - - assertEquals(4, web1tProvider.getFrequency(".")); - assertEquals(1, web1tProvider.getFrequency(",")); - assertEquals(3, web1tProvider.getFrequency("sentence")); - assertEquals(1, web1tProvider.getFrequency("written")); - - } - - @Test(expected = ResourceInitializationException.class) - public void web1TFormatTest_exceptionForInvalidMinFrequency1() - throws Exception - { - writeWeb1TFormat(new String[] { Token.class.getName() }, -1); - - } - - @Test(expected = ResourceInitializationException.class) - public void web1TFormatTest_exceptionForInvalidMinFrequency2() - throws Exception - { - writeWeb1TFormat(new String[] { Token.class.getName() }, 0); - - } - - private void writeWeb1TFormat(String[] strings, int minFreq) - throws UIMAException, IOException - { - CollectionReader reader = createReader(TextReader.class, - TextReader.PARAM_LANGUAGE, "en", - TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/", - TextReader.PARAM_PATTERNS, new String[] { "[+]**/*.txt" }); - - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription tagger = createEngineDescription(OpenNlpPosTagger.class); - - AnalysisEngineDescription lemmatizer = createEngineDescription(ClearNlpLemmatizer.class); - - AnalysisEngineDescription ngramWriter = createEngineDescription(Web1TWriter.class, - Web1TWriter.PARAM_TARGET_LOCATION, testContext.getTestOutputFolder(), - Web1TWriter.PARAM_INPUT_TYPES, strings, - Web1TWriter.PARAM_MIN_NGRAM_LENGTH, MIN_NGRAM, - Web1TWriter.PARAM_MAX_NGRAM_LENGTH, MAX_NGRAM, - Web1TWriter.PARAM_MIN_FREQUENCY, minFreq); - - SimplePipeline.runPipeline(reader, segmenter, tagger, lemmatizer, ngramWriter); - } - - private Web1TFileAccessProvider prepareWeb1TFormatTest(File target, String[] inputTypes) - throws Exception - { - writeWeb1TFormat(target, inputTypes); - - Web1TFileAccessProvider web1tProvider = new Web1TFileAccessProvider("en", target, - MIN_NGRAM, MAX_NGRAM); - - return web1tProvider; - } - - private void writeWeb1TFormat(File target, String[] inputPath) - throws Exception - { - CollectionReader reader = createReader(TextReader.class, - TextReader.PARAM_LANGUAGE, "en", - TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/", - TextReader.PARAM_PATTERNS, new String[] { "[+]**/*.txt" }); - - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription tagger = createEngineDescription(OpenNlpPosTagger.class); - - AnalysisEngineDescription lemmatizer = createEngineDescription(ClearNlpLemmatizer.class); - - AnalysisEngineDescription ngramWriter = createEngineDescription(Web1TWriter.class, - Web1TWriter.PARAM_TARGET_LOCATION, target, - Web1TWriter.PARAM_INPUT_TYPES, inputPath, - Web1TWriter.PARAM_MIN_NGRAM_LENGTH, MIN_NGRAM, - Web1TWriter.PARAM_MAX_NGRAM_LENGTH, MAX_NGRAM); - - SimplePipeline.runPipeline(reader, segmenter, tagger, lemmatizer, ngramWriter); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} +/* + * Copyright 2011 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.web1t; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.clearnlp.ClearNlpLemmatizer; +import org.dkpro.core.frequency.Web1TFileAccessProvider; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class Web1TWriterTest +{ + private final int MIN_NGRAM = 1; + private final int MAX_NGRAM = 3; + + @Test + public void web1TFormatTestWithTwoMultiSlashedTypesAsFeaturePath() + throws Exception + { + File folder = testContext.getTestOutputFolder(); + + Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, new String[] { + Token.class.getName() + "/pos/PosValue", Token.class.getName() + "/lemma/value" }); + + assertEquals(1, web1tProvider.getFrequency("TO")); // "to" + assertEquals(1, web1tProvider.getFrequency("NNS")); // "sentences" + assertEquals(1, web1tProvider.getFrequency("EX")); // "there" + + assertEquals(1, web1tProvider.getFrequency("write")); + assertEquals(0, web1tProvider.getFrequency("written")); + + } + + @Test + public void web1TFormatTestWithMultiSlashedTypesAsFeaturePath() + throws Exception + { + File folder = testContext.getTestOutputFolder(); + + Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, + new String[] { Token.class.getName() + "/lemma/value" }); + + assertEquals(1, web1tProvider.getFrequency("write")); + assertEquals(0, web1tProvider.getFrequency("written")); + assertEquals(4, web1tProvider.getFrequency("sentence")); + + } + + @Test + public void web1TFormatTest_randomFrequencies() + throws Exception + { + File folder = testContext.getTestOutputFolder(); + + Web1TFileAccessProvider web1tProvider = prepareWeb1TFormatTest(folder, + new String[] { Token.class.getName() }); + + assertEquals(4, web1tProvider.getFrequency(".")); + assertEquals(1, web1tProvider.getFrequency(",")); + assertEquals(3, web1tProvider.getFrequency("sentence")); + assertEquals(1, web1tProvider.getFrequency("written")); + + } + + @Test(expected = ResourceInitializationException.class) + public void web1TFormatTest_exceptionForInvalidMinFrequency1() + throws Exception + { + writeWeb1TFormat(new String[] { Token.class.getName() }, -1); + + } + + @Test(expected = ResourceInitializationException.class) + public void web1TFormatTest_exceptionForInvalidMinFrequency2() + throws Exception + { + writeWeb1TFormat(new String[] { Token.class.getName() }, 0); + + } + + private void writeWeb1TFormat(String[] strings, int minFreq) + throws UIMAException, IOException + { + CollectionReader reader = createReader(TextReader.class, + TextReader.PARAM_LANGUAGE, "en", + TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/", + TextReader.PARAM_PATTERNS, new String[] { "[+]**/*.txt" }); + + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription tagger = createEngineDescription(OpenNlpPosTagger.class); + + AnalysisEngineDescription lemmatizer = createEngineDescription(ClearNlpLemmatizer.class); + + AnalysisEngineDescription ngramWriter = createEngineDescription(Web1TWriter.class, + Web1TWriter.PARAM_TARGET_LOCATION, testContext.getTestOutputFolder(), + Web1TWriter.PARAM_INPUT_TYPES, strings, + Web1TWriter.PARAM_MIN_NGRAM_LENGTH, MIN_NGRAM, + Web1TWriter.PARAM_MAX_NGRAM_LENGTH, MAX_NGRAM, + Web1TWriter.PARAM_MIN_FREQUENCY, minFreq); + + SimplePipeline.runPipeline(reader, segmenter, tagger, lemmatizer, ngramWriter); + } + + private Web1TFileAccessProvider prepareWeb1TFormatTest(File target, String[] inputTypes) + throws Exception + { + writeWeb1TFormat(target, inputTypes); + + Web1TFileAccessProvider web1tProvider = new Web1TFileAccessProvider("en", target, + MIN_NGRAM, MAX_NGRAM); + + return web1tProvider; + } + + private void writeWeb1TFormat(File target, String[] inputPath) + throws Exception + { + CollectionReader reader = createReader(TextReader.class, + TextReader.PARAM_LANGUAGE, "en", + TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/", + TextReader.PARAM_PATTERNS, new String[] { "[+]**/*.txt" }); + + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription tagger = createEngineDescription(OpenNlpPosTagger.class); + + AnalysisEngineDescription lemmatizer = createEngineDescription(ClearNlpLemmatizer.class); + + AnalysisEngineDescription ngramWriter = createEngineDescription(Web1TWriter.class, + Web1TWriter.PARAM_TARGET_LOCATION, target, + Web1TWriter.PARAM_INPUT_TYPES, inputPath, + Web1TWriter.PARAM_MIN_NGRAM_LENGTH, MIN_NGRAM, + Web1TWriter.PARAM_MAX_NGRAM_LENGTH, MAX_NGRAM); + + SimplePipeline.runPipeline(reader, segmenter, tagger, lemmatizer, ngramWriter); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-web1t-asl/src/test/resources/log4j.properties b/dkpro-core-io-web1t-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-web1t-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-web1t-asl/src/test/resources/log4j2.xml b/dkpro-core-io-web1t-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-web1t-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/LICENSE.txt b/dkpro-core-io-webanno-asl/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/dkpro-core-io-webanno-asl/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dkpro-core-io-webanno-asl/NOTICE.txt b/dkpro-core-io-webanno-asl/NOTICE.txt new file mode 100644 index 0000000000..4bff3e0a39 --- /dev/null +++ b/dkpro-core-io-webanno-asl/NOTICE.txt @@ -0,0 +1,5 @@ +The code in this module has been ported from WebAnno in order to provide the ability to work +with TSV3 files outside of WebAnno. + +Copyright by Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology +Technische Universität Darmstadt diff --git a/dkpro-core-io-webanno-asl/pom.xml b/dkpro-core-io-webanno-asl/pom.xml new file mode 100644 index 0000000000..0c81e8441a --- /dev/null +++ b/dkpro-core-io-webanno-asl/pom.xml @@ -0,0 +1,140 @@ + + + 4.0.0 + + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT + ../dkpro-core-asl + + dkpro-core-io-webanno-asl + jar + DKPro Core ASL - IO - WebAnno TSV + + + org.apache.uima + uimaj-core + + + org.apache.uima + uimafit-core + + + org.apache.commons + commons-lang3 + + + commons-io + commons-io + + + org.slf4j + slf4j-api + + + org.dkpro.core + dkpro-core-api-io-asl + + + org.dkpro.core + dkpro-core-api-metadata-asl + + + org.dkpro.core + dkpro-core-api-parameter-asl + + + org.dkpro.core + dkpro-core-api-segmentation-asl + + + org.dkpro.core + dkpro-core-api-lexmorph-asl + + + org.dkpro.core + dkpro-core-api-syntax-asl + + + junit + junit + test + + + org.dkpro.core + dkpro-core-testing-asl + test + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + org.dkpro.core + dkpro-core-api-ner-asl + test + + + + + + + org.apache.uima + jcasgen-maven-plugin + ${uima.version} + + true + ${project.build.directory}/generated-test-sources/jcasgen + + src/test/resources/desc/type/**/*.xml + + + + + generate-test-sources + + generate + + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + addToSourceFolder + + + add-test-source + + process-test-sources + + + + ${project.build.directory}/generated-test-sources/jcasgen + + + + + + + + \ No newline at end of file diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java new file mode 100644 index 0000000000..7fefd964fd --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XReader.java @@ -0,0 +1,60 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.LineNumberReader; + +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XDeserializer; + +/** + * Reads the WebAnno TSV v3.x format. + */ +@ResourceMetaData(name = "WebAnno TSV v3.x Reader") +@MimeTypeCapability({MimeTypes.TEXT_X_WEBANNO_TSV3}) +public class WebannoTsv3XReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * Character encoding of the input data. + */ + public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; + @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") + private String encoding; + + @Override + public void getNext(JCas aJCas) throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + try (LineNumberReader br = new LineNumberReader( + new InputStreamReader(res.getInputStream(), encoding))) { + new Tsv3XDeserializer().read(br, aJCas); + } + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XWriter.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XWriter.java new file mode 100644 index 0000000000..ef886cc744 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3XWriter.java @@ -0,0 +1,82 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv; + +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XCasDocumentBuilder; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XCasSchemaAnalyzer; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XSerializer; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvDocument; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema; + +/** + * Writes the WebAnno TSV v3.x format. + */ +@ResourceMetaData(name = "WebAnno TSV v3.x Writer") +@MimeTypeCapability({MimeTypes.TEXT_X_WEBANNO_TSV3}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"}) +public class WebannoTsv3XWriter + extends JCasFileWriter_ImplBase +{ + /** + * The character encoding used by the input files. + */ + public static final String PARAM_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") + private String encoding; + + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".tsv") + private String filenameSuffix; + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + TsvSchema schema = Tsv3XCasSchemaAnalyzer.analyze(aJCas.getTypeSystem()); + + TsvDocument doc = Tsv3XCasDocumentBuilder.of(schema, aJCas); + + try (PrintWriter docOS = new PrintWriter(new OutputStreamWriter( + getOutputStream(aJCas, filenameSuffix), encoding))) { + new Tsv3XSerializer().write(docOS, doc); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Escaping.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Escaping.java new file mode 100644 index 0000000000..3ee0d7018f --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Escaping.java @@ -0,0 +1,120 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x; + +import static org.apache.commons.lang3.StringEscapeUtils.unescapeJava; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvFormatHeader; + +public class Escaping +{ + public static String escapeValue(String aValue) + { + return StringUtils.replaceEach(aValue, + new String[] { "\\", "[", "]", "|", "_", "->", ";", "\t", "\n", "*" }, + new String[] { "\\\\", "\\[", "\\]", "\\|", "\\_", "\\->", "\\;", "\\t", "\\n", + "\\*" }); + } + + public static String unescapeValue(String aValue) + { + return StringUtils.replaceEach(aValue, + new String[] { "\\\\", "\\[", "\\]", "\\|", "\\_", "\\->", "\\;", "\\t", "\\n", + "\\*" }, + new String[] { "\\", "[", "]", "|", "_", "->", ";", "\t", "\n", "*" }); + } + + public static String escapeText(String aText) + { + List pat = new ArrayList<>(); + List esc = new ArrayList<>(); + for (int i = 0; i < 32; i++) { + if (i > 7 && i < 14) { + continue; + } + pat.add(Character.toString((char) i)); + esc.add("\\" + Character.toString((char) i)); + } + // with a readable Java escape sequence + // TAB + pat.add("\t"); + esc.add("\\t"); + // linefeed + pat.add("\n"); + esc.add("\\n"); + // formfeed + pat.add("\f"); + esc.add("\\f"); + // carriage return + pat.add("\r"); + esc.add("\\r"); + // backspace + pat.add("\b"); + esc.add("\\b"); + // backslash + pat.add("\\"); + esc.add("\\\\"); + + return StringUtils.replaceEach(aText, pat.toArray(new String[pat.size()]), + esc.toArray(new String[esc.size()])); + } + + public static String unescapeText(TsvFormatHeader aHeader, String aText) { + if ("3.1".equals(aHeader.getVersion())) { + return unescapeJava(aText); + } + else if ("3.2".equals(aHeader.getVersion())) { + List pat = new ArrayList<>(); + List esc = new ArrayList<>(); + for (int i = 0; i < 32; i++) { + if (i > 7 && i < 14) { + continue; + } + pat.add(Character.toString((char) i)); + esc.add("\\" + Character.toString((char) i)); + } + // with a readable Java escape sequence + // TAB + pat.add("\t"); + esc.add("\\t"); + // linefeed + pat.add("\n"); + esc.add("\\n"); + // formfeed + pat.add("\f"); + esc.add("\\f"); + // carriage return + pat.add("\r"); + esc.add("\\r"); + // backspace + pat.add("\b"); + esc.add("\\b"); + // backslash + pat.add("\\"); + esc.add("\\\\"); + return StringUtils.replaceEach(aText, + esc.toArray(new String[esc.size()]), pat.toArray(new String[pat.size()])); + } + else { + throw new IllegalStateException("Unknown version: [" + aHeader.getVersion() + "]"); + } + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java new file mode 100644 index 0000000000..594d624e9b --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java @@ -0,0 +1,395 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv.internal.tsv3x; + +import static java.lang.Math.max; +import static java.lang.Math.min; +import static org.apache.uima.fit.util.FSUtil.getFeature; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.PLACEHOLDER; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.RELATION_REF; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.SLOT_TARGET; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.CHAIN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.RELATION; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.SPAN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.CHAIN_FIRST_FEAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.CHAIN_NEXT_FEAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_SOURCE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_TARGET; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.ListIterator; +import java.util.Map.Entry; +import java.util.NavigableMap; +import java.util.Set; +import java.util.TreeMap; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.fit.util.FSUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvColumn; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvDocument; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvFormatHeader; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSentence; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSubToken; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvToken; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvUnit; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; + +public class Tsv3XCasDocumentBuilder +{ + public static TsvDocument of(TsvSchema aSchema, JCas aJCas) + { + TsvFormatHeader format = new TsvFormatHeader("WebAnno TSV", "3.2"); + TsvDocument doc = new TsvDocument(format, aSchema, aJCas); + + // Fill document with all the sentences and tokens + for (Sentence uimaSentence : select(aJCas, Sentence.class)) { + TsvSentence sentence = doc.createSentence(uimaSentence); + for (Token uimaToken : selectCovered(Token.class, uimaSentence)) { + sentence.createToken(uimaToken); + } + } + + // Scan for chains + for (Type headType : aSchema.getChainHeadTypes()) { + // In UIMAv3, the iteration order of feature structures that are not in a sorted index + // is random. For some reason UIMAv2 returned them in creation order. By sorting by the + // FS ID, we restore the UIMAv2 situation. + List heads = new ArrayList<>( + CasUtil.selectFS(aJCas.getCas(), headType)); + heads.sort(Comparator.comparing(fs -> aJCas.getLowLevelCas().ll_getFSRef(fs))); + for (FeatureStructure chainHead : heads) { + List elements = new ArrayList<>(); + AnnotationFS link = getFeature(chainHead, CHAIN_FIRST_FEAT, AnnotationFS.class); + while (link != null) { + elements.add(link); + link = getFeature(link, CHAIN_NEXT_FEAT, AnnotationFS.class); + } + if (!elements.isEmpty()) { + Type elementType = headType.getFeatureByBaseName(CHAIN_FIRST_FEAT).getRange(); + doc.createChain(headType, elementType, elements); + } + } + } + + + // Build indexes over the token start and end positions such that we can quickly locate + // tokens based on their offsets. + NavigableMap tokenBeginIndex = new TreeMap<>(); + NavigableMap tokenEndIndex = new TreeMap<>(); + List tokens = new ArrayList<>(); + for (TsvSentence sentence : doc.getSentences()) { + for (TsvToken token : sentence.getTokens()) { + tokenBeginIndex.put(token.getBegin(), token); + tokenEndIndex.put(token.getEnd(), token); + tokens.add(token); + } + } + + // Scan all annotations of the types defined in the schema and use them to set up sub-token + // units. + for (Type type : aSchema.getUimaTypes()) { + if (aSchema.getIgnoredTypes().contains(type)) { + continue; + } + + LayerType layerType = aSchema.getLayerType(type); + + boolean addDisambiguationIdIfStacked = SPAN.equals(layerType); + + for (AnnotationFS annotation : CasUtil.select(aJCas.getCas(), type)) { + // Mind that we might actually get an annotation here which is a subtype of `type`! + doc.activateType(type); + + // Get the relevant begin and end offsets for the current annotation + int begin = annotation.getBegin(); + int end = annotation.getEnd(); + + // According to DKPro Core conventions, the offsets of relations must match + // those of the target (i.e. the offsets of a Dependency relation must match + // those of the dependent). Thus, we obtain the offsets from the target, just + // to be sure. + if (RELATION.equals(layerType)) { + AnnotationFS targetFS = getFeature(annotation, FEAT_REL_TARGET, + AnnotationFS.class); + begin = targetFS.getBegin(); + end = targetFS.getEnd(); + } + + Entry beginTokenEntry = tokenBeginIndex.floorEntry(begin); + // If the current annotation has leading whitespace, we have wrongly fetched the + // token before the start token using floorEntry(end) - so let's try to correct this + if ( + // found begin token but found the wrong one + ( + beginTokenEntry != null && + beginTokenEntry.getValue().getEnd() < begin && + tokenEndIndex.higherEntry(begin) != null + ) || + // didn't find begin token because annotation starts before the first token + beginTokenEntry == null + ) { + beginTokenEntry = tokenEndIndex.higherEntry(begin); + } + if (beginTokenEntry == null) { + throw new IllegalStateException( + "Unable to find begin token starting at or before " + begin + + " (first token starts at " + + tokenBeginIndex.pollFirstEntry().getKey() + + ") for annotation: " + annotation); + } + + Entry endTokenEntry = tokenEndIndex.ceilingEntry(end); + // If the current annotation has trailing whitespace, we have wrongly fetched the + // token after the end token using ceilingEntry(end) - so let's try to correct this + if ( + // found end token but found the wrong one + ( + endTokenEntry != null && + endTokenEntry.getValue().getBegin() > end && + tokenEndIndex.lowerEntry(end) != null + ) || + // didn't find end token because annotation ends beyond the last token + endTokenEntry == null + ) { + endTokenEntry = tokenEndIndex.lowerEntry(end); + } + if (endTokenEntry == null) { + throw new IllegalStateException("Unable to find end token ending at or after " + + end + " (last token ends at " + tokenEndIndex.pollLastEntry().getKey() + + ") for annotation: " + annotation); + } + + TsvToken beginToken = beginTokenEntry.getValue(); + TsvToken endToken = endTokenEntry.getValue(); + + // For zero-width annotations, the begin token must match the end token. + // Zero-width annotations between two directly adjacent tokens are always + // considered to be at the end of the first token rather than at the beginning + // of the second token, so we trust the tokenEndIndex here and override the + // value obtained from the tokenBeginIndex. + if (begin == end) { + beginToken = endToken; + } + + boolean singleToken = beginToken == endToken; + boolean zeroWitdh = begin == end; + boolean multiTokenCapable = SPAN.equals(layerType) || CHAIN.equals(layerType); + + // Annotation exactly matches token boundaries - it doesn't really matter if the + // begin and end tokens are the same; we don't have to create sub-token units + // in either case. + if (beginToken.getBegin() == begin && endToken.getEnd() == end) { + doc.mapFS2Unit(annotation, beginToken); + beginToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked); + + if (multiTokenCapable) { + endToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked); + } + } + else if (zeroWitdh) { + // If the zero-width annotation happens in the space between tokens or after + // the last token, we move it to the end of the closest preceding token in order + // not to have to drop it entirely. + int position = min(beginToken.getEnd(), end); + // ... or if the annotation is before the first token, then we move it to the + // begin of the first token + if (position < beginToken.getBegin()) { + position = beginToken.getBegin(); + } + TsvSubToken t = beginToken.createSubToken(position, position); + doc.mapFS2Unit(annotation, t); + t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked); + } + else { + // Annotation covers only suffix of the begin token - we need to create a + // suffix sub-token unit on the begin token. The new sub-token defines the ID of + // the annotation. + if (beginToken.getBegin() < begin) { + TsvSubToken t = beginToken.createSubToken(begin, + min(beginToken.getEnd(), end)); + doc.mapFS2Unit(annotation, t); + t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked); + } + // If not the sub-token is ID-defining, then the begin token is ID-defining + else { + beginToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked); + doc.mapFS2Unit(annotation, beginToken); + } + + // Annotation covers only a prefix of the end token - we need to create a + // prefix sub-token unit on the end token. If the current annotation is limited + // only to the sub-token unit, then it defines the ID. This is determined by + // checking if if singleToke is true. + if (endToken.getEnd() > end) { + TsvSubToken t = endToken.createSubToken(max(endToken.getBegin(), begin), + end); + t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked); + + if (!singleToken) { + doc.mapFS2Unit(annotation, t); + } + } + else if (!singleToken && multiTokenCapable) { + endToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked); + } + } + + // The annotation must also be added to all tokens between the begin token and + // the end token + if (multiTokenCapable && !singleToken) { + ListIterator i = tokens.listIterator(tokens.indexOf(beginToken)); + TsvToken t; + while ((t = i.next()) != endToken) { + if (t != beginToken) { + t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked); + } + } + } + + // Multi-token span annotations must get a disambiguation ID + if (SPAN.equals(layerType) && !singleToken) { + doc.addDisambiguationId(annotation); + } + } + } + + // Scan all created units to see which columns actually contains values + for (TsvSentence sentence : doc.getSentences()) { + for (TsvToken token : sentence.getTokens()) { + scanUnitForActiveColumns(token); + scanUnitForAmbiguousSlotReferences(token); + for (TsvSubToken subToken : token.getSubTokens()) { + scanUnitForActiveColumns(subToken); + scanUnitForAmbiguousSlotReferences(subToken); + } + } + } + + // Activate the placeholder columns for any active types for which no other columns are + // active. + Set activeTypesNeedingPlaceholders = new HashSet<>(doc.getActiveTypes()); + for (TsvColumn col : doc.getActiveColumns()) { + activeTypesNeedingPlaceholders.remove(col.uimaType); + } + for (TsvColumn col : doc.getSchema().getColumns()) { + if (PLACEHOLDER.equals(col.featureType) + && activeTypesNeedingPlaceholders.contains(col.uimaType)) { + doc.activateColumn(col); + } + } + + return doc; + } + + private static void scanUnitForActiveColumns(TsvUnit aUnit) + { + for (TsvColumn col : aUnit.getDocument().getSchema().getColumns()) { + List annotationsForColumn = aUnit.getAnnotationsForColumn(col); + if (!annotationsForColumn.isEmpty()) { +// if (SPAN.equals(col.layerType) && SLOT_TARGET.equals(col.featureType)) { +// for (AnnotationFS aFS : annotationsForColumn) { +// FeatureStructure[] links = getFeature(aFS, col.uimaFeature, +// FeatureStructure[].class); +// if (links != null && links.length > 0) { +// } +// } +// } + + + if (!PLACEHOLDER.equals(col.featureType)) { + aUnit.getDocument().activateColumn(col); + } + + // COMPATIBILITY NOTE: + // WebAnnoTsv3Writer obtains the type of a relation target column not from the + // type system definition but rather by looking at target used by the first + // actual annotation. + if (RELATION.equals(col.layerType) && RELATION_REF.equals(col.featureType)) { + AnnotationFS annotation = annotationsForColumn.get(0); + FeatureStructure target = FSUtil.getFeature(annotation, FEAT_REL_SOURCE, + FeatureStructure.class); + + if (target == null) { + throw new IllegalStateException( + "Relation does not have its source feature (" + FEAT_REL_SOURCE + + ") set: " + annotation); + } + + if (col.uimaType.getName().equals(Dependency.class.getName())) { + // COMPATIBILITY NOTE: + // WebAnnoTsv3Writer hard-changes the target type for DKPro Core + // Dependency annotations from Token to POS - the reason is not really + // clear. Probably because the Dependency relations in the WebAnno UI + // attach to POS (Token's are not visible as annotations in the UI). + col.setTargetTypeHint(aUnit.getDocument().getJCas().getTypeSystem() + .getType(POS.class.getName())); + } + else { + TsvSchema schema = aUnit.getDocument().getSchema(); + col.setTargetTypeHint(schema.getEffectiveType(target)); + } + } + } + } + } + + /** + * If a slot feature has the target type Annotation, then any kind of annotation can be + * used as slot filler. In this case, the targets are ambiguous and require an disambiguaton + * ID. + */ + private static void scanUnitForAmbiguousSlotReferences(TsvUnit aUnit) + { + for (TsvColumn col : aUnit.getDocument().getSchema().getColumns()) { + if (SPAN.equals(col.layerType) && SLOT_TARGET.equals(col.featureType) + && CAS.TYPE_NAME_ANNOTATION.equals(col.getTargetTypeHint().getName())) { + List annotationsForColumn = aUnit.getAnnotationsForColumn(col); + for (AnnotationFS aFS : annotationsForColumn) { + FeatureStructure[] links = getFeature(aFS, col.uimaFeature, + FeatureStructure[].class); + if (links != null) { + for (FeatureStructure link : links) { + AnnotationFS targetFS = getFeature(link, TsvSchema.FEAT_SLOT_TARGET, + AnnotationFS.class); + if (targetFS == null) { + throw new IllegalStateException("Slot link has no target: " + link); + } + aUnit.getDocument().addDisambiguationId(targetFS); + } + } + } + } + } + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasSchemaAnalyzer.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasSchemaAnalyzer.java new file mode 100644 index 0000000000..31728812a9 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasSchemaAnalyzer.java @@ -0,0 +1,255 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv.internal.tsv3x; + +import static java.util.Arrays.asList; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.CHAIN_ELEMENT_TYPE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.CHAIN_LINK_TYPE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.RELATION_REF; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.CHAIN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.RELATION; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.SPAN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.CHAIN_FIRST_FEAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.CHAIN_NEXT_FEAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.COREFERENCE_RELATION_FEATURE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.COREFERENCE_TYPE_FEATURE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_SOURCE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_TARGET; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_SLOT_ROLE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_SLOT_TARGET; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvColumn; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class Tsv3XCasSchemaAnalyzer +{ + private static final Logger LOG = LoggerFactory.getLogger(Tsv3XCasSchemaAnalyzer.class); + + public static TsvSchema analyze(TypeSystem aTypeSystem) + { + TsvSchema schema = new TsvSchema(); + + Set chainLinkTypes = new HashSet<>(); + + // Consider only direct subtypes of the UIMA Annotation type. Currently, WebAnno only + // supports such layers. + Type annotationType = aTypeSystem.getType(CAS.TYPE_NAME_ANNOTATION); + Type documentAnnotationType = aTypeSystem.getType(CAS.TYPE_NAME_DOCUMENT_ANNOTATION); + for (Type type : aTypeSystem.getDirectSubtypes(annotationType)) { + if (aTypeSystem.subsumes(documentAnnotationType, type)) { + continue; + } + + if (type.getName().equals(Token.class.getName()) + || type.getName().equals(Sentence.class.getName())) { + continue; + } + + switch (schema.getLayerType(type)) { + case RELATION: + schema.addColumn(new TsvColumn(type, RELATION, + type.getFeatureByBaseName(FEAT_REL_SOURCE), RELATION_REF)); + generateColumns(aTypeSystem, schema, RELATION, type); + break; + case CHAIN: + schema.addColumn(new TsvColumn(type, CHAIN, + type.getFeatureByBaseName(COREFERENCE_TYPE_FEATURE), + CHAIN_ELEMENT_TYPE)); + schema.addColumn(new TsvColumn(type, CHAIN, + type.getFeatureByBaseName(COREFERENCE_RELATION_FEATURE), + CHAIN_LINK_TYPE)); + chainLinkTypes.add(type); + break; + case SPAN: + schema.addColumn(new TsvColumn(type, SPAN)); + generateColumns(aTypeSystem, schema, SPAN, type); + break; + case INCOMPATIBLE: + // Do not generate a column definition for incompatible types. + break; + } + } + + // Scan again for the chain head types + Type topType = aTypeSystem.getType(CAS.TYPE_NAME_ANNOTATION_BASE); + for (Type type : aTypeSystem.getDirectSubtypes(topType)) { + Feature firstFeat = type.getFeatureByBaseName(CHAIN_FIRST_FEAT); + if (firstFeat != null && chainLinkTypes.contains(firstFeat.getRange())) { + schema.addChainHeadType(type); + } + } + + return schema; + } + + private static void generateColumns(TypeSystem aTypeSystem, TsvSchema aSchema, + LayerType aLayerType, Type aType) + { + List specialFeatures = asList( + CAS.FEATURE_FULL_NAME_BEGIN, + CAS.FEATURE_FULL_NAME_END, + CAS.FEATURE_FULL_NAME_SOFA); + + for (Feature feat : aType.getFeatures()) { + if (specialFeatures.contains(feat.getName())) { + continue; + } + + if (isPrimitiveFeature(feat)) { + aSchema.addColumn(new TsvColumn(aType, aLayerType, feat, FeatureType.PRIMITIVE)); + } + else if (SPAN.equals(aLayerType) && isSlotFeature(aTypeSystem, feat)) { + aSchema.addColumn(new TsvColumn(aType, aLayerType, feat, FeatureType.SLOT_ROLE)); + Type slotTargetType = feat.getRange().getComponentType() + .getFeatureByBaseName(FEAT_SLOT_TARGET).getRange(); + TsvColumn targetColumn = new TsvColumn(aType, aLayerType, feat, + FeatureType.SLOT_TARGET); + targetColumn.setTargetTypeHint(slotTargetType); + aSchema.addColumn(targetColumn); + } + } + } + + private static boolean isSlotFeature(TypeSystem aTypeSystem, Feature feat) + { + // This could be written more efficiently using a single conjunction. The reason this + // has not been done is to facilitate debugging. + + boolean multiValued = feat.getRange().isArray() || aTypeSystem + .subsumes(aTypeSystem.getType(CAS.TYPE_NAME_LIST_BASE), feat.getRange()); + + if (!multiValued) { + return false; + } + + boolean linkInheritsFromTop = CAS.TYPE_NAME_TOP + .equals(aTypeSystem.getParent(feat.getRange().getComponentType()).getName()); + boolean hasTargetFeature = feat.getRange().getComponentType() + .getFeatureByBaseName(FEAT_SLOT_TARGET) != null; + boolean hasRoleFeature = feat.getRange().getComponentType() + .getFeatureByBaseName(FEAT_SLOT_ROLE) != null; + + return linkInheritsFromTop && hasTargetFeature && hasRoleFeature; + } + + public static boolean isRelationLayer(Type aType) + { + Feature relSourceFeat = aType.getFeatureByBaseName(FEAT_REL_SOURCE); + boolean hasSourceFeature = relSourceFeat != null && !isPrimitiveFeature(relSourceFeat); + Feature relTargetFeat = aType.getFeatureByBaseName(FEAT_REL_TARGET); + boolean hasTargetFeature = relTargetFeat != null && !isPrimitiveFeature(relTargetFeat); + + boolean compatible = true; + for (Feature feat : aType.getFeatures()) { + if ( + CAS.FEATURE_BASE_NAME_SOFA.equals(feat.getShortName()) || + FEAT_REL_SOURCE.equals(feat.getShortName()) || + FEAT_REL_TARGET.equals(feat.getShortName()) + ) { + continue; + } + + if (!isPrimitiveFeature(feat)) { + compatible = false; + //LOG.debug("Incompatible feature in type [" + aType + "]: " + feat); + break; + } + } + + return hasSourceFeature && hasTargetFeature && compatible; + } + + public static boolean isChainLayer(Type aType) + { + boolean hasTypeFeature = aType.getFeatureByBaseName(COREFERENCE_TYPE_FEATURE) != null; + boolean hasRelationFeature = aType + .getFeatureByBaseName(COREFERENCE_RELATION_FEATURE) != null; + boolean nameEndsInLink = aType.getName().endsWith("Link"); + + boolean compatible = true; + for (Feature feat : aType.getFeatures()) { + if ( + CAS.FEATURE_BASE_NAME_SOFA.equals(feat.getShortName()) || + CHAIN_NEXT_FEAT.equals(feat.getShortName()) || + COREFERENCE_TYPE_FEATURE.equals(feat.getShortName()) || + COREFERENCE_RELATION_FEATURE.equals(feat.getShortName()) + ) { + continue; + } + + if (!isPrimitiveFeature(feat)) { + compatible = false; + LOG.debug("Incompatible feature in type [" + aType + "]: " + feat); + break; + } + } + + return hasTypeFeature && hasRelationFeature && nameEndsInLink && compatible; + } + + public static boolean isSpanLayer(Type aType) + { + boolean compatible = true; + for (Feature feat : aType.getFeatures()) { + if (CAS.FEATURE_BASE_NAME_SOFA.equals(feat.getShortName())) { + continue; + } + + if (!(isPrimitiveFeature(feat) || isSlotFeature(feat))) { + compatible = false; + //LOG.debug("Incompatible feature in type [" + aType + "]: " + feat); + break; + } + + } + + return compatible; + } + + public static boolean isSlotFeature(Feature aFeature) + { + if (aFeature.getRange().isArray()) { + Type elementType = aFeature.getRange().getComponentType(); + + return elementType.getFeatureByBaseName(FEAT_SLOT_TARGET) != null + && elementType.getFeatureByBaseName(FEAT_SLOT_ROLE) != null; + } + + return false; + } + + public static boolean isPrimitiveFeature(Feature aFeature) + { + return aFeature.getRange().isPrimitive(); + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XDeserializer.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XDeserializer.java new file mode 100644 index 0000000000..481df45666 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XDeserializer.java @@ -0,0 +1,898 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x; + +import static java.util.Collections.emptyList; +import static java.util.regex.Pattern.quote; +import static org.apache.commons.lang3.StringUtils.isBlank; +import static org.apache.commons.lang3.StringUtils.isEmpty; +import static org.apache.commons.lang3.StringUtils.removeEnd; +import static org.apache.commons.lang3.StringUtils.repeat; +import static org.apache.commons.lang3.StringUtils.split; +import static org.apache.commons.lang3.StringUtils.splitPreserveAllTokens; +import static org.apache.commons.lang3.StringUtils.startsWith; +import static org.apache.commons.lang3.StringUtils.substringAfter; +import static org.apache.commons.lang3.StringUtils.substringAfterLast; +import static org.apache.commons.lang3.StringUtils.substringBefore; +import static org.apache.commons.lang3.StringUtils.substringBeforeLast; +import static org.apache.uima.fit.util.FSUtil.getFeature; +import static org.apache.uima.fit.util.FSUtil.setFeature; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.Escaping.unescapeText; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.CHAIN_ELEMENT_TYPE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.CHAIN_LINK_TYPE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.PRIMITIVE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.RELATION_REF; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.SLOT_ROLE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.SLOT_TARGET; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.FIELD_SEPARATOR; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_FIELD_SEPARATOR; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_LAYER_PREFIX_SEPARATOR; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_BASE_TYPE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_CHAIN_LAYER; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_FORMAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_RELATION_LAYER; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_ROLE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_SPAN_LAYER; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.LINE_BREAK; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.NULL_COLUMN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.NULL_VALUE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.PREFIX_TEXT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.SLOT_SEP; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.STACK_SEP; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.CHAIN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.RELATION; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.SPAN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.CHAIN_FIRST_FEAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.CHAIN_NEXT_FEAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.COREFERENCE_RELATION_FEATURE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.COREFERENCE_TYPE_FEATURE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_SOURCE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_TARGET; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_SLOT_ROLE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_SLOT_TARGET; + +import java.io.IOException; +import java.io.LineNumberReader; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvChain; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvColumn; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvDocument; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvFormatHeader; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSentence; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSubToken; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvToken; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvUnit; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; + +public class Tsv3XDeserializer +{ + private static final Pattern FORMAT_PATTERN = Pattern.compile( + "^" + quote(HEADER_PREFIX_FORMAT) + "(?.*) " + "(?\\d+\\.\\d+)$"); + + private static final Pattern STACK_SEP_PATTERN = Pattern + .compile("(?" + "(?\\d+-\\d+)$"); + + private ThreadLocal> deferredActions = new ThreadLocal<>(); + + public void read(LineNumberReader aIn, JCas aJCas) throws IOException + { + deferredActions.set(new ArrayList<>()); + + TsvFormatHeader format = readFormat(aIn); + TsvSchema schema = readSchema(aIn, aJCas); + + // Read the extra blank line after the schema declaration + String emptyLine = aIn.readLine(); + assert isEmpty(emptyLine); + + TsvDocument doc = new TsvDocument(format, schema, aJCas); + + for (TsvColumn column : schema.getColumns()) { + doc.activateColumn(column); + doc.activateType(column.uimaType); + } + + readContent(aIn, doc); + + // Complete the addition of the chains + CAS cas = aJCas.getCas(); + for (TsvChain chain : doc.getChains()) { + if (chain.getElements().isEmpty()) { + continue; + } + + Iterator linkIterator = chain.getElements().iterator(); + AnnotationFS link = linkIterator.next(); + + // Create the chain head + FeatureStructure head = cas.createFS(chain.getHeadType()); + setFeature(head, CHAIN_FIRST_FEAT, link); + cas.addFsToIndexes(head); + + // Connect the links to each other + AnnotationFS prevLink = link; + while (linkIterator.hasNext()) { + link = linkIterator.next(); + setFeature(prevLink, CHAIN_NEXT_FEAT, link); + prevLink = link; + } + } + + // Run deferred actions + for (Runnable action : deferredActions.get()) { + action.run(); + } + } + + private TsvFormatHeader readFormat(LineNumberReader aIn) throws IOException + { + String line = aIn.readLine(); + + expectStartsWith(line, HEADER_PREFIX_FORMAT); + + Matcher m = FORMAT_PATTERN.matcher(line); + if (!m.matches()) { + throw new IOException("Illlegal format header: [" + line + "]"); + } + + TsvFormatHeader format = new TsvFormatHeader(m.group("NAME"), m.group("VERSION")); + return format; + } + + private TsvSchema readSchema(LineNumberReader aIn, JCas aJCas) throws IOException + { + TsvSchema schema = new TsvSchema(); + int columnIndex = 0; + + // Read first line + for (String line = aIn.readLine(); !isBlank(line); line = aIn.readLine()) { + LayerType layerType; + + // Determine layer type + if (startsWith(line, HEADER_PREFIX_SPAN_LAYER)) { + layerType = SPAN; + } + else if (startsWith(line, HEADER_PREFIX_RELATION_LAYER)) { + layerType = RELATION; + } + else if (startsWith(line, HEADER_PREFIX_CHAIN_LAYER)) { + layerType = CHAIN; + } + else { + // End of header + break; + } + + // Split up layer declaration + String rest = substringAfter(line, HEADER_LAYER_PREFIX_SEPARATOR); + String[] fields = split(rest, HEADER_FIELD_SEPARATOR); + + // Get the type name and the corresponding UIMA type from the type system of the + // target CAS + String typeName = fields[0]; + Type uimaType = aJCas.getTypeSystem().getType(typeName); + if (uimaType == null) { + throw new IOException( + "CAS type system does not contain a type named [" + typeName + "]"); + } + + // Parse the column declarations starting at the second field (the first is the + // type name) + TsvColumn prevColumn = null; + for (int i = 1; i < fields.length; i++) { + String colDecl = fields[i]; + TsvColumn col = parseColumnDeclaration(aJCas, layerType, uimaType, columnIndex, + colDecl, prevColumn); + schema.addColumn(col); + columnIndex++; + prevColumn = col; + } + + // If there is no second field, then add a placeholder column + if (fields.length == 1) { + schema.addColumn(new TsvColumn(columnIndex, uimaType, layerType)); + columnIndex++; + } + } + + return schema; + } + + private TsvColumn parseColumnDeclaration(JCas aJCas, LayerType aLayerType, Type aUimaType, + int aIndex, String aColDecl, TsvColumn aPrevCol) + throws IOException + { + TypeSystem ts = aJCas.getTypeSystem(); + TsvColumn column; + // Determine the feature type: + // SLOT_ROLE - starts with "ROLE_" + if (SPAN.equals(aLayerType) && startsWith(aColDecl, HEADER_PREFIX_ROLE)) { + String[] subFields = splitPreserveAllTokens(aColDecl, '_'); + String featureName = substringAfter(subFields[1], ":"); + + Feature feat = aUimaType.getFeatureByBaseName(featureName); + if (feat == null) { + throw new IOException("CAS type [" + aUimaType.getName() + + "] does not have a feature called [" + featureName + "]"); + } + + column = new TsvColumn(aIndex, aUimaType, aLayerType, featureName, SLOT_ROLE); + + String typeName = subFields[2]; + Type type = ts.getType(typeName); + if (type == null) { + throw new IOException("CAS does not contain a type called [" + typeName + "]"); + } + + column.setTargetTypeHint(type); + } + // RELATION_REF - starts with "BT_ + else if (RELATION.equals(aLayerType) && startsWith(aColDecl, HEADER_PREFIX_BASE_TYPE)) { + column = new TsvColumn(aIndex, aUimaType, aLayerType, FEAT_REL_SOURCE, + RELATION_REF); + + String typeName = substringAfter(aColDecl, HEADER_PREFIX_BASE_TYPE); + Type type = ts.getType(typeName); + if (type == null) { + throw new IOException("CAS does not contain a type called [" + typeName + "]"); + } + + column.setTargetTypeHint(type); + } + // CHAIN_ELEMENT_TYPE - "referenceType" + else if (CHAIN.equals(aLayerType) && COREFERENCE_TYPE_FEATURE.equals(aColDecl)) { + column = new TsvColumn(aIndex, aUimaType, aLayerType, COREFERENCE_TYPE_FEATURE, + CHAIN_ELEMENT_TYPE); + } + // CHAIN_LINK_TYPE - "referenceRelation" + else if (CHAIN.equals(aLayerType) && COREFERENCE_RELATION_FEATURE.equals(aColDecl)) { + column = new TsvColumn(aIndex, aUimaType, aLayerType, COREFERENCE_RELATION_FEATURE, + CHAIN_LINK_TYPE); + } + // SLOT_TARGET - name of the link target type + else if (SPAN.equals(aLayerType) && aColDecl.contains(".") + || ts.getType(aColDecl) != null) { + // In case we got here because the column declaration contains a dot, let's check if + // the type name really exists in the target CAS. + if (ts.getType(aColDecl) == null) { + throw new IOException( + "CAS type system does not contain a type named [" + aColDecl + "]"); + } + + // The previous column must be a SLOT_ROLE because we need to obtain the feature + // name from it. + if (aPrevCol == null || !SLOT_ROLE.equals(aPrevCol.featureType)) { + throw new IOException( + "Slot target column declaration must follow slot role column declaration"); + } + + column = new TsvColumn(aIndex, aUimaType, aLayerType, + aPrevCol.uimaFeature.getShortName(), SLOT_TARGET); + + Type type = ts.getType(aColDecl); + if (type == null) { + throw new IOException("CAS does not contain a type called [" + aColDecl + "]"); + } + + column.setTargetTypeHint(type); + } + // PRIMITIVE - feature name + else if (aUimaType.getFeatureByBaseName(aColDecl) != null) { + column = new TsvColumn(aIndex, aUimaType, aLayerType, aColDecl, PRIMITIVE); + } + else { + throw new IOException("Type [" + aUimaType.getName() + + "] does not contain a feature called [" + aColDecl + "]"); + } + // PLACEHOLDER - empty column declaration, i.e. only a separator after type name + // This is not handled here, but rather in the calling method. + + return column; + } + + private void readContent(LineNumberReader aIn, TsvDocument aDoc) throws IOException + { + StringBuilder text = new StringBuilder(); + + State prevState = State.INTER_SENTENCE_SPACE; + State state = State.INTER_SENTENCE_SPACE; + + StringBuilder sentenceText = new StringBuilder(); + TsvSentence prevSentence = null; + TsvSentence sentence = null; + TsvToken token = null; + + List headerColumns = aDoc.getSchema() + .getHeaderColumns(aDoc.getSchema().getColumns()); + + String line = aIn.readLine(); + while (!State.END.equals(state)) { + // These variables are only used in TOKEN and SUBTOKEN states. + String[] fields = null; + String id = null; + String[] offsets = null; + int begin = -1; + int end = -1; + + // Determine the status of the current line + if (startsWith(line, PREFIX_TEXT)) { + state = State.SENTENCE; + } + else if (line == null) { + state = State.END; + } + else if (isEmpty(line)) { + state = State.INTER_SENTENCE_SPACE; + } + else { + fields = splitPreserveAllTokens(line, FIELD_SEPARATOR); + + // Get token metadata + id = fields[0]; + offsets = split(fields[1], "-"); + begin = Integer.valueOf(offsets[0]); + end = Integer.valueOf(offsets[1]); + + // TOKEN or SUBTOKEN? + if (id.contains(".")) { + state = State.SUBTOKEN; + } + else { + state = State.TOKEN; + } + } + + // Assert that the order of information in the file is correct + switch (prevState) { + case INTER_SENTENCE_SPACE: + if (!State.SENTENCE.equals(state)) { + throw new IOException("Line " + aIn.getLineNumber() + + ": Expected sentence header but got [" + state + "]"); + } + break; + case SENTENCE: + if (!(State.SENTENCE.equals(state) || State.TOKEN.equals(state))) { + throw new IOException("Line " + aIn.getLineNumber() + + ": Expected sentence header or token but got [" + state + "]"); + } + break; + case TOKEN: + case SUBTOKEN: + if (!(State.INTER_SENTENCE_SPACE.equals(state) || State.END.equals(state) + || State.TOKEN.equals(state) || State.SUBTOKEN.equals(state))) { + throw new IOException("Line " + aIn.getLineNumber() + + ": Expected token, sub-token or sentence break but got [" + state + + "]"); + } + break; + } + + // Do the actual parsing + switch (state) { + case END: + case INTER_SENTENCE_SPACE: + // End of sentence action + // The -1 here is to account for the tailing line break + sentence.getUimaSentence().setEnd(text.length() - 1); + sentence.getUimaSentence().addToIndexes(); + prevSentence = sentence; + sentence = null; + break; + case TOKEN: + // Note that the token value is not used here. When we get here, we have already + // added the complete sentence text to the text buffer. + + // End of sentence header action + if (State.SENTENCE.equals(prevState)) { + // If there is no space between the previous sentence and the current + // sentence, then we have to strip off the trailing line break from the + // last sentence! + if (text.length() > begin) { + assert text.length() == begin + 1; + assert text.charAt(text.length() - 1) == LINE_BREAK; + text.setLength(text.length() - 1); + } + + // If there is a gap between the current end of the text buffer and the + // offset of the first token in this sentence, then add whitespace to fill + // the gap. + if (text.length() < begin) { + text.append(repeat(' ', begin - text.length())); + } + + assert text.length() == begin; + assert sentence == null; + + Sentence uimaSentence = new Sentence(aDoc.getJCas()); + uimaSentence.setBegin(text.length()); + sentence = aDoc.createSentence(uimaSentence); + text.append(sentenceText); + sentenceText.setLength(0); + } + + // Token parsing action + Token uimaToken = new Token(aDoc.getJCas(), begin, end); + uimaToken.addToIndexes(); + token = sentence.createToken(uimaToken); + + // Read annotations from the columns + parseAnnotations(aDoc, sentence, token, fields, headerColumns); + break; + case SUBTOKEN: + // Read annotations from the columns + TsvSubToken subToken = token.createSubToken(begin, end); + parseAnnotations(aDoc, sentence, subToken, fields, headerColumns); + break; + case SENTENCE: + // Header parsing action + String textFragment = substringAfter(line, "="); + textFragment = unescapeText(aDoc.getFormatHeader(), textFragment); + sentenceText.append(textFragment); + sentenceText.append(LINE_BREAK); + break; + } + + prevState = state; + line = aIn.readLine(); + } + + aDoc.getJCas().setDocumentText(text.toString()); + + // After all data has been read, we also add the annotations with disambiguation ID to + // the CAS indexes. This ensures we only add them after their final begin/end offsets + // have been determined since most of these annotations are actually multi-token + // annotations. + CAS cas = aDoc.getJCas().getCas(); + Set fses = new LinkedHashSet<>(); + for (TsvSentence s : aDoc.getSentences()) { + for (TsvToken t : s.getTokens()) { + for (Type type : t.getUimaTypes()) { + fses.addAll(t.getUimaAnnotations(type)); + } + for (TsvSubToken st : t.getSubTokens()) { + for (Type type : st.getUimaTypes()) { + fses.addAll(st.getUimaAnnotations(type)); + } + } + } + } + fses.forEach(cas::addFsToIndexes); + } + + private void parseAnnotations(TsvDocument aDoc, TsvSentence aSentence, TsvUnit aUnit, + String[] aFields, List aHeaderColumns) + { + for (TsvColumn col : aHeaderColumns) { + String rawValue = aFields[col.index + 3]; + + if (NULL_COLUMN.equals(rawValue)) { + continue; + } + + String[] stackedValues = STACK_SEP_PATTERN.split(rawValue); + + int index = 0; + for (String val : stackedValues) { + parseAnnotation(aDoc, aSentence, aUnit, col, index, val); + index++; + } + } + } + + /** + * @param aDoc + * the TSV document. + * @param aSentence + * the current sentence. + * @param aUnit + * the current unit (token or subtoken). + * @param aCol + * the column definition. + * @param aStackingIndex + * the stack index within the column in case there are multiple stacked annotations + * (0-based). + * @param aValue + * the value. + */ + private void parseAnnotation(TsvDocument aDoc, TsvSentence aSentence, TsvUnit aUnit, + TsvColumn aCol, int aStackingIndex, String aValue) + { + // Make a copy of the value argument since we may be modifying it below. + String value = aValue; + + // Extract disambiguation/chain suffix if it exists. + // If it is a slot column, skip this step because disambiguation info is provided per + // slot value. + String disambiguationInfo = null; + if (!(SLOT_TARGET.equals(aCol.featureType))) { + if (aValue.endsWith("]") && !aValue.endsWith("\\]")) { + String buf = substringAfterLast(value, "["); + disambiguationInfo = substringBefore(buf, "]"); + value = substringBeforeLast(value, "["); + } + else { + Matcher m = CHAIN_SUFFIX_PATTERN.matcher(value); + if (m.matches()) { + disambiguationInfo = m.group("CHAIN"); + value = value.substring(0, m.start("CHAIN") - 2); + } + } + } + + assert disambiguationInfo == null || disambiguationInfo.length() > 0; + + // Create the annotation of fetch an existing one + AnnotationFS annotation; + switch (aCol.layerType) { + case SPAN: + annotation = getOrCreateSpanAnnotation(aCol, aUnit, aStackingIndex, disambiguationInfo); + break; + case RELATION: + annotation = getOrCreateRelationAnnotation(aCol, aUnit, aStackingIndex, + disambiguationInfo); + break; + case CHAIN: + annotation = getOrCreateChainAnnotation(aCol, aUnit, aStackingIndex, + disambiguationInfo); + break; + default: + throw new IllegalStateException("Unknown layer type [" + aCol.layerType + "]"); + } + + // Set feature values including references such as relation source/target or slot targets. + setFeatures(aCol, aUnit, annotation, disambiguationInfo, aStackingIndex, value); + } + + private AnnotationFS getOrCreateSpanAnnotation(TsvColumn aCol, TsvUnit aUnit, + int aStackingIndex, String aDisambiguationInfo) + { + int disambiguationId = aDisambiguationInfo != null ? Integer.valueOf(aDisambiguationInfo) + : -1; + + // Check if we have seen the same annotation already in the current unit but in another + // column. + AnnotationFS annotation = aUnit.getUimaAnnotation(aCol.uimaType, aStackingIndex); + // If not, check if we have seen the same annotation already in a previous unit + if (annotation == null && disambiguationId != -1) { + annotation = aUnit.getDocument().getDisambiguatedAnnotation(disambiguationId); + if (annotation != null) { + aUnit.addUimaAnnotation(annotation); + + // Extend the span of the existing annotation + // Unfortunately, the AnnotationFS interface does not define a setEnd() method. + setFeature(annotation, CAS.FEATURE_BASE_NAME_END, aUnit.getEnd()); + } + } + + // Still no annotation? Then we have to create one + if (annotation == null) { + annotation = aUnit.getDocument().getJCas().getCas().createAnnotation(aCol.uimaType, + aUnit.getBegin(), aUnit.getEnd()); + aUnit.addUimaAnnotation(annotation); + + // Check if there are slot features that need to be initialized + List otherColumnsForType = aUnit.getDocument().getSchema() + .getColumns(aCol.uimaType); + for (TsvColumn col : otherColumnsForType) { + if (SLOT_TARGET.equals(col.featureType)) { + setFeature(annotation, col.uimaFeature.getShortName(), emptyList()); + } + } + + // Special handling of DKPro Core Token-attached annotations + if (Lemma.class.getName().equals(aCol.uimaType.getName())) { + TsvToken token = (TsvToken) aUnit; + token.getUimaToken().setLemma((Lemma) annotation); + } + if (Stem.class.getName().equals(aCol.uimaType.getName())) { + TsvToken token = (TsvToken) aUnit; + token.getUimaToken().setStem((Stem) annotation); + } + if (MorphologicalFeatures.class.getName().equals(aCol.uimaType.getName())) { + TsvToken token = (TsvToken) aUnit; + token.getUimaToken().setMorph((MorphologicalFeatures) annotation); + } + if (POS.class.getName().equals(aCol.uimaType.getName())) { + TsvToken token = (TsvToken) aUnit; + token.getUimaToken().setPos((POS) annotation); + } + } + + // If the current annotation carries an disambiguation ID, then register it in the + // document so we can look up the annotation via its ID later. This is necessary + // to extend the range of multi-token IDs. + if (disambiguationId != -1) { + aUnit.getDocument().addDisambiguationId(annotation, disambiguationId); + } + + return annotation; + } + + private AnnotationFS getOrCreateRelationAnnotation(TsvColumn aCol, TsvUnit aUnit, + int aStackingIndex, String aDisambiguationInfo) + { + // Check if we have seen the same annotation already in the current unit but in another + // column. + AnnotationFS annotation = aUnit.getUimaAnnotation(aCol.uimaType, aStackingIndex); + + // If not, then we have to create one + if (annotation == null) { + annotation = aUnit.getDocument().getJCas().getCas().createAnnotation(aCol.uimaType, + aUnit.getBegin(), aUnit.getEnd()); + aUnit.addUimaAnnotation(annotation); + } + + return annotation; + } + + private AnnotationFS getOrCreateChainAnnotation(TsvColumn aCol, TsvUnit aUnit, + int aStackingIndex, String aDisambiguationInfo) + { + AnnotationFS annotation; + + // Check if we have seen the same annotation already in the current unit but in + // another column. + annotation = aUnit.getUimaAnnotation(aCol.uimaType, aStackingIndex); + + if (annotation == null && CHAIN_LINK_TYPE.equals(aCol.featureType)) { + // Check if there is already an element with the same index/chain ID + // No disambiguation info, only chain info: *->- + String[] ids = split(aDisambiguationInfo, "-"); + int chainId = Integer.valueOf(ids[0]); + int elementIndex = Integer.valueOf(ids[1]); + annotation = aUnit.getDocument().getChainElement(chainId, elementIndex); + + if (annotation != null) { + aUnit.addUimaAnnotation(annotation); + + // Extend the span of the existing annotation + // Unfortunately, the AnnotationFS interface does not define a setEnd() method. + setFeature(annotation, CAS.FEATURE_BASE_NAME_END, aUnit.getEnd()); + } + + // If not, then we have to create one - we do this only for link-type columns because + // these columns include the chain id and the element index which we both need to + // determine if there is already an existing annotation for this chain/element from + // an earlier unit (i.e. for multi-unit chain elements). + if (annotation == null) { + annotation = aUnit.getDocument().getJCas().getCas().createAnnotation(aCol.uimaType, + aUnit.getBegin(), aUnit.getEnd()); + aUnit.addUimaAnnotation(annotation); + } + } + + return annotation; + } + + private void setFeatures(TsvColumn aCol, TsvUnit aUnit, AnnotationFS aAnnotation, + String aDisambiguationInfo, int aStackingIndex, String aValue) + { + // Set the feature value on the annotation + switch (aCol.featureType) { + case PLACEHOLDER: + // Nothing to do! + break; + case CHAIN_LINK_TYPE: { + // No disambiguation info, only chain info: *->- + String[] ids = split(aDisambiguationInfo, "-"); + int chainId = Integer.valueOf(ids[0]); + int elementIndex = Integer.valueOf(ids[1]); + TsvChain chain = aUnit.getDocument().getChain(chainId); + if (chain == null) { + // Guess the head type using naming conventions. + String headTypeName = removeEnd(aCol.uimaType.getName(), "Link"); + headTypeName += "Chain"; + + Type headType = aUnit.getDocument().getJCas().getTypeSystem() + .getType(headTypeName); + if (headType == null) { + throw new IllegalStateException( + "CAS type system does not contain a type named [" + headTypeName + + "]"); + } + + chain = aUnit.getDocument().createChain(chainId, headType, aCol.uimaType); + } + + chain.putElement(elementIndex, aAnnotation); + // fall-through (to set the relation type) + } + case CHAIN_ELEMENT_TYPE: { + deferredActions.get().add(() -> { + // We need to do this later because first we need to wait until all the elements + // have been created from the link-type columns. Then we have to look the + // annotations up via their unit/stacking index. + AnnotationFS annotation = aUnit.getUimaAnnotation(aCol.uimaType, aStackingIndex); + setPrimitiveValue(aCol, annotation, aValue); + }); + break; + } + case PRIMITIVE: { + setPrimitiveValue(aCol, aAnnotation, aValue); + break; + } + case RELATION_REF: { + // Two disambiguation IDs in brackets after annotation value, e.g.: 1-1[0_2] + final int sourceDisambiguationId; + final int targetDisambiguationId; + if (aDisambiguationInfo != null) { + String[] ids = split(aDisambiguationInfo, "_"); + sourceDisambiguationId = Integer.valueOf(ids[0]); + targetDisambiguationId = Integer.valueOf(ids[1]); + } + else { + sourceDisambiguationId = -1; + targetDisambiguationId = -1; + } + + // We cannot set the source and target features set because we may not yet have + // created the relevant annotations. So we defer setting these values until all + // annotations have been created. + deferredActions.get().add(() -> { + Type attachType = aCol.getTargetTypeHint(); + + // COMPATIBILITY NOTE: + // WebAnnoTsv3Writer hard-changes the target type for DKPro Core + // Dependency annotations from Token to POS - the reason is not really + // clear. Probably because the Dependency relations in the WebAnno UI + // attach to POS (Token's are not visible as annotations in the UI). + if (aCol.uimaType.getName().equals(Dependency.class.getName())) { + attachType = aUnit.getDocument().getJCas().getTypeSystem() + .getType(Token.class.getName()); + } + + AnnotationFS sourceAnnotation = aUnit.getDocument().resolveReference(attachType, + aValue, sourceDisambiguationId); + + AnnotationFS targetAnnotation = aUnit.getDocument().resolveReference(attachType, + aUnit.getId(), targetDisambiguationId); + + assert sourceAnnotation != null; + assert targetAnnotation != null; + + setFeature(aAnnotation, FEAT_REL_SOURCE, sourceAnnotation); + setFeature(aAnnotation, FEAT_REL_TARGET, targetAnnotation); + }); + break; + } + case SLOT_ROLE: { + CAS cas = aUnit.getDocument().getJCas().getCas(); + List links = new ArrayList<>(); + if (!NULL_COLUMN.equals(aValue)) { + String[] values = SLOT_SEP_PATTERN.split(aValue); + for (String value : values) { + FeatureStructure linkFS = cas.createFS(aCol.getTargetTypeHint()); + if (!NULL_VALUE.equals(value)) { + String role = Escaping.unescapeValue(value); + setFeature(linkFS, FEAT_SLOT_ROLE, role); + } + // We index the link features here already so we do not have to track them + // down later. They do not have offsets and no other index-relevant features + // anyway. + cas.addFsToIndexes(linkFS); + links.add(linkFS); + } + } + setFeature(aAnnotation, aCol.uimaFeature.getShortName(), links); + break; + } + case SLOT_TARGET: { + // Setting the target feature has to be deferred until we have created all the + // annotations. + deferredActions.get().add(() -> { + String[] values; + if (NULL_COLUMN.equals(aValue)) { + values = new String[0]; + } + else { + values = SLOT_SEP_PATTERN.split(aValue); + } + + FeatureStructure[] links = getFeature(aAnnotation, + aCol.uimaFeature.getShortName(), FeatureStructure[].class); + + assert (links.length == 0 && values.length == 1 && NULL_VALUE.equals(values[0])) + || (values.length == links.length); + + for (int i = 0; i < values.length; i++) { + String value = values[i]; + + if (NULL_VALUE.equals(value) || NULL_COLUMN.equals(value)) { + continue; + } + + // Extract slot-local disambiguation info + int disambiguationId = -1; + if (value.endsWith("]") && !value.endsWith("\\]")) { + String disambiguationInfo = substringAfterLast(value, "["); + disambiguationId = Integer + .valueOf(substringBefore(disambiguationInfo, "]")); + value = substringBeforeLast(value, "["); + } + + AnnotationFS targetAnnotation = aUnit.getDocument() + .resolveReference(aCol.getTargetTypeHint(), value, disambiguationId); + + setFeature(links[i], FEAT_SLOT_TARGET, targetAnnotation); + } + }); + break; + } + } + } + + private void setPrimitiveValue(TsvColumn aCol, AnnotationFS aAnnotation, String aValue) + { + // Unescape value - this needs to be done after extracting the disambiguation ID and + // after determining whether the values is a null value. + if (!NULL_VALUE.equals(aValue)) { + String value = Escaping.unescapeValue(aValue); + Feature feat = aAnnotation.getType() + .getFeatureByBaseName(aCol.uimaFeature.getShortName()); + + if (feat == null) { + throw new IllegalArgumentException( + "CAS type [" + aAnnotation.getType() + "] does not have a feature called [" + + aCol.uimaFeature.getShortName() + "]"); + } + + aAnnotation.setFeatureValueFromString(feat, value); + } + } + + private void expectStartsWith(String aLine, String aPrefix) throws IOException + { + if (!startsWith(aLine, aPrefix)) { + throw new IOException( + "Line does not start with expected prefix [" + aPrefix + "]: [" + aLine + "]"); + } + } + + private enum State + { + END, SENTENCE, TOKEN, SUBTOKEN, INTER_SENTENCE_SPACE; + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSchemaAnalyzer.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSchemaAnalyzer.java new file mode 100644 index 0000000000..2e4079aca0 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSchemaAnalyzer.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv.internal.tsv3x; + +import static java.util.Arrays.asList; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.CHAIN_ELEMENT_TYPE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.CHAIN_LINK_TYPE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.RELATION_REF; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.CHAIN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.RELATION; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.SPAN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.CHAIN_FIRST_FEAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.COREFERENCE_RELATION_FEATURE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.COREFERENCE_TYPE_FEATURE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_SOURCE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_TARGET; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_SLOT_ROLE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_SLOT_TARGET; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvColumn; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class Tsv3XSchemaAnalyzer +{ + public static TsvSchema analyze(TypeSystem aTypeSystem) + { + TsvSchema schema = new TsvSchema(); + + Set chainLinkTypes = new HashSet<>(); + + // Consider only direct subtypes of the UIMA Annotation type. Currently, WebAnno only + // supports such layers. + Type annotationType = aTypeSystem.getType(CAS.TYPE_NAME_ANNOTATION); + Type documentAnnotationType = aTypeSystem.getType(CAS.TYPE_NAME_DOCUMENT_ANNOTATION); + for (Type type : aTypeSystem.getDirectSubtypes(annotationType)) { + if (aTypeSystem.subsumes(documentAnnotationType, type)) { + continue; + } + + if (type.getName().equals(Token.class.getName()) + || type.getName().equals(Sentence.class.getName())) { + continue; + } + + switch (schema.getLayerType(type)) { + case RELATION: + schema.addColumn(new TsvColumn(type, RELATION, + type.getFeatureByBaseName(FEAT_REL_SOURCE), RELATION_REF)); + generateColumns(aTypeSystem, schema, RELATION, type); + break; + case CHAIN: + schema.addColumn(new TsvColumn(type, CHAIN, + type.getFeatureByBaseName(COREFERENCE_TYPE_FEATURE), + CHAIN_ELEMENT_TYPE)); + schema.addColumn(new TsvColumn(type, CHAIN, + type.getFeatureByBaseName(COREFERENCE_RELATION_FEATURE), + CHAIN_LINK_TYPE)); + chainLinkTypes.add(type); + break; + case SPAN: + schema.addColumn(new TsvColumn(type, SPAN)); + generateColumns(aTypeSystem, schema, SPAN, type); + break; + } + } + + // Scan again for the chain head types + Type topType = aTypeSystem.getType(CAS.TYPE_NAME_ANNOTATION_BASE); + for (Type type : aTypeSystem.getDirectSubtypes(topType)) { + Feature firstFeat = type.getFeatureByBaseName(CHAIN_FIRST_FEAT); + if (firstFeat != null && chainLinkTypes.contains(firstFeat.getRange())) { + schema.addChainHeadType(type); + } + } + + return schema; + } + + private static void generateColumns(TypeSystem aTypeSystem, TsvSchema aSchema, + LayerType aLayerType, Type aType) + { + List specialFeatures = asList( + CAS.FEATURE_FULL_NAME_BEGIN, + CAS.FEATURE_FULL_NAME_END, + CAS.FEATURE_FULL_NAME_SOFA); + + for (Feature feat : aType.getFeatures()) { + if (specialFeatures.contains(feat.getName())) { + continue; + } + + if (isPrimitiveFeature(feat)) { + aSchema.addColumn(new TsvColumn(aType, aLayerType, feat, FeatureType.PRIMITIVE)); + } + else if (SPAN.equals(aLayerType) && isSlotFeature(aTypeSystem, feat)) { + aSchema.addColumn(new TsvColumn(aType, aLayerType, feat, FeatureType.SLOT_ROLE)); + Type slotTargetType = feat.getRange().getComponentType() + .getFeatureByBaseName(FEAT_SLOT_TARGET).getRange(); + TsvColumn targetColumn = new TsvColumn(aType, aLayerType, feat, + FeatureType.SLOT_TARGET); + targetColumn.setTargetTypeHint(slotTargetType); + aSchema.addColumn(targetColumn); + } + } + } + + private static boolean isSlotFeature(TypeSystem aTypeSystem, Feature feat) + { + // This could be written more efficiently using a single conjunction. The reason this + // has not been done is to facilitate debugging. + + boolean multiValued = feat.getRange().isArray() || aTypeSystem + .subsumes(aTypeSystem.getType(CAS.TYPE_NAME_LIST_BASE), feat.getRange()); + + if (!multiValued) { + return false; + } + + boolean linkInheritsFromTop = CAS.TYPE_NAME_TOP + .equals(aTypeSystem.getParent(feat.getRange().getComponentType()).getName()); + boolean hasTargetFeature = feat.getRange().getComponentType() + .getFeatureByBaseName(FEAT_SLOT_TARGET) != null; + boolean hasRoleFeature = feat.getRange().getComponentType() + .getFeatureByBaseName(FEAT_SLOT_ROLE) != null; + + return linkInheritsFromTop && hasTargetFeature && hasRoleFeature; + } + + public static boolean isRelationLayer(Type aType) + { + Feature relSourceFeat = aType.getFeatureByBaseName(FEAT_REL_SOURCE); + boolean hasSourceFeature = relSourceFeat != null && !isPrimitiveFeature(relSourceFeat); + Feature relTargetFeat = aType.getFeatureByBaseName(FEAT_REL_TARGET); + boolean hasTargetFeature = relTargetFeat != null && !isPrimitiveFeature(relTargetFeat); + + return hasSourceFeature && hasTargetFeature; + } + + public static boolean isChainLayer(Type aType) + { + boolean hasTypeFeature = aType.getFeatureByBaseName(COREFERENCE_TYPE_FEATURE) != null; + boolean hasRelationFeature = aType + .getFeatureByBaseName(COREFERENCE_RELATION_FEATURE) != null; + boolean nameEndsInLink = aType.getName().endsWith("Link"); + + return hasTypeFeature && hasRelationFeature && nameEndsInLink; + } + + public static boolean isPrimitiveFeature(Feature aFeature) + { + switch (aFeature.getRange().getName()) { + case CAS.TYPE_NAME_STRING: // fallthrough + case CAS.TYPE_NAME_BOOLEAN: // fallthrough + case CAS.TYPE_NAME_FLOAT: // fallthrough + case CAS.TYPE_NAME_INTEGER: + return true; + default: + return false; + } + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSerializer.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSerializer.java new file mode 100644 index 0000000000..2d434fa123 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XSerializer.java @@ -0,0 +1,395 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x; + +import static org.apache.commons.lang3.StringUtils.splitPreserveAllTokens; +import static org.apache.uima.fit.util.FSUtil.getFeature; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.Escaping.escapeText; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.Escaping.escapeValue; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.RELATION_REF; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.SLOT_ROLE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.SLOT_TARGET; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.FIELD_SEPARATOR; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_FIELD_SEPARATOR; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_BASE_TYPE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_CHAIN_LAYER; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_FORMAT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_RELATION_LAYER; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_ROLE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.HEADER_PREFIX_SPAN_LAYER; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.LINE_BREAK; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.NULL_COLUMN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.NULL_VALUE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.PREFIX_TEXT; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.SLOT_SEP; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FormatConstants.STACK_SEP; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.COREFERENCE_RELATION_FEATURE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.COREFERENCE_TYPE_FEATURE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_SOURCE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_REL_TARGET; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema.FEAT_SLOT_TARGET; + +import java.io.PrintWriter; +import java.util.List; + +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvChain; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvColumn; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvDocument; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvFormatHeader; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSchema; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSentence; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvSubToken; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvToken; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.TsvUnit; + +public class Tsv3XSerializer +{ + public void write(PrintWriter aOut, TsvDocument aDocument) + { + write(aOut, aDocument.getFormatHeader()); + + List headerColumns = aDocument.getSchema() + .getHeaderColumns(aDocument.getActiveColumns()); + + write(aOut, headerColumns); + + for (TsvSentence sentence : aDocument.getSentences()) { + aOut.print(LINE_BREAK); + write(aOut, sentence, headerColumns); + } + } + + public void write(PrintWriter aOut, TsvFormatHeader aHeader) + { + aOut.print(HEADER_PREFIX_FORMAT); + aOut.printf("%s %s\n", aHeader.getName(), aHeader.getVersion()); + } + + /** + * Write the schema header. + */ + public void write(PrintWriter aOut, List aHeaderColumns) + { + Type currentType = null; + for (TsvColumn col : aHeaderColumns) { + if (currentType == null || !currentType.equals(col.uimaType)) { + if (currentType != null) { + aOut.print(LINE_BREAK); + } + currentType = col.uimaType; + + switch (col.layerType) { + case SPAN: + aOut.print(HEADER_PREFIX_SPAN_LAYER); + break; + case RELATION: + aOut.print(HEADER_PREFIX_RELATION_LAYER); + break; + case CHAIN: + aOut.print(HEADER_PREFIX_CHAIN_LAYER); + break; + } + aOut.print(col.uimaType.getName()); + } + + if (RELATION_REF.equals(col.featureType)) { + aOut.print(HEADER_FIELD_SEPARATOR); + aOut.print(HEADER_PREFIX_BASE_TYPE); + if (col.getTargetTypeHint() != null) { + // COMPATIBILITY NOTE: + // WebAnnoTsv3Writer obtains the type of a relation target column not from + // the type system definition but rather by looking at target used by the + // first actual annotation. This assumes that relations are always only on + // a single type. + aOut.printf(col.getTargetTypeHint().getName()); + } + else { + aOut.printf(col.uimaFeature.getRange().getName()); + } + } + else if (SLOT_TARGET.equals(col.featureType)) { + // NOTE: This is the same as for the RELATION_REF except that the type + // name is not prefixed with "BT_" here. + + if (col.getTargetTypeHint() != null) { + // COMPATIBILITY NOTE: + // WebAnnoTsv3Writer obtains the type of a slot target column not from + // the type system definition but rather by looking at target used by the + // first actual annotation. + aOut.print(HEADER_FIELD_SEPARATOR); + aOut.print(col.getTargetTypeHint()); + } + else { + aOut.print(HEADER_FIELD_SEPARATOR); + aOut.print(col.uimaFeature.getRange().getName()); + } + } + else if (SLOT_ROLE.equals(col.featureType)) { + aOut.print(HEADER_FIELD_SEPARATOR); + aOut.print(HEADER_PREFIX_ROLE); + aOut.printf("%s_%s", col.uimaFeature.getName(), + col.uimaFeature.getRange().getComponentType().getName()); + } + else if (SLOT_TARGET.equals(col.featureType)) { + aOut.print(HEADER_FIELD_SEPARATOR); + aOut.print(col.uimaFeature.getRange().getComponentType() + .getFeatureByBaseName(FEAT_SLOT_TARGET).getRange().getName()); + } + else { + // COMPATIBILITY NOTE: + // Yes, this pipe symbol needs to be written + aOut.print("|"); + if (col.uimaFeature != null) { + aOut.print(col.uimaFeature.getShortName()); + } + } + } + + // Add line-break to terminate the final column definition + if (!aHeaderColumns.isEmpty()) { + aOut.print(LINE_BREAK); + } + + // COMPATIBILITY NOTE: + // This is really just to make the output match exactly TSV3 + aOut.print(LINE_BREAK); + } + + public void write(PrintWriter aOut, TsvSentence aSentence, List aHeaderColumns) + { + String[] lines = splitPreserveAllTokens(aSentence.getUimaSentence().getCoveredText(), + LINE_BREAK); + for (String line : lines) { + aOut.print(PREFIX_TEXT); + aOut.print(escapeText(line)); + aOut.print(LINE_BREAK); + } + + for (TsvToken token : aSentence.getTokens()) { + write(aOut, token, aHeaderColumns); + aOut.write(LINE_BREAK); + for (TsvSubToken subToken : token.getSubTokens()) { + write(aOut, subToken, aHeaderColumns); + aOut.write(LINE_BREAK); + } + } + } + + public void write(PrintWriter aOut, TsvUnit aUnit, List aHeaderColumns) + { + TsvDocument doc = aUnit.getDocument(); + + // Write unit ID + aOut.print(aUnit.getId()); + aOut.print(FIELD_SEPARATOR); + + // Write unit offset + aOut.printf("%d-%d", aUnit.getBegin(), aUnit.getEnd()); + aOut.print(FIELD_SEPARATOR); + + // Write unit text + aOut.print(doc.getJCas().getDocumentText().substring(aUnit.getBegin(), + aUnit.getEnd())); + aOut.printf(FIELD_SEPARATOR); + + // Write the remaining columns according to the schema definition + for (TsvColumn col : aHeaderColumns) { + // Write all the values in this column - there could be multiple due to stacking + writeValues(aOut, aUnit, col); + aOut.printf(FIELD_SEPARATOR); + } + } + + private void writeValues(PrintWriter aOut, TsvUnit aUnit, TsvColumn aCol) + { + List columnAnnos = aUnit.getAnnotationsForColumn(aCol); + + // Encode the annotation values for the current column + if (columnAnnos.isEmpty()) { + aOut.print(NULL_COLUMN); + } + else { + for (int i = 0; i < columnAnnos.size(); i++) { + if (i > 0) { + aOut.print(STACK_SEP); + } + + AnnotationFS fs = columnAnnos.get(i); + writeValue(aOut, aUnit.getDocument(), aCol, fs); + } + } + } + + private void writeValue(PrintWriter aOut, TsvDocument aDoc, TsvColumn aCol, AnnotationFS aFS) + { + // What kind of column is it? Depending on the type of column, the annotation value + // has to be encoded differently. + switch (aCol.featureType) { + case PLACEHOLDER: { + writePlaceholderValue(aOut, aDoc, aCol, aFS); + writeDisambiguationId(aOut, aDoc, aFS); + break; + } + case PRIMITIVE: { + writePrimitiveValue(aOut, aDoc, aCol, aFS); + writeDisambiguationId(aOut, aDoc, aFS); + break; + } + case RELATION_REF: { + writeRelationReference(aOut, aDoc, aCol, aFS); + break; + } + case SLOT_ROLE: { + writeSlotRole(aOut, aDoc, aCol, aFS); + break; + } + case SLOT_TARGET: { + writeSlotTarget(aOut, aDoc, aCol, aFS); + break; + } + case CHAIN_ELEMENT_TYPE: + writeChainElement(aOut, aDoc, aCol, aFS); + break; + case CHAIN_LINK_TYPE: + writeChainLink(aOut, aDoc, aCol, aFS); + break; + default: + throw new IllegalStateException("Unknown feature type: [" + aCol.featureType + "]"); + } + } + + private static void writeDisambiguationId(PrintWriter aOut, TsvDocument aDoc, AnnotationFS aFS) + { + Integer disambiguationId = aDoc.getDisambiguationId(aFS); + if (disambiguationId != null) { + aOut.printf("[%d]", disambiguationId); + } + } + + private static void writePlaceholderValue(PrintWriter aOut, TsvDocument aDoc, TsvColumn aCol, + AnnotationFS aFS) + { + aOut.print(NULL_VALUE); + } + + private static void writePrimitiveValue(PrintWriter aOut, TsvDocument aDoc, TsvColumn aCol, + AnnotationFS aFS) + { + Object value = getFeature(aFS, aCol.uimaFeature, Object.class); + value = value == null ? NULL_VALUE : escapeValue(String.valueOf(value)); + aOut.print(value); + } + + private static void writeRelationReference(PrintWriter aOut, TsvDocument aDoc, TsvColumn aCol, + AnnotationFS aFS) + { + AnnotationFS targetFS = getFeature(aFS, FEAT_REL_TARGET, AnnotationFS.class); + AnnotationFS sourceFS = getFeature(aFS, FEAT_REL_SOURCE, AnnotationFS.class); + + // The column contains the ID of the unit from which the relation is pointing to the + // current unit, i.e. the sourceUnit of the relation. + TsvUnit sourceUnit = aDoc.findIdDefiningUnit(sourceFS); + aOut.print(sourceUnit.getId()); + + // If the source/target is ambiguous, add the disambiguation IDs + Integer sourceId = aDoc.getDisambiguationId(sourceFS); + Integer targetId = aDoc.getDisambiguationId(targetFS); + if (sourceId != null || targetId != null) { + sourceId = sourceId != null ? sourceId : 0; + targetId = targetId != null ? targetId : 0; + aOut.printf("[%d_%d]", sourceId, targetId); + } + } + + private static void writeSlotRole(PrintWriter aOut, TsvDocument aDoc, TsvColumn aCol, + AnnotationFS aFS) + { + FeatureStructure[] links = getFeature(aFS, aCol.uimaFeature, FeatureStructure[].class); + if (links != null && links.length > 0) { + for (int i = 0; i < links.length; i++) { + if (i > 0) { + aOut.print(SLOT_SEP); + } + String value = getFeature(links[i], TsvSchema.FEAT_SLOT_ROLE, String.class); + value = value == null ? NULL_VALUE : escapeValue(value); + aOut.print(value); + } + } + else { + aOut.print(NULL_COLUMN); + } + writeDisambiguationId(aOut, aDoc, aFS); + } + + private static void writeSlotTarget(PrintWriter aOut, TsvDocument aDoc, TsvColumn aCol, + AnnotationFS aFS) + { + FeatureStructure[] links = getFeature(aFS, aCol.uimaFeature, FeatureStructure[].class); + if (links != null && links.length > 0) { + for (int i = 0; i < links.length; i++) { + if (i > 0) { + aOut.print(SLOT_SEP); + } + AnnotationFS targetFS = getFeature(links[i], TsvSchema.FEAT_SLOT_TARGET, + AnnotationFS.class); + if (targetFS == null) { + throw new IllegalStateException( + "Slot link has no target: " + links[i]); + } + + TsvUnit target = aDoc.findIdDefiningUnit(targetFS); + if (target == null) { + throw new IllegalStateException( + "Unable to find ID-defining unit for annotation: " + targetFS); + } + + aOut.print(target.getId()); + writeDisambiguationId(aOut, aDoc, targetFS); + } + } + else { + // If the slot hosts has no slots, we use this column as a placeholder so we know + // the span of the slot host + aOut.print(NULL_VALUE); + } + } + + private static void writeChainElement(PrintWriter aOut, TsvDocument aDoc, TsvColumn aCol, + AnnotationFS aFS) + { + String value = getFeature(aFS, COREFERENCE_TYPE_FEATURE, String.class); + value = value == null ? NULL_VALUE : escapeValue(value); + + TsvChain chain = aDoc.getChain(aFS); + + aOut.printf("%s[%d]", value, chain.getId()); + } + + private static void writeChainLink(PrintWriter aOut, TsvDocument aDoc, TsvColumn aCol, + AnnotationFS aFS) + { + String value = getFeature(aFS, COREFERENCE_RELATION_FEATURE, String.class); + value = value == null ? NULL_VALUE : escapeValue(value); + + TsvChain chain = aDoc.getChain(aFS); + + aOut.printf("%s->%d-%d", value, chain.getId(), chain.indexOf(aFS) + 1); + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/FeatureType.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/FeatureType.java new file mode 100644 index 0000000000..9f88528ac9 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/FeatureType.java @@ -0,0 +1,31 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +public enum FeatureType { + /** + * If an annotation does not have any features or feature values set, then still a column + * is required to indicate the location of annotations. + */ + PLACEHOLDER, + PRIMITIVE, + SLOT_ROLE, + SLOT_TARGET, + RELATION_REF, + CHAIN_ELEMENT_TYPE, + CHAIN_LINK_TYPE; +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/FormatConstants.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/FormatConstants.java new file mode 100644 index 0000000000..a2818ad24d --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/FormatConstants.java @@ -0,0 +1,45 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +public class FormatConstants +{ + public static final String HEADER_LAYER_PREFIX_SEPARATOR = "="; + public static final String HEADER_PREFIX_FORMAT = "#FORMAT" + HEADER_LAYER_PREFIX_SEPARATOR; + public static final String HEADER_PREFIX_ROLE = "ROLE_"; + public static final String HEADER_FIELD_SEPARATOR = "|"; + public static final String HEADER_PREFIX_BASE_TYPE = "BT_"; + public static final String HEADER_PREFIX_CHAIN_LAYER = "#T_CH" + HEADER_LAYER_PREFIX_SEPARATOR; + public static final String HEADER_PREFIX_RELATION_LAYER = "#T_RL" + + HEADER_LAYER_PREFIX_SEPARATOR; + public static final String HEADER_PREFIX_SPAN_LAYER = "#T_SP" + HEADER_LAYER_PREFIX_SEPARATOR; + + public static final String PREFIX_TEXT = "#Text="; + + public static final String FIELD_SEPARATOR = "\t"; + public static final char LINE_BREAK = '\n'; + public static final String NULL_VALUE = "*"; + public static final String NULL_COLUMN = "_"; + public static final String STACK_SEP = "|"; + public static final String SLOT_SEP = ";"; + + public FormatConstants() + { + // TODO Auto-generated constructor stub + } + +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/LayerType.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/LayerType.java new file mode 100644 index 0000000000..4e141b4594 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/LayerType.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +public enum LayerType { + SPAN, + RELATION, + CHAIN, + INCOMPATIBLE; +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvChain.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvChain.java new file mode 100644 index 0000000000..0e6ede19b9 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvChain.java @@ -0,0 +1,107 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.TreeMap; + +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; + +public class TsvChain +{ + private final int id; + private final Type headType; + private final Type elementType; + private final NavigableMap elements = new TreeMap<>(); + private final Map fs2ChainIndex; + private List cachedElementsList = null; + + public TsvChain(int aId, Type aHeadType, Type aElementType, + Map aSharedIndex) + { + id = aId; + headType = aHeadType; + elementType = aElementType; + fs2ChainIndex = aSharedIndex; + } + + public TsvChain(int aId, Type aHeadType, Type aElementType, List aElements, + Map aSharedIndex) + { + id = aId; + headType = aHeadType; + elementType = aElementType; + fs2ChainIndex = aSharedIndex; + + for (int i = 0; i < aElements.size(); i++) { + putElement(i, aElements.get(i)); + } + } + + public int getId() + { + return id; + } + + public Type getHeadType() + { + return headType; + } + + public Type getElementType() + { + return elementType; + } + + public void addElement(AnnotationFS aElement) + { + elements.put(elements.size(), aElement); + cachedElementsList = null; + fs2ChainIndex.put(aElement, this); + } + + public void putElement(int aIndex, AnnotationFS aElement) + { + elements.put(aIndex, aElement); + cachedElementsList = null; + fs2ChainIndex.put(aElement, this); + } + + public AnnotationFS getElement(int aIndex) + { + return elements.get(aIndex); + } + + public Collection getElements() + { + return elements.values(); + } + + public int indexOf(AnnotationFS aElement) + { + // This may be called often so we internally cache the list of elements. + if (cachedElementsList == null) { + cachedElementsList = new ArrayList<>(elements.values()); + } + return cachedElementsList.indexOf(aElement); + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvColumn.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvColumn.java new file mode 100644 index 0000000000..c70f06a831 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvColumn.java @@ -0,0 +1,123 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; + +public class TsvColumn +{ + // The index is currently only set when parsing a schema to deserialize a TSV file. During + // serialization, it is not really needed and always set to -1. + public final int index; + public final Type uimaType; + public final Feature uimaFeature; + public final LayerType layerType; + public final FeatureType featureType; + + private Type targetTypeHint; + + public TsvColumn(Type aUimaType, LayerType aLayerType) + { + this(aUimaType, aLayerType, (Feature) null, FeatureType.PLACEHOLDER); + } + + public TsvColumn(Type aUimaType, LayerType aLayerType, String aUimaFeatureName, + FeatureType aFeatureType) + { + this(aUimaType, aLayerType, aUimaType.getFeatureByBaseName(aUimaFeatureName), aFeatureType); + } + + public TsvColumn(Type aUimaType, LayerType aLayerType, Feature aUimaFeature, + FeatureType aFeatureType) + { + this(-1, aUimaType, aLayerType, aUimaFeature, aFeatureType); + } + + public TsvColumn(int aIndex, Type aUimaType, LayerType aLayerType) + { + this(aIndex, aUimaType, aLayerType, (Feature) null, FeatureType.PLACEHOLDER); + } + + public TsvColumn(int aIndex, Type aUimaType, LayerType aLayerType, String aUimaFeatureName, + FeatureType aFeatureType) + { + this(aIndex, aUimaType, aLayerType, aUimaType.getFeatureByBaseName(aUimaFeatureName), + aFeatureType); + } + + public TsvColumn(int aIndex, Type aUimaType, LayerType aLayerType, Feature aUimaFeature, + FeatureType aFeatureType) + { + index = aIndex; + uimaType = aUimaType; + layerType = aLayerType; + uimaFeature = aUimaFeature; + featureType = aFeatureType; + } + + public void setTargetTypeHint(Type aTargetTypeHint) + { + targetTypeHint = aTargetTypeHint; + } + + public Type getTargetTypeHint() + { + return targetTypeHint; + } + + public int getIndex() + { + return index; + } + + public Type getUimaType() + { + return uimaType; + } + + public Feature getUimaFeature() + { + return uimaFeature; + } + + public LayerType getLayerType() + { + return layerType; + } + + public FeatureType getFeatureType() + { + return featureType; + } + + @Override + public String toString() + { + StringBuilder builder = new StringBuilder(); + builder.append("TsvColumn [uimaType="); + builder.append(uimaType); + builder.append(", layerType="); + builder.append(layerType); + builder.append(", uimaFeature="); + builder.append(uimaFeature); + builder.append(", featureType="); + builder.append(featureType); + builder.append("]"); + return builder.toString(); + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvDocument.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvDocument.java new file mode 100644 index 0000000000..43cc9c74d1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvDocument.java @@ -0,0 +1,256 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class TsvDocument +{ + private final Pattern PATTERN_UNIT_ID = Pattern.compile( + "^(?\\d+)-(?\\d+)(\\.(?\\d))?$"); + + private final TsvFormatHeader format; + private final TsvSchema schema; + private final JCas jcas; + private final Map fs2unitIndex = new HashMap<>(); + private final List sentences = new ArrayList<>(); + private final List chains = new ArrayList<>(); + private final Map fs2ChainIndex = new HashMap<>(); + private final Set activeColumns = new HashSet<>(); + private final Set activeTypes = new HashSet<>(); + private final Map fs2IdIndex = new HashMap<>(); + private final Map id2fsIndex = new HashMap<>(); + + public TsvDocument(TsvFormatHeader aFormat, TsvSchema aSchema, JCas aJCas) + { + format = aFormat; + schema = aSchema; + jcas = aJCas; + } + + public void activateColumn(TsvColumn aColumn) + { + activeColumns.add(aColumn); + } + + public Set getActiveColumns() + { + return activeColumns; + } + + public void activateType(Type aType) + { + activeTypes.add(aType); + } + + public Set getActiveTypes() + { + return activeTypes; + } + + /** + * Get the unit which defines the TSV ID for the given feature structure. This can be either a + * token or it could be a sub-token if the feature structure is properly nested in a token of if + * it triggered the creation of a prefix or suffix sub-token. + * + * @param aFS + * an annotation. + * + * @return the unit defining the TSV ID for the given feature structure + */ + public TsvUnit findIdDefiningUnit(AnnotationFS aFS) + { + return fs2unitIndex.get(aFS); + } + + public void mapFS2Unit(AnnotationFS aFS, TsvUnit aUnit) + { + fs2unitIndex.put(aFS, aUnit); + } + + public TsvChain createChain(Type aHeadType, Type aElementType, List aElements) + { + TsvChain chain = new TsvChain(chains.size() + 1, aHeadType, aElementType, aElements, + fs2ChainIndex); + chains.add(chain); + return chain; + } + + public TsvChain createChain(int aChainId, Type aHeadType, Type aElementType) + { + TsvChain chain = new TsvChain(aChainId, aHeadType, aElementType, fs2ChainIndex); + chains.add(chain); + return chain; + } + + public AnnotationFS getChainElement(int aChainId, int aElementIndex) + { + TsvChain chain = getChain(aChainId); + if (chain != null) { + return chain.getElement(aElementIndex); + } + else { + return null; + } + } + + public TsvSentence createSentence(Sentence aUimaSentence) + { + TsvSentence sentence = new TsvSentence(this, aUimaSentence, sentences.size() + 1); + sentences.add(sentence); + return sentence; + } + + public TsvToken createToken(TsvSentence aSentence, Token aUimaToken, int aPosition) + { + TsvToken token = new TsvToken(this, aSentence, aUimaToken, aPosition); + mapFS2Unit(token.getUimaToken(), token); + return token; + } + + public TsvUnit getUnit(String aUnitId) + { + Matcher m = PATTERN_UNIT_ID.matcher(aUnitId); + if (!m.matches()) { + throw new IllegalArgumentException("Invalid unit ID: [" + aUnitId + "]"); + } + + TsvToken token = getToken(Integer.valueOf(m.group("SENT")) - 1, + Integer.valueOf(m.group("TOKEN")) - 1); + + String stid = m.group("SUBTOKEN"); + if (stid != null) { + return token.getSubTokens().get(Integer.valueOf(stid) - 1); + } + else { + return token; + } + } + + public TsvToken getToken(int aSentencePosition, int aTokenPosition) + { + return sentences.get(aSentencePosition).getTokens().get(aTokenPosition); + } + + public List getChains() + { + return chains; + } + + public TsvChain getChain(int aChainId) + { + // This could be optimized if the chains were stored in a map instead of a list. + return chains.stream().filter(c -> c.getId() == aChainId).findAny().orElse(null); + } + + public TsvChain getChain(AnnotationFS aTargetFS) + { + return fs2ChainIndex.get(aTargetFS); + } + + public List getSentences() + { + return sentences; + } + + public TsvSchema getSchema() + { + return schema; + } + + public JCas getJCas() + { + return jcas; + } + + public void addDisambiguationId(AnnotationFS aAnnotation) + { + int newId = fs2IdIndex.size() + 1; + boolean keyExisted = fs2IdIndex.putIfAbsent(aAnnotation, newId) != null; + if (!keyExisted) { + id2fsIndex.put(newId, aAnnotation); + } + } + + public void addDisambiguationId(AnnotationFS aAnnotation, int aId) + { + AnnotationFS oldEntry = id2fsIndex.put(aId, aAnnotation); + assert oldEntry == null || aAnnotation.equals(oldEntry); + fs2IdIndex.put(aAnnotation, aId); + } + + public Integer getDisambiguationId(AnnotationFS aAnnotation) + { + return fs2IdIndex.get(aAnnotation); + } + + public AnnotationFS getDisambiguatedAnnotation(int aDisambiguationId) + { + return id2fsIndex.get(aDisambiguationId); + } + + public Set getDisambiguatedAnnotations() + { + return fs2IdIndex.keySet(); + } + + public AnnotationFS resolveReference(Type aType, String aId, + int aDisambiguationId) + { + AnnotationFS annotation; + // If there is a disambiguation ID then we can easily look up the annotation via the ID. + // A disambiguation ID of 0 used when a relation refers to a non-ambiguous target and + // it is handled in the second case. + if (aDisambiguationId > 0) { + annotation = getDisambiguatedAnnotation(aDisambiguationId); + if (annotation == null) { + throw new IllegalStateException("Unable to resolve reference to disambiguation ID [" + + aDisambiguationId + "]"); + } + } + // Otherwise, we'll have to go through the source unit. + else { + annotation = getUnit(aId).getUimaAnnotation(aType, 0); + if (annotation == null) { + throw new IllegalStateException( + "Unable to resolve reference to unambiguous annotation of type [" + + aType.getName() + "] in unit [" + aId + "]"); + } + } + + return annotation; + } + + public TsvFormatHeader getFormatHeader() + { + return format; + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvFormatHeader.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvFormatHeader.java new file mode 100644 index 0000000000..88944e22a9 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvFormatHeader.java @@ -0,0 +1,39 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +public class TsvFormatHeader +{ + private final String name; + private final String version; + + public TsvFormatHeader(String aName, String aVersion) + { + name = aName; + version = aVersion; + } + + public String getName() + { + return name; + } + + public String getVersion() + { + return version; + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSchema.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSchema.java new file mode 100644 index 0000000000..097a1498c7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSchema.java @@ -0,0 +1,225 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +import static org.apache.commons.lang3.StringUtils.compare; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XCasSchemaAnalyzer.isChainLayer; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XCasSchemaAnalyzer.isRelationLayer; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XCasSchemaAnalyzer.isSpanLayer; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.SLOT_ROLE; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.FeatureType.SLOT_TARGET; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.CHAIN; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.RELATION; +import static org.dkpro.core.io.webanno.tsv.internal.tsv3x.model.LayerType.SPAN; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; + +public class TsvSchema +{ + // Mind these here are define as the reverse of the respective contants in WebAnnoConst + // because here we want that "TARGET" identifies the FS which should be pointed to in + // the TSV. + public static final String FEAT_REL_TARGET = "Dependent"; + public static final String FEAT_REL_SOURCE = "Governor"; + + public static final String FEAT_SLOT_ROLE = "role"; + public static final String FEAT_SLOT_TARGET = "target"; + + public static final String CHAIN_FIRST_FEAT = "first"; + public static final String CHAIN_NEXT_FEAT = "next"; + public static final String COREFERENCE_RELATION_FEATURE = "referenceRelation"; + public static final String COREFERENCE_TYPE_FEATURE = "referenceType"; + + private List columns = new ArrayList<>(); + private Set chainHeadTypes = new HashSet<>(); + private Set ignoredTypes = new HashSet<>(); + private Map effectiveTypes = new HashMap<>(); + + public void ignoreType(Type aType) + { + ignoredTypes.add(aType); + } + + public Set getIgnoredTypes() + { + return ignoredTypes; + } + + public void addColumn(TsvColumn aColumn) + { + columns.add(aColumn); + } + + public List getColumns() + { + return columns; + } + + public List getColumns(Type aType) + { + return getColumns().stream() + .filter(c -> c.uimaType.equals(aType)) + // Sort by name so we get a stable column order even if type systems are merged + // in different orders, e.g. in unit tests. + .sorted((a, b) -> compare( + a.getUimaFeature() != null ? a.getUimaFeature().getShortName() : null, + b.getUimaFeature() != null ? b.getUimaFeature().getShortName() : null)) + .collect(Collectors.toList()); + } + + /** + * Returns the columns in the same order as they are in the TSV header. + * + * @param aActiveColumns + * columns for which actual annotations exist. + * @return the list of columns in the order as defined in the schema header. + */ + public List getHeaderColumns(Collection aActiveColumns) + { + List cols = new ArrayList<>(); + + // COMPATIBILITY NOTE + // We try to maintain the same order of columns as the WebAnnoTsv3Writer because the + // WebAnnoTsv3Reader needs that order. Our own reader does not rely on this order. + // - SPAN layers without slot features + // - SPAN layers with slot features + // - CHAIN layers + // - RELATION layers + List headerTypes = new ArrayList<>(); + headerTypes.addAll(getUimaTypes(SPAN, false)); + headerTypes.addAll(getUimaTypes(SPAN, true)); + headerTypes.addAll(getUimaTypes(CHAIN, false)); + headerTypes.addAll(getUimaTypes(RELATION, false)); + + for (Type type : headerTypes) { + List typeColumns = getColumns(type); + typeColumns.retainAll(aActiveColumns); + if (typeColumns.isEmpty()) { + continue; + } + + // Ensure that relation source columns come last. + { + TsvColumn relRefCol = null; + for (TsvColumn col : typeColumns) { + if (col.layerType.equals(RELATION) + && FEAT_REL_SOURCE.equals(col.uimaFeature.getShortName())) { + relRefCol = col; + continue; + } + + cols.add(col); + } + + if (relRefCol != null) { + cols.add(relRefCol); + } + } + } + + return cols; + } + + public Set getUimaTypes() + { + Set types = new LinkedHashSet<>(); + for (TsvColumn col : columns) { + types.add(col.uimaType); + } + return types; + } + + /** + * @param aSlotFeatures + * if {@code true}, returns only types with slot features, otherwise returns + * only types without slot features. + */ + private Set getUimaTypes(LayerType aLayerType, boolean aSlotFeatures) + { + Set types = new LinkedHashSet<>(); + for (TsvColumn col : columns) { + if (aLayerType.equals(col.layerType)) { + boolean hasSlotFeatures = columns.stream().anyMatch(c -> + c.uimaType.equals(col.uimaType) && + (SLOT_ROLE.equals(c.featureType) || SLOT_TARGET.equals(c.featureType))); + + if (hasSlotFeatures == aSlotFeatures) { + types.add(col.uimaType); + } + } + } + return types; + } + + public LayerType getLayerType(Type aType) + { + if (isRelationLayer(aType)) { + return LayerType.RELATION; + } + else if (isChainLayer(aType)) { + return LayerType.CHAIN; + } + else if (isSpanLayer(aType)) { + return LayerType.SPAN; + } + else { + return LayerType.INCOMPATIBLE; + } + } + + public void addChainHeadType(Type aType) + { + chainHeadTypes.add(aType); + } + + public Set getChainHeadTypes() + { + return chainHeadTypes; + } + + /** + * Locate a type which is known to the schema and which is equal to or a super-type of the type + * of the given {@link FeatureStructure}. If no such type is found, the actual type is returned. + * The results are cached for faster lookups. + * + * @param aFS + * a feature structure. + * @return the effective type. + */ + public Type getEffectiveType(FeatureStructure aFS) + { + TypeSystem typeSystem = aFS.getCAS().getTypeSystem(); + + return effectiveTypes.computeIfAbsent(aFS.getType(), type -> getUimaTypes().stream() + .filter(t -> typeSystem.subsumes(t, type)) + .findFirst() + .orElse(aFS.getType())); + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSentence.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSentence.java new file mode 100644 index 0000000000..6b17b6fc89 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSentence.java @@ -0,0 +1,86 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.List; + +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XSerializer; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class TsvSentence +{ + private final TsvDocument doc; + private final Sentence uimaSentence; + private final List tokens = new ArrayList<>(); + private final int position; + + public TsvSentence(TsvDocument aDoc, Sentence aUimaSentence, int aPosition) + { + doc = aDoc; + uimaSentence = aUimaSentence; + position = aPosition; + } + + public int getBegin() + { + return uimaSentence.getBegin(); + } + + public int getEnd() + { + return uimaSentence.getEnd(); + } + + public Sentence getUimaSentence() + { + return uimaSentence; + } + + public TsvToken createToken(Token aUimaToken) + { + TsvToken token = doc.createToken(this, aUimaToken, tokens.size() + 1); + token.addUimaAnnotation(aUimaToken); + tokens.add(token); + return token; + } + + public List getTokens() + { + return tokens; + } + + public int getPosition() + { + return position; + } + + @Override + public String toString() + { + StringWriter buf = new StringWriter(); + try (PrintWriter out = new PrintWriter(buf)) { + new Tsv3XSerializer().write(out, this, + doc.getSchema().getHeaderColumns(doc.getActiveColumns())); + } + return buf.toString(); + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSubToken.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSubToken.java new file mode 100644 index 0000000000..e6a16fd1e1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvSubToken.java @@ -0,0 +1,83 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +public class TsvSubToken extends TsvToken +{ + private final TsvToken token; + private final int begin; + private final int end; + + public TsvSubToken(TsvToken aToken, int aBegin, int aEnd) + { + super(aToken.getDocument(), aToken.getSentence(), aToken.getUimaToken(), + aToken.getPosition()); + token = aToken; + begin = aBegin; + end = aEnd; + } + + @Override + public int getBegin() + { + return begin; + } + + @Override + public int getEnd() + { + return end; + } + + @Override + public String getId() + { + return String.format("%s.%d", token.getId(), token.getSubTokens().indexOf(this) + 1); + } + + @Override + public int hashCode() + { + final int prime = 31; + int result = 1; + result = prime * result + begin; + result = prime * result + end; + return result; + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + TsvSubToken other = (TsvSubToken) obj; + if (begin != other.begin) { + return false; + } + if (end != other.end) { + return false; + } + return true; + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvToken.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvToken.java new file mode 100644 index 0000000000..cfc32a966a --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvToken.java @@ -0,0 +1,61 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +import java.util.ArrayList; +import java.util.List; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class TsvToken extends TsvUnit +{ + private List subTokens = new ArrayList<>(); + + public TsvToken(TsvDocument aDoc, TsvSentence aSentence, Token aUimaToken, int aPosition) + { + super(aDoc, aSentence, aUimaToken, aPosition); + } + + /** + * Creates a new sub-token if there is not already a sub-token with the same offsets. Otherwise, + * it returns the existing sub-token. + * + * @param aBegin + * begin offset. + * @param aEnd + * end offset. + * @return the new sub-token. + */ + public TsvSubToken createSubToken(int aBegin, int aEnd) + { + TsvSubToken subToken = new TsvSubToken(this, aBegin, aEnd); + int existingIndex = subTokens.indexOf(subToken); + if (existingIndex > -1) { + subToken = subTokens.get(existingIndex); + } + else { + subTokens.add(subToken); + } + + return subToken; + } + + public List getSubTokens() + { + return subTokens; + } +} diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvUnit.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvUnit.java new file mode 100644 index 0000000000..1d36c461f7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/model/TsvUnit.java @@ -0,0 +1,178 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.dkpro.core.io.webanno.tsv.internal.tsv3x.model; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.dkpro.core.io.webanno.tsv.internal.tsv3x.Tsv3XSerializer; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public abstract class TsvUnit +{ + private TsvDocument doc; + private TsvSentence sentence; + private int position; + private final Token uimaToken; + private Map> uimaAnnotations = new LinkedHashMap<>(); + + public TsvUnit(TsvDocument aDoc, TsvSentence aSentence, Token aUimaToken, int aPosition) + { + doc = aDoc; + sentence = aSentence; + uimaToken = aUimaToken; + position = aPosition; + } + + public int getBegin() + { + return uimaToken.getBegin(); + } + + public int getEnd() + { + return uimaToken.getEnd(); + } + + /** + * Get all annotations for the current column. Mind that there can be multiple columns per + * annotation. Also mind that in order to the get value for the column, it needs to be still + * retrieved from the corresponding feature of the returned annotations. + */ + public List getAnnotationsForColumn(TsvColumn aCol) + { + return uimaAnnotations.getOrDefault(aCol.uimaType, Collections.emptyList()); + } + + public TsvSentence getSentence() + { + return sentence; + } + + public TsvDocument getDocument() + { + return doc; + } + + /** + * Add an UIMA annotation which overlaps with the current token in any way. It could also + * completely contain or exactly match the token boundaries. + * + * @param aFS + * an UIMA annotation. + */ + public void addUimaAnnotation(AnnotationFS aFS) + { + addUimaAnnotation(aFS, false); + } + + /** + * Add an UIMA annotation which overlaps with the current token in any way. It could also + * completely contain or exactly match the token boundaries. + * + * @param aFS + * an UIMA annotation. + */ + public void addUimaAnnotation(AnnotationFS aFS, boolean aAddDisambiguationIfStacked) + { + Type effectiveType = getDocument().getSchema().getEffectiveType(aFS); + uimaAnnotations.putIfAbsent(effectiveType, new ArrayList<>()); + + List annotations = uimaAnnotations.get(effectiveType); + + // If we already have annotations of this type, then we need to add disambiguation IDs. + boolean alreadyHaveAnnotationsOfSameType = !annotations.isEmpty(); + + // Add to the list only if necessary, i.e. only on the first column in which we encounter + // this annotation. If it has multiple features, there may also be subsequent columns + // for the same annotation and we do not want to add it again and again. + // The position of the annotation in the per-type list corresponds to its stacking ID. + boolean hasBeenAdded = false; + if (!annotations.contains(aFS)) { + annotations.add(aFS); + hasBeenAdded = true; + } + + // Add disambiguation IDs if annotations are stacked + if (aAddDisambiguationIfStacked && hasBeenAdded && alreadyHaveAnnotationsOfSameType) { + annotations.forEach(doc::addDisambiguationId); + } + } + + /** + * @param aUimaType + * an UIMA annotation type. + * @param aStackingIndex + * the stacking index if there are multiple annotations of the same type on the + * current unit. + */ + public AnnotationFS getUimaAnnotation(Type aUimaType, int aStackingIndex) + { + List annotations = uimaAnnotations.get(aUimaType); + if (annotations != null && annotations.size() > aStackingIndex) { + return annotations.get(aStackingIndex); + } + else { + return null; + } + } + + public List getUimaAnnotations(Type aUimaType) + { + return uimaAnnotations.get(aUimaType); + } + + public List getUimaTypes() + { + return uimaAnnotations.keySet().stream().collect(Collectors.toList()); + } + + public Token getUimaToken() + { + return uimaToken; + } + + public int getPosition() + { + return position; + } + + public String getId() + { + return String.format("%d-%d", sentence.getPosition(), position); + } + + @Override + public String toString() + { + StringWriter buf = new StringWriter(); + try (PrintWriter out = new PrintWriter(buf)) { + new Tsv3XSerializer().write(out, this, + doc.getSchema().getHeaderColumns(doc.getActiveColumns())); + } + return buf.toString(); + } +} diff --git a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java new file mode 100644 index 0000000000..a479e35154 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java @@ -0,0 +1,2006 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.FSUtil.setFeature; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.junit.Assert.assertEquals; +import static org.junit.Assume.assumeFalse; +import static org.junit.Assume.assumeTrue; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.IntStream; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.factory.ConfigurationParameterFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.fit.util.FSCollectionFactory; +import org.apache.uima.fit.util.FSUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.apache.uima.util.CasCreationUtils; +import org.dkpro.core.io.xmi.XmiWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import webanno.custom.Span; + +public abstract class WebAnnoTsv3WriterTestBase +{ + protected abstract AnalysisEngineDescription makeWriter() + throws ResourceInitializationException; + + protected abstract String getSuiteName() + throws ResourceInitializationException; + + protected abstract boolean isKnownToFail(String aMethodName); + + @Test + public void testTokenAttachedAnnotationsWithValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + + Lemma l1 = new Lemma(jcas, t1.getBegin(), t1.getEnd()); + l1.setValue("lemma1"); + l1.addToIndexes(); + t1.setLemma(l1); + + MorphologicalFeatures m1 = new MorphologicalFeatures(jcas, t1.getBegin(), t1.getEnd()); + m1.setValue("morph"); + m1.setTense("tense1"); + m1.addToIndexes(); + t1.setMorph(m1); + + POS p1 = new POS(jcas, t1.getBegin(), t1.getEnd()); + p1.setPosValue("pos1"); + p1.addToIndexes(); + t1.setPos(p1); + + Stem s1 = new Stem(jcas, t1.getBegin(), t1.getEnd()); + s1.setValue("stem1"); + s1.addToIndexes(); + t1.setStem(s1); + + writeAndAssertEquals(jcas, WebannoTsv3Writer.PARAM_SPAN_LAYERS, + asList(MorphologicalFeatures.class, POS.class, Lemma.class, Stem.class)); + } + + @Test + public void testDependencyWithValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + + POS p1 = new POS(jcas, t1.getBegin(), t1.getEnd()); + p1.setPosValue("POS1"); + p1.addToIndexes(); + t1.setPos(p1); + + POS p2 = new POS(jcas, t2.getBegin(), t2.getEnd()); + p2.setPosValue("POS2"); + p2.addToIndexes(); + t2.setPos(p2); + + Dependency dep1 = new Dependency(jcas); + dep1.setGovernor(t1); + dep1.setDependent(t2); + // WebAnno legacy conventions + // dep1.setBegin(min(dep1.getDependent().getBegin(), dep1.getGovernor().getBegin())); + // dep1.setEnd(max(dep1.getDependent().getEnd(), dep1.getGovernor().getEnd())); + // DKPro Core conventions + dep1.setBegin(dep1.getDependent().getBegin()); + dep1.setEnd(dep1.getDependent().getEnd()); + dep1.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(POS.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList(Dependency.class)); + } + + @Test + public void testZeroLengthSpansWithoutFeatureValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + // One at the beginning + new Span(jcas, 0, 0).addToIndexes(); + + // One at the end + new Span(jcas, jcas.getDocumentText().length(), jcas.getDocumentText().length()) + .addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testZeroLengthSpansWithFeatureValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + // One at the beginning + Span ne1 = new Span(jcas, 0, 0); + ne1.setValue("PERSON"); + ne1.addToIndexes(); + + // One at the end + Span ne2 = new Span(jcas, jcas.getDocumentText().length(), + jcas.getDocumentText().length()); + ne2.setValue("ORG"); + ne2.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testZeroLengthSpansWithoutFeatures() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + CAS cas = jcas.getCas(); + + Type simpleSpanType = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + + // One at the beginning + AnnotationFS fs1 = cas.createAnnotation(simpleSpanType, 0, 0); + cas.addFsToIndexes(fs1); + + // One at the end + AnnotationFS fs2 = cas.createAnnotation(simpleSpanType, jcas.getDocumentText().length(), + jcas.getDocumentText().length()); + cas.addFsToIndexes(fs2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testZeroLengthSpanBetweenAdjacentTokens() throws Exception + { + JCas jcas = makeJCas(); + jcas.setDocumentText("word."); + new Token(jcas, 0,4).addToIndexes(); + new Token(jcas, 4,5).addToIndexes(); + new Sentence(jcas, 0,5).addToIndexes(); + + CAS cas = jcas.getCas(); + Type simpleSpanType = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + + // Insert zero-width annotation between the adjacent tokens (at end of first token). + AnnotationFS fs1a = cas.createAnnotation(simpleSpanType, 4, 4); + cas.addFsToIndexes(fs1a); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testZeroLengthStackedSpansWithoutFeatures() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + CAS cas = jcas.getCas(); + + Type simpleSpanType = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + + // Two at the beginning + AnnotationFS fs1 = cas.createAnnotation(simpleSpanType, 0, 0); + cas.addFsToIndexes(fs1); + AnnotationFS fs2 = cas.createAnnotation(simpleSpanType, 0, 0); + cas.addFsToIndexes(fs2); + + // Two at the end + AnnotationFS fs3 = cas.createAnnotation(simpleSpanType, jcas.getDocumentText().length(), + jcas.getDocumentText().length()); + cas.addFsToIndexes(fs3); + AnnotationFS fs4 = cas.createAnnotation(simpleSpanType, jcas.getDocumentText().length(), + jcas.getDocumentText().length()); + cas.addFsToIndexes(fs4); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testTokenBoundedSpanWithFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + int n = 0; + for (Token t : select(jcas, Token.class)) { + Span ne = new Span(jcas, t.getBegin(), t.getEnd()); + ne.setValue("NE " + n); + ne.addToIndexes(); + n++; + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTokenBoundedStackedSpanWithFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + for (Token t : select(jcas, Token.class)) { + Span ne1 = new Span(jcas, t.getBegin(), t.getEnd()); + ne1.setValue("NE"); + ne1.addToIndexes(); + + Span ne2 = new Span(jcas, t.getBegin(), t.getEnd()); + ne2.setValue("NE"); + ne2.addToIndexes(); + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTokenBoundedSpanWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + for (Token t : select(jcas, Token.class)) { + Span ne = new Span(jcas, t.getBegin(), t.getEnd()); + ne.addToIndexes(); + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTokenBoundedSpanWithNastyFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + for (Token t : select(jcas, Token.class)) { + Span ne = new Span(jcas, t.getBegin(), t.getEnd()); + ne.setValue("de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity:value"); + ne.addToIndexes(); + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTokenBoundedSpanWithUnderscoreFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + for (Token t : select(jcas, Token.class)) { + Span ne = new Span(jcas, t.getBegin(), t.getEnd()); + ne.setValue("_"); + ne.addToIndexes(); + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTokenBoundedSpanWithAsteriskFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + for (Token t : select(jcas, Token.class)) { + Span ne = new Span(jcas, t.getBegin(), t.getEnd()); + ne.setValue("*"); + ne.addToIndexes(); + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testSingleTokenWithoutFeatureValue() + throws Exception + { + JCas jCas = makeJCasOneSentence(); + Span neToken = new Span(jCas, 0, 4); + neToken.addToIndexes(); + + writeAndAssertEquals(jCas, WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTokenBoundedBioLookAlike() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + int n = 0; + for (Token t : select(jcas, Token.class)) { + Span ne = new Span(jcas, t.getBegin(), t.getEnd()); + ne.setValue(((n == 0) ? "B-" : "I-") + "NOTBIO!"); + ne.addToIndexes(); + n++; + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTokenBoundedStackedLookAlike() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + int n = 0; + for (Token t : select(jcas, Token.class)) { + Span ne = new Span(jcas, t.getBegin(), t.getEnd()); + ne.setValue("NOTSTACKED[" + n + "]"); + ne.addToIndexes(); + n++; + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTokenBoundedSpanWithSpecialSymbolsValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + for (Token t : select(jcas, Token.class)) { + Span ne = new Span(jcas, t.getBegin(), t.getEnd()); + ne.setValue("#*'\"`´\t:;{}|[ ]()\\§$%?=&_\n"); + ne.addToIndexes(); + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testMultiTokenSpanWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + Span ne = new Span(jcas, 0, jcas.getDocumentText().length()); + ne.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testSubMultiTokenSpanWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + Span ne1 = new Span(jcas, 0, 6); + ne1.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testSubMultiTokenSpanWithoutFeatureValue2() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + Span ne1 = new Span(jcas, 1, 6); + ne1.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testSubMultiTokenSpanWithoutFeatureValue3() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + Span ne1 = new Span(jcas, 1, 6); + ne1.addToIndexes(); + Span ne2 = new Span(jcas, 6, 12); + ne2.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testSubMultiTokenSpanWithFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence("aaaaaa bbbbbb cccccc"); + assertEquals(asList("aaaaaa", "bbbbbb", "cccccc"), toText(select(jcas, Token.class))); + + // 1111111111 + // 01234567890123456789 + // -------------------- + // aaaaaa bbbbbb cccccc + // 1 ------ - single token + // 2 ------+------ - multi-token + // 3 -- - inside token + // 4 ---- - token prefix + // 5 ---- - token suffix + // 6 ---+------ - multi-token prefix + // 7 ------+--- - multi-token suffix + // 8 ---+--- - multi-token prefix + suffix + // 9 ---+------+--- - multi-token prefix + full + suffix + // 10 | - zero-span inside token + // 11 | - zero-span beginning of token + // 12 | - zero-span end of token + + List annotations = new ArrayList<>(); + annotations.add(new Span(jcas, 0, 6)); // 1 + annotations.add(new Span(jcas, 0, 13)); // 2 + annotations.add(new Span(jcas, 9, 11)); // 3 + annotations.add(new Span(jcas, 7, 11)); // 4 + annotations.add(new Span(jcas, 9, 13)); // 5 + annotations.add(new Span(jcas, 3, 13)); // 6 + annotations.add(new Span(jcas, 0, 10)); // 7 + annotations.add(new Span(jcas, 3, 10)); // 8 + annotations.add(new Span(jcas, 3, 17)); // 9 + annotations.add(new Span(jcas, 10, 10)); // 10 + annotations.add(new Span(jcas, 7, 7)); // 11 + annotations.add(new Span(jcas, 13, 13)); // 12 + IntStream.range(0, annotations.size()).forEach(idx -> { + Span ne = annotations.get(idx); + ne.setValue(String.valueOf(idx + 1)); + ne.addToIndexes(); + }); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testStackedSubMultiTokenSpanWithFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence("aaaaaa bbbbbb cccccc"); + assertEquals(asList("aaaaaa", "bbbbbb", "cccccc"), toText(select(jcas, Token.class))); + + // 1111111111 + // 01234567890123456789 + // -------------------- + // aaaaaa bbbbbb cccccc + // 1 ------ - single token + // 2 ------+------ - multi-token + // 3 -- - inside token + // 4 ---- - token prefix + // 5 ---- - token suffix + // 6 ---+------ - multi-token prefix + // 7 ------+--- - multi-token suffix + // 8 ---+--- - multi-token prefix + suffix + // 9 ---+------+--- - multi-token prefix + full + suffix + // 10 | - zero-span inside token + // 11 | - zero-span beginning of token + // 12 | - zero-span end of token + + List annotations = new ArrayList<>(); + annotations.add(new Span(jcas, 0, 6)); // 1 + annotations.add(new Span(jcas, 0, 6)); // 1 + annotations.add(new Span(jcas, 0, 13)); // 2 + annotations.add(new Span(jcas, 0, 13)); // 2 + annotations.add(new Span(jcas, 9, 10)); // 3 + annotations.add(new Span(jcas, 9, 10)); // 3 + annotations.add(new Span(jcas, 7, 10)); // 4 + annotations.add(new Span(jcas, 7, 10)); // 4 + annotations.add(new Span(jcas, 9, 13)); // 5 + annotations.add(new Span(jcas, 9, 13)); // 5 + annotations.add(new Span(jcas, 3, 13)); // 6 + annotations.add(new Span(jcas, 3, 13)); // 6 + annotations.add(new Span(jcas, 0, 10)); // 7 + annotations.add(new Span(jcas, 0, 10)); // 7 + annotations.add(new Span(jcas, 3, 10)); // 8 + annotations.add(new Span(jcas, 3, 10)); // 8 + annotations.add(new Span(jcas, 3, 17)); // 9 + annotations.add(new Span(jcas, 3, 17)); // 9 + annotations.add(new Span(jcas, 10, 10)); // 10 + annotations.add(new Span(jcas, 10, 10)); // 10 + annotations.add(new Span(jcas, 7, 7)); // 11 + annotations.add(new Span(jcas, 7, 7)); // 11 + annotations.add(new Span(jcas, 13, 13)); // 12 + annotations.add(new Span(jcas, 13, 13)); // 12 + IntStream.range(0, annotations.size()).forEach(idx -> { + Span ne = annotations.get(idx); + ne.setValue(String.valueOf((idx / 2) + 1) + (idx % 2 == 0 ? "a" : "b")); + ne.addToIndexes(); + }); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testMultiTokenStackedSpanWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + Span ne1 = new Span(jcas, 0, jcas.getDocumentText().length()); + ne1.addToIndexes(); + + Span ne2 = new Span(jcas, 0, jcas.getDocumentText().length()); + ne2.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testMultiTokenSpanWithFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + Span ne = new Span(jcas, 0, jcas.getDocumentText().length()); + ne.setValue("PERSON"); + ne.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testMultiTokenStackedSpanWithFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + + Span ne1 = new Span(jcas, 0, jcas.getDocumentText().length()); + ne1.setValue("PERSON"); + ne1.addToIndexes(); + + Span ne2 = new Span(jcas, 0, jcas.getDocumentText().length()); + ne2.setValue("LOCATION"); + ne2.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testCrossSentenceSpanWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasTwoSentences(); + + Span ne = new Span(jcas, 0, jcas.getDocumentText().length()); + ne.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testCrossSentenceSpanWithFeatureValue() throws Exception + { + JCas jcas = makeJCasTwoSentences(); + + Span ne = new Span(jcas, 0, jcas.getDocumentText().length()); + ne.setValue("PERSON"); + ne.addToIndexes(); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testSingleTokenRelationWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token gov = tokens.get(0); + Token dep = tokens.get(tokens.size() - 1); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + + @Test + public void testSingleNonTokenRelationWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(tokens.size() - 1); + + Span gov = new Span(jcas, t1.getBegin(), t1.getEnd()); + gov.addToIndexes(); + Span dep = new Span(jcas, t2.getBegin(), t2.getEnd()); + dep.addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + + @Test + public void testSingleStackedNonTokenRelationWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(tokens.size() - 1); + + Span gov = new Span(jcas, t1.getBegin(), t1.getEnd()); + gov.addToIndexes(); + new Span(jcas, t1.getBegin(), t1.getEnd()).addToIndexes(); + + Span dep = new Span(jcas, t2.getBegin(), t2.getEnd()); + dep.addToIndexes(); + new Span(jcas, t2.getBegin(), t2.getEnd()).addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + + @Test + public void testSingleStackedNonTokenRelationWithoutFeatureValue2() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(tokens.size() - 1); + + Span gov = new Span(jcas, t1.getBegin(), t1.getEnd()); + gov.addToIndexes(); + + Span dep = new Span(jcas, t2.getBegin(), t2.getEnd()); + dep.addToIndexes(); + new Span(jcas, t2.getBegin(), t2.getEnd()).addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + + @Test + public void testSingleStackedNonTokenRelationWithoutFeatureValue3() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(tokens.size() - 1); + + Span gov = new Span(jcas, t1.getBegin(), t1.getEnd()); + gov.addToIndexes(); + new Span(jcas, t1.getBegin(), t1.getEnd()).addToIndexes(); + + Span dep = new Span(jcas, t2.getBegin(), t2.getEnd()); + dep.addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + @Test + public void testSingleStackedNonTokenOverlappingRelationWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(tokens.size() - 1); + + Span gov = new Span(jcas, t1.getBegin(), t2.getEnd()); + gov.addToIndexes(); + new Span(jcas, t1.getBegin(), t2.getEnd()).addToIndexes(); + + Span dep = new Span(jcas, t2.getBegin(), t2.getEnd()); + dep.addToIndexes(); + new Span(jcas, t2.getBegin(), t2.getEnd()).addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + + @Test + public void testSingleNonTokenRelationWithoutFeature() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(tokens.size() - 1); + + Span gov = new Span(jcas, t1.getBegin(), t1.getEnd()); + gov.addToIndexes(); + Span dep = new Span(jcas, t2.getBegin(), t2.getEnd()); + dep.addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.SimpleRelation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.SimpleRelation")); + } + + @Test + public void testSingleNonMultiTokenRelationWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + Token t4 = tokens.get(3); + + Span gov = new Span(jcas, t1.getBegin(), t2.getEnd()); + gov.addToIndexes(); + Span dep = new Span(jcas, t3.getBegin(), t4.getEnd()); + dep.addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + + @Test + public void testSingleNonMultiTokenRelationWithMultipleFeatureValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + Token t4 = tokens.get(3); + + Span gov = new Span(jcas, t1.getBegin(), t2.getEnd()); + gov.addToIndexes(); + Span dep = new Span(jcas, t3.getBegin(), t4.getEnd()); + dep.addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.ComplexRelation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + FSUtil.setFeature(fs1, "value", "nsubj"); + FSUtil.setFeature(fs1, "boolValue", true); + FSUtil.setFeature(fs1, "integerValue", 42); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.ComplexRelation")); + } + + @Test + public void testStackedNonMultiTokenRelationWithMultipleFeatureValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + Token t4 = tokens.get(3); + + Span gov = new Span(jcas, t1.getBegin(), t2.getEnd()); + gov.addToIndexes(); + Span dep = new Span(jcas, t3.getBegin(), t4.getEnd()); + dep.addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.ComplexRelation"); + + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + FSUtil.setFeature(fs1, "value", "nsubj"); + FSUtil.setFeature(fs1, "boolValue", true); + FSUtil.setFeature(fs1, "integerValue", 42); + cas.addFsToIndexes(fs1); + + // WebAnno legacy conventions + // AnnotationFS fs2 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs2 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs2, "Governor", gov); + FSUtil.setFeature(fs2, "Dependent", dep); + FSUtil.setFeature(fs2, "value", "obj"); + FSUtil.setFeature(fs2, "boolValue", false); + FSUtil.setFeature(fs2, "integerValue", 43); + cas.addFsToIndexes(fs2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.ComplexRelation")); + } + @Ignore("Relations between different layers not supported in WebAnno TSV 3 atm") + @Test + public void testSingleMixedRelationWithoutFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token gov = tokens.get(0); + + Token t2 = tokens.get(tokens.size() - 1); + Span dep = new Span(jcas, t2.getBegin(), t2.getEnd()); + dep.addToIndexes(); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class), + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + + @Test + public void testSingleTokenRelationWithFeatureValue() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token gov = tokens.get(0); + Token dep = tokens.get(tokens.size() - 1); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + FSUtil.setFeature(fs1, "value", "nsubj"); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.Relation")); + } + + @Test + public void testSingleTokenRelationWithMultipleFeatureValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token gov = tokens.get(0); + Token dep = tokens.get(tokens.size() - 1); + + Type relationType = cas.getTypeSystem().getType("webanno.custom.ComplexRelation"); + + // One at the beginning + // WebAnno legacy conventions + // AnnotationFS fs1 = cas.createAnnotation(relationType, + // min(dep.getBegin(), gov.getBegin()), + // max(dep.getEnd(), gov.getEnd())); + // DKPro Core conventions + AnnotationFS fs1 = cas.createAnnotation(relationType, dep.getBegin(), dep.getEnd()); + FSUtil.setFeature(fs1, "Governor", gov); + FSUtil.setFeature(fs1, "Dependent", dep); + FSUtil.setFeature(fs1, "value", "nsubj"); + FSUtil.setFeature(fs1, "boolValue", true); + FSUtil.setFeature(fs1, "integerValue", 42); + cas.addFsToIndexes(fs1); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.ComplexRelation")); + } + + @Test + public void testSimpleSlotFeature() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, "p1", s2); + FeatureStructure link2 = makeLinkFS(jcas, "p2", s3); + + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testUnsetSlotFeature() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + makeLinkHostFS(jcas, "webanno.custom.FlexLinkHost", t1.getBegin(), t1.getEnd(), + (FeatureStructure[]) null); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.FlexLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.FlexLinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testSimpleSlotFeatureWithoutValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, null, s2); + FeatureStructure link2 = makeLinkFS(jcas, null, s3); + + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testStackedSimpleSlotFeatureWithoutValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + { + FeatureStructure link1 = makeLinkFS(jcas, null, s2); + FeatureStructure link2 = makeLinkFS(jcas, null, s3); + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + } + + { + FeatureStructure link1 = makeLinkFS(jcas, null, s2); + FeatureStructure link2 = makeLinkFS(jcas, null, s3); + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + @Test + public void testSimpleSameRoleSlotFeature() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, "p1", s2); + FeatureStructure link2 = makeLinkFS(jcas, "p1", s3); + + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testComplexSlotFeatureWithoutValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, "webanno.custom.ComplexLinkType", null, s2); + FeatureStructure link2 = makeLinkFS(jcas, "webanno.custom.ComplexLinkType", null, s3); + + makeLinkHostFS(jcas, "webanno.custom.ComplexLinkHost", t1.getBegin(), t1.getEnd(), link1, + link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.ComplexLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.ComplexLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.ComplexLinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testStackedComplexSlotFeatureWithoutValues() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + { + FeatureStructure link1 = makeLinkFS(jcas, "webanno.custom.ComplexLinkType", null, s2); + FeatureStructure link2 = makeLinkFS(jcas, "webanno.custom.ComplexLinkType", null, s3); + makeLinkHostFS(jcas, "webanno.custom.ComplexLinkHost", t1.getBegin(), t1.getEnd(), + link1, link2); + } + + { + FeatureStructure link1 = makeLinkFS(jcas, "webanno.custom.ComplexLinkType", null, s2); + FeatureStructure link2 = makeLinkFS(jcas, "webanno.custom.ComplexLinkType", null, s3); + makeLinkHostFS(jcas, "webanno.custom.ComplexLinkHost", t1.getBegin(), t1.getEnd(), + link1, link2); + } + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.ComplexLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.ComplexLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.ComplexLinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testStackedComplexSlotFeatureWithoutSlotFillers() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + AnnotationFS host1 = makeLinkHostFS(jcas, "webanno.custom.ComplexLinkHost", t1.getBegin(), + t1.getEnd()); + setFeature(host1, "value", "val1"); + + AnnotationFS host2 = makeLinkHostFS(jcas, "webanno.custom.ComplexLinkHost", t1.getBegin(), + t1.getEnd()); + setFeature(host2, "value", "val2"); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.ComplexLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.ComplexLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.ComplexLinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testSimpleCrossSenenceSlotFeature() throws Exception + { + JCas jcas = makeJCasTwoSentences(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(6); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t2.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, "p1", s2); + FeatureStructure link2 = makeLinkFS(jcas, "p2", s3); + + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testMultiTokenSlotFeature() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + Token t4 = tokens.get(3); + Token t5 = tokens.get(4); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t4.getBegin(), t5.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, "p1", s2); + FeatureStructure link2 = makeLinkFS(jcas, "p2", s3); + + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testMultiTokenStackedSlotFeature() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t2.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, "p1", s2); + FeatureStructure link2 = makeLinkFS(jcas, "p2", s3); + + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testZeroLengthSlotFeature1() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t2.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, "p1", s2); + FeatureStructure link2 = makeLinkFS(jcas, "p2", s3); + + makeLinkHostFS(jcas, t1.getBegin(), t1.getBegin(), link1, link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testZeroLengthSlotFeature2() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type type = cas.getTypeSystem().getType("webanno.custom.SimpleSpan"); + AnnotationFS s2 = cas.createAnnotation(type, t2.getBegin(), t3.getEnd()); + cas.addFsToIndexes(s2); + AnnotationFS s3 = cas.createAnnotation(type, t3.getEnd(), t3.getEnd()); + cas.addFsToIndexes(s3); + + FeatureStructure link1 = makeLinkFS(jcas, "p1", s2); + FeatureStructure link2 = makeLinkFS(jcas, "p2", s3); + + makeLinkHostFS(jcas, t1.getBegin(), t1.getEnd(), link1, link2); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), + WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList("webanno.custom.SimpleSpan", + "webanno.custom.SimpleLinkHost"), + WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), + WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan")); + } + + @Test + public void testSimpleChain() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type head = cas.getTypeSystem().getType("webanno.custom.SimpleChain"); + Type link = cas.getTypeSystem().getType("webanno.custom.SimpleLink"); + + makeChainHead(head, + makeChainLink(link, cas, t1.getBegin(), t1.getEnd(), null, null, + makeChainLink(link, cas, t2.getBegin(), t2.getEnd(), null, null, + makeChainLink(link, cas, t3.getBegin(), t3.getEnd(), null, null, null)))); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_CHAIN_LAYERS, asList("webanno.custom.Simple")); + } + + @Test + public void testMultiTokenChain() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + Token t4 = tokens.get(3); + + Type head = cas.getTypeSystem().getType("webanno.custom.SimpleChain"); + Type link = cas.getTypeSystem().getType("webanno.custom.SimpleLink"); + + makeChainHead(head, + makeChainLink(link, cas, t1.getBegin(), t2.getEnd(), null, null, + makeChainLink(link, cas, t3.getBegin(), t4.getEnd(), null, null, null))); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_CHAIN_LAYERS, asList("webanno.custom.Simple")); + } + @Test + public void testStackedChain() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t3 = tokens.get(2); + + Type head = cas.getTypeSystem().getType("webanno.custom.SimpleChain"); + Type link = cas.getTypeSystem().getType("webanno.custom.SimpleLink"); + + makeChainHead(head, + makeChainLink(link, cas, t1.getBegin(), t1.getEnd(), null, null, + makeChainLink(link, cas, t2.getBegin(), t2.getEnd(), null, null, + makeChainLink(link, cas, t3.getBegin(), t3.getEnd(), null, null, null)))); + + makeChainHead(head, + makeChainLink(link, cas, t3.getBegin(), t3.getEnd(), null, null, + makeChainLink(link, cas, t2.getBegin(), t2.getEnd(), null, null, + makeChainLink(link, cas, t1.getBegin(), t1.getEnd(), null, null, null)))); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_CHAIN_LAYERS, asList("webanno.custom.Simple")); + } + + @Test + public void testSubtokenChain() throws Exception + { + JCas jcas = makeJCasOneSentence(); + CAS cas = jcas.getCas(); + + List tokens = new ArrayList<>(select(jcas, Token.class)); + + Token t1 = tokens.get(0); + Token t2 = tokens.get(1); + Token t4 = tokens.get(3); + + Type head = cas.getTypeSystem().getType("webanno.custom.SimpleChain"); + Type link = cas.getTypeSystem().getType("webanno.custom.SimpleLink"); + + makeChainHead(head, + makeChainLink(link, cas, t1.getBegin() + 1, t1.getEnd() - 1, null, null, + makeChainLink(link, cas, t2.getBegin() + 1, t2.getEnd() - 1, null, null, + makeChainLink(link, cas, t4.getBegin() + 1, t4.getEnd() - 1, null, null, null)))); + + writeAndAssertEquals(jcas, + WebannoTsv3Writer.PARAM_CHAIN_LAYERS, asList("webanno.custom.Simple")); + } + + + @Test + public void testSentenceWithLineBreak() throws Exception + { + JCas jcas = makeJCasOneSentence("This is\na test ."); + + Span neToken = new Span(jcas, 0, 4); + neToken.addToIndexes(); + + writeAndAssertEquals(jcas, WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testSentenceWithTab() throws Exception + { + JCas jcas = makeJCasOneSentence("This is\ta test ."); + + Span neToken = new Span(jcas, 0, 4); + neToken.addToIndexes(); + + writeAndAssertEquals(jcas, WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testSentenceWithEmoji() throws Exception + { + JCas jcas = makeJCasOneSentence("I like it 😊 ."); + + Span neToken = new Span(jcas, 10, 12); + neToken.addToIndexes(); + + writeAndAssertEquals(jcas, WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(Span.class)); + } + + @Test + public void testTwoSentencesWithNoSpaceInBetween() throws Exception + { + TypeSystemDescription global = TypeSystemDescriptionFactory.createTypeSystemDescription(); + TypeSystemDescription local = TypeSystemDescriptionFactory + .createTypeSystemDescriptionFromPath( + "src/test/resources/desc/type/webannoTestTypes.xml"); + + TypeSystemDescription merged = CasCreationUtils.mergeTypeSystems(asList(global, local)); + + JCas jcas = JCasFactory.createJCas(merged); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("onetwo"); + new Token(jcas, 0, 3).addToIndexes(); + new Sentence(jcas, 0, 3).addToIndexes(); + new Token(jcas, 3, 6).addToIndexes(); + new Sentence(jcas, 3, 6).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testAnnotationWithTrailingWhitespace() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("one two"); + new Token(jcas, 0, 3).addToIndexes(); + new Token(jcas, 5, 8).addToIndexes(); + new Sentence(jcas, 0, 8).addToIndexes(); + + // NE has trailing whitespace - on export this should be silently dropped + new NamedEntity(jcas, 0, 4).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testAnnotationWithTrailingWhitespaceAtEnd() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("one two "); + new Token(jcas, 0, 3).addToIndexes(); + new Token(jcas, 4, 7).addToIndexes(); + new Sentence(jcas, 0, 7).addToIndexes(); + + // NE has trailing whitespace - on export this should be silently dropped + new NamedEntity(jcas, 4, 8).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testAnnotationWithLeadingWhitespaceAtStart() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText(" one two"); + new Token(jcas, 1, 4).addToIndexes(); + new Token(jcas, 5, 8).addToIndexes(); + new Sentence(jcas, 1, 8).addToIndexes(); + + // NE has leading whitespace - on export this should be silently dropped + new NamedEntity(jcas, 0, 4).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testAnnotationWithLeadingWhitespace() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("one two"); + new Token(jcas, 0, 3).addToIndexes(); + new Token(jcas, 5, 8).addToIndexes(); + new Sentence(jcas, 0, 8).addToIndexes(); + + // NE has leading whitespace - on export this should be silently dropped + new NamedEntity(jcas, 4, 8).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testZeroWidthAnnotationBetweenTokenIsMovedToEndOfPreviousToken() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("one two"); + new Token(jcas, 0, 3).addToIndexes(); + new Token(jcas, 5, 8).addToIndexes(); + new Sentence(jcas, 0, 8).addToIndexes(); + + // NE is after the end of the last token and should be moved to the end of the last token + // otherwise it could not be represented in the TSV3 format. + new NamedEntity(jcas, 4, 4).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testZeroWidthAnnotationBeyondLastTokenIsMovedToEndOfLastToken() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("one two "); + new Token(jcas, 0, 3).addToIndexes(); + new Token(jcas, 4, 7).addToIndexes(); + new Sentence(jcas, 0, 7).addToIndexes(); + + // NE is after the end of the last token and should be moved to the end of the last token + // otherwise it could not be represented in the TSV3 format. + new NamedEntity(jcas, 8, 8).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testZeroWidthAnnotationBeforeFirstTokenIsMovedToBeginOfFirstToken() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText(" one two"); + new Token(jcas, 2, 5).addToIndexes(); + new Token(jcas, 6, 9).addToIndexes(); + new Sentence(jcas, 2, 9).addToIndexes(); + + // NE is after the end of the last token and should be moved to the end of the last token + // otherwise it could not be represented in the TSV3 format. + new NamedEntity(jcas, 1, 1).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + @Test + public void testElevatedType() throws Exception { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("John"); + + // Add an elevated type which is not a direct subtype of Annotation. This type not be picked + // up by the schema analyzer but should still be serialized as the POS type which is in fact + // picked up. + POS_NOUN pos = new POS_NOUN(jcas, 0, 4); + pos.setPosValue("NN"); + pos.setCoarseValue("NOUN"); + pos.addToIndexes(); + + Token t = new Token(jcas, 0, 4); + t.setPos(pos); + t.addToIndexes(); + new Sentence(jcas, 0, 4).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + private void writeAndAssertEquals(JCas aJCas, Object... aParams) + throws IOException, ResourceInitializationException, AnalysisEngineProcessException + { + assumeFalse("This test is known to fail.", isKnownToFail(testContext.getMethodName())); + + String targetFolder = "target/test-output/" + testContext.getClassName() + "/" + + getSuiteName() + "/" + testContext.getMethodName(); + String referenceFolder = "src/test/resources/" + getSuiteName() + "/" + + testContext.getMethodName(); + + List params = new ArrayList<>(); + params.addAll(asList(aParams)); + params.add(WebannoTsv3Writer.PARAM_TARGET_LOCATION); + params.add(targetFolder); + params.add(WebannoTsv3Writer.PARAM_OVERWRITE); + params.add(true); + + AnalysisEngineDescription tsv = makeWriter(); + for (int i = 0; i < params.size(); i += 2) { + String name = (String) params.get(i); + Object value = params.get(i + 1); + if (ConfigurationParameterFactory.canParameterBeSet(tsv, name)) { + ConfigurationParameterFactory.setParameter(tsv, name, value); + } + } + + AnalysisEngineDescription xmi = createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetFolder, + XmiWriter.PARAM_OVERWRITE, true); + + SimplePipeline.runPipeline(aJCas, tsv, xmi); + + File referenceFile = new File(referenceFolder, "reference.tsv"); + assumeTrue("No reference data available for this test.", referenceFile.exists()); + + File actualFile = new File(targetFolder, "doc.tsv"); + + String reference = FileUtils.readFileToString(referenceFile, "UTF-8"); + String actual = FileUtils.readFileToString(actualFile, "UTF-8"); + + assertEquals(reference, actual); + } + + private static JCas makeJCas() throws UIMAException + { + TypeSystemDescription global = TypeSystemDescriptionFactory.createTypeSystemDescription(); + TypeSystemDescription local = TypeSystemDescriptionFactory + .createTypeSystemDescriptionFromPath( + "src/test/resources/desc/type/webannoTestTypes.xml"); + + TypeSystemDescription merged = CasCreationUtils.mergeTypeSystems(asList(global, local)); + + JCas jcas = JCasFactory.createJCas(merged); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + + return jcas; + } + + private static JCas makeJCasOneSentence() throws UIMAException + { + JCas jcas = makeJCas(); + + TokenBuilder tb = new TokenBuilder<>(Token.class, + Sentence.class); + tb.buildTokens(jcas, "This is a test ."); + + return jcas; + } + + private static JCas makeJCasTwoSentences() throws UIMAException + { + JCas jcas = makeJCas(); + + TokenBuilder tb = new TokenBuilder<>(Token.class, + Sentence.class); + tb.buildTokens(jcas, "He loves her .\nShe loves him not ."); + + assertEquals(2, select(jcas, Sentence.class).size()); + + return jcas; + } + + private static JCas makeJCasOneSentence(String aText) throws UIMAException + { + JCas jcas = makeJCas(); + + TokenBuilder tb = new TokenBuilder<>(Token.class, + Sentence.class); + tb.buildTokens(jcas, aText); + + // Remove the sentences generated by the token builder which treats the line break as a + // sentence break + for (Sentence s : select(jcas, Sentence.class)) { + s.removeFromIndexes(); + } + + // Add a new sentence covering the whole text + new Sentence(jcas, 0, jcas.getDocumentText().length()).addToIndexes(); + + return jcas; + } + + private static AnnotationFS makeLinkHostFS(JCas aJCas, int aBegin, int aEnd, + FeatureStructure... aLinks) + { + return makeLinkHostFS(aJCas, "webanno.custom.SimpleLinkHost", aBegin, aEnd, aLinks); + } + + private static AnnotationFS makeLinkHostFS(JCas aJCas, String aType, int aBegin, int aEnd, + FeatureStructure... aLinks) + { + Type hostType = aJCas.getTypeSystem().getType(aType); + AnnotationFS hostA1 = aJCas.getCas().createAnnotation(hostType, aBegin, aEnd); + if (aLinks != null) { + hostA1.setFeatureValue(hostType.getFeatureByBaseName("links"), + FSCollectionFactory.createFSArray(aJCas, asList(aLinks))); + } + aJCas.getCas().addFsToIndexes(hostA1); + return hostA1; + } + + private static FeatureStructure makeLinkFS(JCas aJCas, String aSlotLabel, AnnotationFS aTarget) + { + return makeLinkFS(aJCas, "webanno.custom.LinkType", aSlotLabel, aTarget); + } + + private static FeatureStructure makeLinkFS(JCas aJCas, String aType, String aSlotLabel, + AnnotationFS aTarget) + { + Type linkType = aJCas.getTypeSystem().getType(aType); + FeatureStructure linkA1 = aJCas.getCas().createFS(linkType); + linkA1.setStringValue(linkType.getFeatureByBaseName("role"), aSlotLabel); + linkA1.setFeatureValue(linkType.getFeatureByBaseName("target"), aTarget); + aJCas.getCas().addFsToIndexes(linkA1); + + return linkA1; + } + + private static void makeChainHead(Type aType, AnnotationFS first) + { + CAS cas = first.getCAS(); + FeatureStructure h = cas.createFS(aType); + FSUtil.setFeature(h, "first", first); + cas.addFsToIndexes(h); + } + + private static AnnotationFS makeChainLink(Type aType, CAS aCas, + int aBegin, int aEnd, String aLabel, String aLinkLabel, AnnotationFS aNext) + { + AnnotationFS link = aCas.createAnnotation(aType, aBegin, aEnd); + FSUtil.setFeature(link, "next", aNext); + FSUtil.setFeature(link, "referenceType", aLabel); + FSUtil.setFeature(link, "referenceRelation", aLinkLabel); + aCas.addFsToIndexes(link); + return link; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XReaderWriterRoundTripTest.java b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XReaderWriterRoundTripTest.java new file mode 100644 index 0000000000..2fbf069a2d --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XReaderWriterRoundTripTest.java @@ -0,0 +1,172 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectSingleAt; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.FilenameFilter; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.filefilter.PrefixFileFilter; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.apache.uima.util.CasCreationUtils; +import org.dkpro.core.io.xmi.XmiWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; + +@RunWith(value = Parameterized.class) +public class WebAnnoTsv3XReaderWriterRoundTripTest +{ + @Parameters(name = "{index}: running on file {0}") + public static Iterable tsvFiles() + { + return asList(new File("src/test/resources/tsv3-suite/").listFiles( + (FilenameFilter) new PrefixFileFilter(asList("test", "issue", "sample")))); + } + + private File referenceFolder; + + public WebAnnoTsv3XReaderWriterRoundTripTest(File aFolder) + { + referenceFolder = aFolder; + } + + @Test + public void runTest() throws Exception + { + TypeSystemDescription global = TypeSystemDescriptionFactory.createTypeSystemDescription(); + TypeSystemDescription local; + if (new File(referenceFolder, "typesystem.xml").exists()) { + local = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath( + new File(referenceFolder, "typesystem.xml").toString()); + } + else { + local = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath( + "src/test/resources/desc/type/webannoTestTypes.xml"); + } + + TypeSystemDescription merged = CasCreationUtils.mergeTypeSystems(asList(global, local)); + + String targetFolder = "target/test-output/WebAnnoTsv3XReaderWriterRoundTripTest/" + + referenceFolder.getName(); + + CollectionReaderDescription reader = createReaderDescription(WebannoTsv3XReader.class, + merged, + WebannoTsv3XReader.PARAM_SOURCE_LOCATION, referenceFolder, + WebannoTsv3XReader.PARAM_PATTERNS, "reference.tsv"); + + AnalysisEngineDescription checker = createEngineDescription( + DKProCoreConventionsChecker.class); + + AnalysisEngineDescription tsvWriter = createEngineDescription(WebannoTsv3XWriter.class, + merged, + WebannoTsv3XWriter.PARAM_TARGET_LOCATION, targetFolder, + WebannoTsv3XWriter.PARAM_STRIP_EXTENSION, true, + WebannoTsv3XWriter.PARAM_OVERWRITE, true); + + AnalysisEngineDescription xmiWriter = createEngineDescription(XmiWriter.class, + merged, + XmiWriter.PARAM_TARGET_LOCATION, targetFolder, + XmiWriter.PARAM_STRIP_EXTENSION, true, + XmiWriter.PARAM_OVERWRITE, true); + + SimplePipeline.runPipeline(reader, checker, tsvWriter, xmiWriter); + + String referenceTsv = FileUtils.readFileToString(new File(referenceFolder, "reference.tsv"), + "UTF-8"); + + String actualTsv = FileUtils.readFileToString(new File(targetFolder, "reference.tsv"), + "UTF-8"); + + // + // The XMI files here are not compared semantically but using their serialization which + // is subject to minor variations depending e.g. on the order in which annotation are + // created in the CAS. Thus, this code is commented out and should only be used on a + // case-by-case base to compare XMIs during development. + // + // String referenceXmi = FileUtils.readFileToString(new File(referenceFolder, + // "reference.xmi"), + // "UTF-8"); + // + // String actualXmi = FileUtils.readFileToString(new File(targetFolder, "reference.xmi"), + // "UTF-8"); + + assertEquals(referenceTsv, actualTsv); + // assertEquals(referenceXmi, actualXmi); + } + + public static class DKProCoreConventionsChecker + extends JCasAnnotator_ImplBase + { + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + for (Lemma lemma : select(aJCas, Lemma.class)) { + Token t = selectSingleAt(aJCas, Token.class, lemma.getBegin(), lemma.getEnd()); + assert t.getLemma() == lemma; + } + + for (Stem stem : select(aJCas, Stem.class)) { + Token t = selectSingleAt(aJCas, Token.class, stem.getBegin(), stem.getEnd()); + assert t.getStem() == stem; + } + + for (MorphologicalFeatures morph : select(aJCas, MorphologicalFeatures.class)) { + Token t = selectSingleAt(aJCas, Token.class, morph.getBegin(), morph.getEnd()); + assert t.getMorph() == morph; + } + + for (POS pos : select(aJCas, POS.class)) { + Token t = selectSingleAt(aJCas, Token.class, pos.getBegin(), pos.getEnd()); + assert t.getPos() == pos; + } + + for (Dependency dep : select(aJCas, Dependency.class)) { + assert dep.getBegin() == dep.getDependent().getBegin(); + assert dep.getEnd() == dep.getDependent().getEnd(); + } + } + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XReaderWriterTest.java b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XReaderWriterTest.java new file mode 100644 index 0000000000..91be7e4a0f --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XReaderWriterTest.java @@ -0,0 +1,94 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; +import static org.junit.Assert.assertEquals; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; + +public class WebAnnoTsv3XReaderWriterTest +{ + @Test + public void test() + throws Exception + { + String targetFolder = "target/test-output/" + testContext.getTestOutputFolderName(); + + CollectionReader reader = CollectionReaderFactory.createReader( + WebannoTsv3XReader.class, + WebannoTsv3XReader.PARAM_SOURCE_LOCATION, "src/test/resources/tsv3/", + WebannoTsv3XReader.PARAM_PATTERNS, "coref.tsv"); + + AnalysisEngineDescription writer = createEngineDescription( + WebannoTsv3XWriter.class, + WebannoTsv3XWriter.PARAM_TARGET_LOCATION, targetFolder, + WebannoTsv3XWriter.PARAM_STRIP_EXTENSION, true, + WebannoTsv3XWriter.PARAM_OVERWRITE, true); + + runPipeline(reader, writer); + + CollectionReader reader1 = CollectionReaderFactory.createReader( + WebannoTsv3XReader.class, + WebannoTsv3XReader.PARAM_SOURCE_LOCATION, "src/test/resources/tsv3/", + WebannoTsv3XReader.PARAM_PATTERNS, "coref.tsv"); + + CollectionReader reader2 = CollectionReaderFactory.createReader( + WebannoTsv3XReader.class, + WebannoTsv3XReader.PARAM_SOURCE_LOCATION, targetFolder, + WebannoTsv3XReader.PARAM_PATTERNS, "coref.tsv"); + + CAS cas1 = JCasFactory.createJCas().getCas(); + reader1.getNext(cas1); + + CAS cas2 = JCasFactory.createJCas().getCas(); + reader2.getNext(cas2); + + assertEquals(JCasUtil.select(cas2.getJCas(), Token.class).size(), + JCasUtil.select(cas1.getJCas(), Token.class).size()); + assertEquals(JCasUtil.select(cas2.getJCas(), POS.class).size(), + JCasUtil.select(cas1.getJCas(), POS.class).size()); + assertEquals(JCasUtil.select(cas2.getJCas(), Lemma.class).size(), + JCasUtil.select(cas1.getJCas(), Lemma.class).size()); + assertEquals(JCasUtil.select(cas2.getJCas(), NamedEntity.class).size(), + JCasUtil.select(cas1.getJCas(), NamedEntity.class).size()); + assertEquals(JCasUtil.select(cas2.getJCas(), Sentence.class).size(), + JCasUtil.select(cas1.getJCas(), Sentence.class).size()); + assertEquals(JCasUtil.select(cas2.getJCas(), Dependency.class).size(), + JCasUtil.select(cas1.getJCas(), Dependency.class).size()); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XWriterTest.java b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XWriterTest.java new file mode 100644 index 0000000000..1a04e20958 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3XWriterTest.java @@ -0,0 +1,45 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.resource.ResourceInitializationException; + +public class WebAnnoTsv3XWriterTest + extends WebAnnoTsv3WriterTestBase +{ + @Override + protected AnalysisEngineDescription makeWriter() throws ResourceInitializationException + { + return createEngineDescription(WebannoTsv3XWriter.class); + } + + @Override + protected String getSuiteName() throws ResourceInitializationException + { + return "tsv3-suite"; + } + + @Override + protected boolean isKnownToFail(String aMethodName) + { + return false; + } +} diff --git a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3Writer.java b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3Writer.java new file mode 100644 index 0000000000..1e292a3562 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebannoTsv3Writer.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.webanno.tsv; + +import org.dkpro.core.api.parameter.ComponentParameters; + +/** + * This is just a dummy class with some constants that is used to enable copying the + * TsvWebAnno3WriterTestBase as-is from the WebAnno codebase here. + */ +public class WebannoTsv3Writer +{ + public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; + public static final String PARAM_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + public static final String PARAM_FILENAME_SUFFIX = "filenameSuffix"; + public static final String PARAM_SPAN_LAYERS = "spanLayers"; + public static final String PARAM_SLOT_FEATS = "slotFeatures"; + public static final String PARAM_LINK_TYPES = "linkTypes"; + public static final String PARAM_SLOT_TARGETS = "slotTargets"; + public static final String PARAM_CHAIN_LAYERS = "chainLayers"; + public static final String PARAM_RELATION_LAYERS = "relationLayers"; + public static final String PARAM_OVERWRITE = "overwrite"; +} diff --git a/dkpro-core-io-webanno-asl/src/test/resources/desc/type/webannoTestTypes.xml b/dkpro-core-io-webanno-asl/src/test/resources/desc/type/webannoTestTypes.xml new file mode 100644 index 0000000000..ea78fb510c --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/desc/type/webannoTestTypes.xml @@ -0,0 +1,228 @@ + + + webannoTestTypes + + 1.0 + + + + webanno.custom.SimpleSpan + + uima.tcas.Annotation + + + webanno.custom.Span + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + webanno.custom.SimpleRelation + + uima.tcas.Annotation + + + Governor + + uima.tcas.Annotation + + + Dependent + + uima.tcas.Annotation + + + + + webanno.custom.Relation + + uima.tcas.Annotation + + + Governor + + uima.tcas.Annotation + + + Dependent + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + webanno.custom.ComplexRelation + + uima.tcas.Annotation + + + Governor + + uima.tcas.Annotation + + + Dependent + + uima.tcas.Annotation + + + value + + uima.cas.String + + + boolValue + + uima.cas.Boolean + + + integerValue + + uima.cas.Integer + + + + + webanno.custom.SimpleLinkHost + + uima.tcas.Annotation + + + links + + uima.cas.FSArray + webanno.custom.LinkType + false + + + + + webanno.custom.LinkType + + uima.cas.TOP + + + role + + uima.cas.String + + + target + + webanno.custom.SimpleSpan + + + + + webanno.custom.FlexLinkHost + + uima.tcas.Annotation + + + links + + uima.cas.FSArray + webanno.custom.FlexLinkType + false + + + + + webanno.custom.FlexLinkType + + uima.cas.TOP + + + role + + uima.cas.String + + + target + + uima.tcas.Annotation + + + + + webanno.custom.SimpleChain + + uima.cas.AnnotationBase + + + first + + webanno.custom.SimpleLink + + + + + webanno.custom.SimpleLink + + uima.tcas.Annotation + + + next + + webanno.custom.SimpleLink + + + referenceType + + uima.cas.String + + + referenceRelation + + uima.cas.String + + + + + webanno.custom.ComplexLinkHost + + uima.tcas.Annotation + + + links + + uima.cas.FSArray + webanno.custom.ComplexLinkType + + + value + + uima.cas.String + + + + + webanno.custom.ComplexLinkType + This link is not really complex, but it belongs to the ComplexLinkHost! + uima.cas.TOP + + + role + + uima.cas.String + + + target + + webanno.custom.SimpleSpan + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/log4j2.xml b/dkpro-core-io-webanno-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation1/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation1/reference.tsv new file mode 100644 index 0000000000..ac34cab2d1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation1/reference.tsv @@ -0,0 +1,12 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm|value +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleSlotHost|ROLE_webanno.custom.SimpleSlotHost:links_webanno.custom.SimpleSlotHostLinksLink|uima.tcas.Annotation + + +#Text=This is a test . +1-1 0-4 This _ _ pr2;pr1 1-3[1];1-2[2] +1-2 5-7 is _ *[2] _ _ +1-3 8-9 a _ *[1] _ _ +1-4 10-14 test * _ _ _ +1-5 15-16 . _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation1/typesystem.xml b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation1/typesystem.xml new file mode 100644 index 0000000000..7687c9a806 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation1/typesystem.xml @@ -0,0 +1,1282 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain + + uima.cas.AnnotationBase + + + first + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + + + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + uima.tcas.Annotation + + + next + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + + referenceType + + uima.cas.String + + + referenceRelation + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme + + uima.tcas.Annotation + + + morphTag + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + + uima.tcas.Annotation + + + gender + + uima.cas.String + + + number + + uima.cas.String + + + case + + uima.cas.String + + + degree + + uima.cas.String + + + transitivity + + uima.cas.String + + + tense + + uima.cas.String + + + mood + + uima.cas.String + + + voice + + uima.cas.String + + + definiteness + + uima.cas.String + + + value + + uima.cas.String + + + person + + uima.cas.String + + + aspect + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + uima.tcas.Annotation + + + PosValue + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PP + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PR + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PRT + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PUNC + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.V + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData + + uima.tcas.DocumentAnnotation + + + documentTitle + + uima.cas.String + + + documentId + + uima.cas.String + + + documentUri + + uima.cas.String + + + collectionId + + uima.cas.String + + + documentBaseUri + + uima.cas.String + + + isLastSegment + + uima.cas.Boolean + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription + + uima.cas.TOP + + + name + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription + + uima.tcas.Annotation + + + layer + + uima.cas.String + + + name + + uima.cas.String + + + tags + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Nationality + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Norp + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Ordinal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.OrgDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.PerDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Percent + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Plant + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Product + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.ProductDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Quantity + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Substance + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Time + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.WorkOfArt + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound + + uima.tcas.Annotation + + + splits + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Document + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram + + uima.tcas.Annotation + + + text + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + uima.tcas.Annotation + + + splits + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + uima.tcas.Annotation + + + parent + + uima.tcas.Annotation + + + lemma + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + + stem + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + + pos + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + morph + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument + + uima.tcas.Annotation + + + role + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticPredicate + + uima.tcas.Annotation + + + category + + uima.cas.String + + + arguments + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree + + uima.tcas.Annotation + + + PennTree + + uima.cas.String + + + TransformationNames + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.Tag + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + uima.tcas.Annotation + + + chunkValue + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.INTJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.LST + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.NC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.O + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.VC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + uima.tcas.Annotation + + + constituentType + + uima.cas.String + + + parent + + uima.tcas.Annotation + + + children + + uima.cas.FSArray + uima.tcas.Annotation + + + syntacticFunction + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.FRAG + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.INTJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.LST + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NAC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NX + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRN + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.QP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.RRC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBAR + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBARQ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SINV + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SQ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.UCP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.VP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADVP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHNP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHPP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.X + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + uima.tcas.Annotation + + + Governor + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + Dependent + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + DependencyType + + uima.cas.String + + + flavor + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.EXPL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.INFMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.IOBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MARK + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MEASURE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MWE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NEG + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NN + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NPADVMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUM + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUMBER + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARATAXIS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARTMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSSESSIVE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRECONJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRED + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREDET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREPC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PUNCT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PURPCL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.QUANTMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.RCMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REF + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.TMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + webanno.custom.SimpleSlotHost + + uima.tcas.Annotation + + + links + + uima.cas.FSArray + webanno.custom.SimpleSlotHostLinksLink + + + + + webanno.custom.SimpleSlotHostLinksLink + + uima.cas.TOP + + + role + + uima.cas.String + + + target + + uima.tcas.Annotation + + + + + webanno.custom.SimpleSpan + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADJ + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADV + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ART + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.CARD + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.CONJ + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NN + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NP + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.AT + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.DM + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.EMO + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.HASH + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.INT + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.NNV + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.NPV + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.URL + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Animal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Cardinal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.ContactInfo + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Date + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Disease + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Event + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Fac + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.FacDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Game + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Gpe + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.GpeDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Language + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Law + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Money + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADJC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADVC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.CONCJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADVP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.CONJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ABBREV + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ACOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVCL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AGENT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.APPOS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ATTR + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUX0 + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUXPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COMPLM + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ_YET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DEP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DOBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/reference.tsv new file mode 100644 index 0000000000..904eeac4a9 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSlotHost|ROLE_webanno.custom.SimpleSlotHost:links_webanno.custom.SimpleSlotHostLinksLink|uima.tcas.Annotation + + +#Text=This is a test . +1-1 0-4 This _[1]|pr1[2] *|1-1[1] +1-2 5-7 is _ _ +1-3 8-9 a _ _ +1-4 10-14 test _ _ +1-5 15-16 . _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/typesystem.xml b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/typesystem.xml new file mode 100644 index 0000000000..7687c9a806 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/sampleSlotAnnotation2/typesystem.xml @@ -0,0 +1,1282 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain + + uima.cas.AnnotationBase + + + first + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + + + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + uima.tcas.Annotation + + + next + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + + referenceType + + uima.cas.String + + + referenceRelation + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme + + uima.tcas.Annotation + + + morphTag + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + + uima.tcas.Annotation + + + gender + + uima.cas.String + + + number + + uima.cas.String + + + case + + uima.cas.String + + + degree + + uima.cas.String + + + transitivity + + uima.cas.String + + + tense + + uima.cas.String + + + mood + + uima.cas.String + + + voice + + uima.cas.String + + + definiteness + + uima.cas.String + + + value + + uima.cas.String + + + person + + uima.cas.String + + + aspect + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + uima.tcas.Annotation + + + PosValue + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PP + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PR + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PRT + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PUNC + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.V + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData + + uima.tcas.DocumentAnnotation + + + documentTitle + + uima.cas.String + + + documentId + + uima.cas.String + + + documentUri + + uima.cas.String + + + collectionId + + uima.cas.String + + + documentBaseUri + + uima.cas.String + + + isLastSegment + + uima.cas.Boolean + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription + + uima.cas.TOP + + + name + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription + + uima.tcas.Annotation + + + layer + + uima.cas.String + + + name + + uima.cas.String + + + tags + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Nationality + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Norp + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Ordinal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.OrgDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.PerDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Percent + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Plant + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Product + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.ProductDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Quantity + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Substance + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Time + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.WorkOfArt + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound + + uima.tcas.Annotation + + + splits + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Document + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram + + uima.tcas.Annotation + + + text + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + uima.tcas.Annotation + + + splits + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + uima.tcas.Annotation + + + parent + + uima.tcas.Annotation + + + lemma + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + + stem + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + + pos + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + morph + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument + + uima.tcas.Annotation + + + role + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticPredicate + + uima.tcas.Annotation + + + category + + uima.cas.String + + + arguments + + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree + + uima.tcas.Annotation + + + PennTree + + uima.cas.String + + + TransformationNames + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.Tag + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + uima.tcas.Annotation + + + chunkValue + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.INTJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.LST + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.NC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.O + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.VC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + uima.tcas.Annotation + + + constituentType + + uima.cas.String + + + parent + + uima.tcas.Annotation + + + children + + uima.cas.FSArray + uima.tcas.Annotation + + + syntacticFunction + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.FRAG + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.INTJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.LST + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NAC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NX + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRN + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.QP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.RRC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBAR + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBARQ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SINV + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SQ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.UCP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.VP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADVP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHNP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHPP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.X + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + uima.tcas.Annotation + + + Governor + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + Dependent + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + DependencyType + + uima.cas.String + + + flavor + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.EXPL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.INFMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.IOBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MARK + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MEASURE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MWE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NEG + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NN + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NPADVMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUM + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUMBER + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARATAXIS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARTMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSSESSIVE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRECONJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRED + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREDET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREPC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PUNCT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PURPCL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.QUANTMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.RCMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REF + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.TMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + webanno.custom.SimpleSlotHost + + uima.tcas.Annotation + + + links + + uima.cas.FSArray + webanno.custom.SimpleSlotHostLinksLink + + + + + webanno.custom.SimpleSlotHostLinksLink + + uima.cas.TOP + + + role + + uima.cas.String + + + target + + uima.tcas.Annotation + + + + + webanno.custom.SimpleSpan + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADJ + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADV + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ART + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.CARD + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.CONJ + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NN + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NP + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.AT + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.DM + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.EMO + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.HASH + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.INT + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.NNV + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.NPV + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.URL + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.O + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Animal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Cardinal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.ContactInfo + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Date + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Disease + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Event + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Fac + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.FacDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Game + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Gpe + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.GpeDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Language + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Law + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Money + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADJC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADVC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.CONCJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADVP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.CONJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ABBREV + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ACOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVCL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AGENT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.APPOS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ATTR + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUX0 + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUXPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COMPLM + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ_YET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DEP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DOBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.tsv new file mode 100644 index 0000000000..16073bc943 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.tsv @@ -0,0 +1,7 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value + + +#Text=one two +1-1 0-3 one _ _ +1-2 5-8 two * * diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.xmi new file mode 100644 index 0000000000..2760952d04 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.tsv new file mode 100644 index 0000000000..73422f62f7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.tsv @@ -0,0 +1,7 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value + + +#Text=one two +1-1 1-4 one * * +1-2 5-8 two _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.xmi new file mode 100644 index 0000000000..1d50fbbb13 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.tsv new file mode 100644 index 0000000000..4656ebf8d1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.tsv @@ -0,0 +1,7 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value + + +#Text=one two +1-1 0-3 one * * +1-2 5-8 two _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.xmi new file mode 100644 index 0000000000..e05cab5977 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.tsv new file mode 100644 index 0000000000..d0399a28cc --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.tsv @@ -0,0 +1,7 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value + + +#Text=one two +1-1 0-3 one _ _ +1-2 4-7 two * * diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.xmi new file mode 100644 index 0000000000..cfb1ce5aca --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testComplexSlotFeatureWithoutValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testComplexSlotFeatureWithoutValues/reference.tsv new file mode 100644 index 0000000000..9a398fd67f --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testComplexSlotFeatureWithoutValues/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.ComplexLinkHost|ROLE_webanno.custom.ComplexLinkHost:links_webanno.custom.ComplexLinkType|webanno.custom.SimpleSpan|value + + +#Text=This is a test . +1-1 0-4 This _ *;* 1-2;1-3 * +1-2 5-7 is * _ _ _ +1-3 8-9 a * _ _ _ +1-4 10-14 test _ _ _ _ +1-5 15-16 . _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testComplexSlotFeatureWithoutValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testComplexSlotFeatureWithoutValues/reference.xmi new file mode 100644 index 0000000000..2d11f9b61c --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testComplexSlotFeatureWithoutValues/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithFeatureValue/reference.tsv new file mode 100644 index 0000000000..65de493f1d --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithFeatureValue/reference.tsv @@ -0,0 +1,16 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=He loves her . +1-1 0-2 He PERSON[1] +1-2 3-8 loves PERSON[1] +1-3 9-12 her PERSON[1] +1-4 13-14 . PERSON[1] + +#Text=She loves him not . +2-1 15-18 She PERSON[1] +2-2 19-24 loves PERSON[1] +2-3 25-28 him PERSON[1] +2-4 29-32 not PERSON[1] +2-5 33-34 . PERSON[1] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithFeatureValue/reference.xmi new file mode 100644 index 0000000000..e3dd13602e --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithFeatureValue/reference.xmi @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..112837684c --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithoutFeatureValue/reference.tsv @@ -0,0 +1,16 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=He loves her . +1-1 0-2 He *[1] +1-2 3-8 loves *[1] +1-3 9-12 her *[1] +1-4 13-14 . *[1] + +#Text=She loves him not . +2-1 15-18 She *[1] +2-2 19-24 loves *[1] +2-3 25-28 him *[1] +2-4 29-32 not *[1] +2-5 33-34 . *[1] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..23dfcd88e1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testCrossSentenceSpanWithoutFeatureValue/reference.xmi @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testDependencyWithValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testDependencyWithValues/reference.tsv new file mode 100644 index 0000000000..cf504555a9 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testDependencyWithValues/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS|PosValue|coarseValue +#T_RL=de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency|DependencyType|flavor|BT_de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + +#Text=This is a test . +1-1 0-4 This POS1 * _ _ _ +1-2 5-7 is POS2 * * * 1-1 +1-3 8-9 a _ _ _ _ _ +1-4 10-14 test _ _ _ _ _ +1-5 15-16 . _ _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testDependencyWithValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testDependencyWithValues/reference.xmi new file mode 100644 index 0000000000..a1b86edb6d --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testDependencyWithValues/reference.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testElevatedType/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testElevatedType/reference.tsv new file mode 100644 index 0000000000..bf7111dae5 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testElevatedType/reference.tsv @@ -0,0 +1,6 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS|PosValue|coarseValue + + +#Text=John +1-1 0-4 John NN NOUN diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testElevatedType/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testElevatedType/reference.xmi new file mode 100644 index 0000000000..c330fcc72e --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testElevatedType/reference.xmi @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenChain/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenChain/reference.tsv new file mode 100644 index 0000000000..0c3bf8bc21 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenChain/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_CH=webanno.custom.SimpleLink|referenceRelation|referenceType + + +#Text=This is a test . +1-1 0-4 This *->1-1 *[1] +1-2 5-7 is *->1-1 *[1] +1-3 8-9 a *->1-2 *[1] +1-4 10-14 test *->1-2 *[1] +1-5 15-16 . _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenChain/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenChain/reference.xmi new file mode 100644 index 0000000000..47364587e2 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenChain/reference.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSlotFeature/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSlotFeature/reference.tsv new file mode 100644 index 0000000000..b44f06878f --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSlotFeature/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=This is a test . +1-1 0-4 This _ p1;p2 1-2[1];1-4[2] +1-2 5-7 is *[1] _ _ +1-3 8-9 a *[1] _ _ +1-4 10-14 test *[2] _ _ +1-5 15-16 . *[2] _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSlotFeature/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSlotFeature/reference.xmi new file mode 100644 index 0000000000..06b66ad6cd --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSlotFeature/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithFeatureValue/reference.tsv new file mode 100644 index 0000000000..c24c697bc2 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This PERSON[1] +1-2 5-7 is PERSON[1] +1-3 8-9 a PERSON[1] +1-4 10-14 test PERSON[1] +1-5 15-16 . PERSON[1] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithFeatureValue/reference.xmi new file mode 100644 index 0000000000..6ac35ca751 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithFeatureValue/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..c3d1d7406b --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithoutFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This *[1] +1-2 5-7 is *[1] +1-3 8-9 a *[1] +1-4 10-14 test *[1] +1-5 15-16 . *[1] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..0f12aebde5 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenSpanWithoutFeatureValue/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSlotFeature/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSlotFeature/reference.tsv new file mode 100644 index 0000000000..654bc078d1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSlotFeature/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=This is a test . +1-1 0-4 This _ p1;p2 1-2[1];1-2[2] +1-2 5-7 is *[1]|*[2] _ _ +1-3 8-9 a *[1]|*[2] _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSlotFeature/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSlotFeature/reference.xmi new file mode 100644 index 0000000000..19c8b4053b --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSlotFeature/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithFeatureValue/reference.tsv new file mode 100644 index 0000000000..1388be41cc --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This PERSON[1]|LOCATION[2] +1-2 5-7 is PERSON[1]|LOCATION[2] +1-3 8-9 a PERSON[1]|LOCATION[2] +1-4 10-14 test PERSON[1]|LOCATION[2] +1-5 15-16 . PERSON[1]|LOCATION[2] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithFeatureValue/reference.xmi new file mode 100644 index 0000000000..16389d00eb --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithFeatureValue/reference.xmi @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..f69946c261 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithoutFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This *[1]|*[2] +1-2 5-7 is *[1]|*[2] +1-3 8-9 a *[1]|*[2] +1-4 10-14 test *[1]|*[2] +1-5 15-16 . *[1]|*[2] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..38b4d0e3c9 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testMultiTokenStackedSpanWithoutFeatureValue/reference.xmi @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithEmoji/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithEmoji/reference.tsv new file mode 100644 index 0000000000..35f80cdd2e --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithEmoji/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=I like it 😊 . +1-1 0-1 I _ +1-2 2-6 like _ +1-3 7-9 it _ +1-4 10-12 😊 * +1-5 13-14 . _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithEmoji/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithEmoji/reference.xmi new file mode 100644 index 0000000000..f40e715146 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithEmoji/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithLineBreak/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithLineBreak/reference.tsv new file mode 100644 index 0000000000..00d8a7d6cb --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithLineBreak/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is +#Text=a test . +1-1 0-4 This * +1-2 5-7 is _ +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithLineBreak/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithLineBreak/reference.xmi new file mode 100644 index 0000000000..53c73fd256 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithLineBreak/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithTab/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithTab/reference.tsv new file mode 100644 index 0000000000..3b0811c1d8 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithTab/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is\ta test . +1-1 0-4 This * +1-2 5-7 is _ +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithTab/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithTab/reference.xmi new file mode 100644 index 0000000000..c38f643527 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSentenceWithTab/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleChain/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleChain/reference.tsv new file mode 100644 index 0000000000..c912246996 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleChain/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_CH=webanno.custom.SimpleLink|referenceRelation|referenceType + + +#Text=This is a test . +1-1 0-4 This *->1-1 *[1] +1-2 5-7 is *->1-2 *[1] +1-3 8-9 a *->1-3 *[1] +1-4 10-14 test _ _ +1-5 15-16 . _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleChain/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleChain/reference.xmi new file mode 100644 index 0000000000..12d08b6099 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleChain/reference.xmi @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleCrossSenenceSlotFeature/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleCrossSenenceSlotFeature/reference.tsv new file mode 100644 index 0000000000..7d1076f7c4 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleCrossSenenceSlotFeature/reference.tsv @@ -0,0 +1,17 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=He loves her . +1-1 0-2 He _ p1;p2 1-2;2-3 +1-2 3-8 loves * _ _ +1-3 9-12 her _ _ _ +1-4 13-14 . _ _ _ + +#Text=She loves him not . +2-1 15-18 She _ _ _ +2-2 19-24 loves _ _ _ +2-3 25-28 him * _ _ +2-4 29-32 not _ _ _ +2-5 33-34 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleCrossSenenceSlotFeature/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleCrossSenenceSlotFeature/reference.xmi new file mode 100644 index 0000000000..7f05db7dbc --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleCrossSenenceSlotFeature/reference.xmi @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSameRoleSlotFeature/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSameRoleSlotFeature/reference.tsv new file mode 100644 index 0000000000..49cd2e0286 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSameRoleSlotFeature/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=This is a test . +1-1 0-4 This _ p1;p1 1-2;1-3 +1-2 5-7 is * _ _ +1-3 8-9 a * _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSameRoleSlotFeature/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSameRoleSlotFeature/reference.xmi new file mode 100644 index 0000000000..38f8c3acb5 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSameRoleSlotFeature/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeature/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeature/reference.tsv new file mode 100644 index 0000000000..ac737b6d44 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeature/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=This is a test . +1-1 0-4 This _ p1;p2 1-2;1-3 +1-2 5-7 is * _ _ +1-3 8-9 a * _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeature/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeature/reference.xmi new file mode 100644 index 0000000000..2af75732e4 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeature/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeatureWithoutValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeatureWithoutValues/reference.tsv new file mode 100644 index 0000000000..f515577fef --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeatureWithoutValues/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=This is a test . +1-1 0-4 This _ *;* 1-2;1-3 +1-2 5-7 is * _ _ +1-3 8-9 a * _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeatureWithoutValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeatureWithoutValues/reference.xmi new file mode 100644 index 0000000000..31b02897ce --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSimpleSlotFeatureWithoutValues/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithMultipleFeatureValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithMultipleFeatureValues/reference.tsv new file mode 100644 index 0000000000..f33d78a767 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithMultipleFeatureValues/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.ComplexRelation|boolValue|integerValue|value|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This *[1] _ _ _ _ +1-2 5-7 is *[1] _ _ _ _ +1-3 8-9 a *[2] true 42 nsubj 1-1[1_2] +1-4 10-14 test *[2] _ _ _ _ +1-5 15-16 . _ _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithMultipleFeatureValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithMultipleFeatureValues/reference.xmi new file mode 100644 index 0000000000..a12032d755 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithMultipleFeatureValues/reference.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference-explained.txt b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference-explained.txt new file mode 100644 index 0000000000..d29c703fe5 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference-explained.txt @@ -0,0 +1,20 @@ +#FORMAT=WebAnno TSV 3 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|value +#T_RL=webanno.custom.Relation|value|BT_de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + +#Text=This is a test . +1-1 0-4 This *[1] _ _ +1-2 5-7 is *[1] _ _ +1-3 8-9 a *[2] _ 1-1 +1-4 10-14 test *[2] _ _ +1-5 15-16 . _ _ _ + +################################################################################################### + +Line 1-3 does not look like + +1-3 8-9 a *[2] _ 1-1[2_1] + +because there are no stacked annotations in line 1-3 and neither in line 1-1. So using the source/ +target annotation ID in brackets is not necessary. diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference-explained2.txt b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference-explained2.txt new file mode 100755 index 0000000000..ff0b10b07d --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference-explained2.txt @@ -0,0 +1,19 @@ +#FORMAT=WebAnno TSV 3 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|value +#T_RL=webanno.custom.Relation|value|BT_de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + +#Text=This is a test . +1-1 0-4 This *[1] _ _ +1-2 5-7 is *[1] _ _ +1-3 8-9 a *[2] _ 1-1[1_2] +1-4 10-14 test *[2] _ _ +1-5 15-16 . _ _ _ + +################################################################################################### + +Line 1-3 does not look like + +1-3 8-9 a *[2] _ 1-1 + +Both stacked and multiple span annotation are referenced hence we will be consistent adding the reference even if it is not stacked here. diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..d726daf1c1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.Relation|value|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This *[1] _ _ +1-2 5-7 is *[1] _ _ +1-3 8-9 a *[2] * 1-1[1_2] +1-4 10-14 test *[2] _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..f1195fc5d7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonMultiTokenRelationWithoutFeatureValue/reference.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference-explained.txt b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference-explained.txt new file mode 100644 index 0000000000..26dce6a5d7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference-explained.txt @@ -0,0 +1,20 @@ +#FORMAT=WebAnno TSV 3 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|value +#T_RL=webanno.custom.SimpleRelation|BT_de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + +#Text=This is a test . +1-1 0-4 This * _ +1-2 5-7 is _ _ +1-3 8-9 a _ _ +1-4 10-14 test _ _ +1-5 15-16 . * 1-1 + +################################################################################################### + +The named entities in line 1-1 and 1-5 do not have a feature value (*). +They also do not have an annotation ID because they are neither stacked nor multi-token. + +So in line 1-5, we do not need source/target annotation IDs in brackets + +1-5 15-16 . * 1-1[0_0] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference.tsv new file mode 100644 index 0000000000..7e9849d6b1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.SimpleRelation|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This * _ +1-2 5-7 is _ _ +1-3 8-9 a _ _ +1-4 10-14 test _ _ +1-5 15-16 . * 1-1 diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference.xmi new file mode 100644 index 0000000000..6a7afb30c4 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeature/reference.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..7d87cbe051 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeatureValue/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.Relation|value|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This * _ _ +1-2 5-7 is _ _ _ +1-3 8-9 a _ _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . * * 1-1 diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..039e42a667 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleNonTokenRelationWithoutFeatureValue/reference.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenOverlappingRelationWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenOverlappingRelationWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..a0b17340b5 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenOverlappingRelationWithoutFeatureValue/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.Relation|value|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This *[1]|*[2] _ _ +1-2 5-7 is *[1]|*[2] _ _ +1-3 8-9 a *[1]|*[2] _ _ +1-4 10-14 test *[1]|*[2] _ _ +1-5 15-16 . *[1]|*[2]|*[3]|*[4] * 1-1[1_3] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenOverlappingRelationWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenOverlappingRelationWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..ab67b701e7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenOverlappingRelationWithoutFeatureValue/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..7634ec69ea --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.Relation|value|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This *[1]|*[2] _ _ +1-2 5-7 is _ _ _ +1-3 8-9 a _ _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . *[3]|*[4] * 1-1[1_3] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..2fcbbd73ac --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue2/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue2/reference.tsv new file mode 100644 index 0000000000..673eced49c --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue2/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.Relation|value|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This * _ _ +1-2 5-7 is _ _ _ +1-3 8-9 a _ _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . *[1]|*[2] * 1-1[0_1] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue2/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue2/reference.xmi new file mode 100644 index 0000000000..5f32ea7028 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue2/reference.xmi @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue3/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue3/reference.tsv new file mode 100644 index 0000000000..2e3dfc60fd --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue3/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.Relation|value|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This *[1]|*[2] _ _ +1-2 5-7 is _ _ _ +1-3 8-9 a _ _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . * * 1-1[1_0] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue3/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue3/reference.xmi new file mode 100644 index 0000000000..728adbc0fe --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleStackedNonTokenRelationWithoutFeatureValue3/reference.xmi @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithFeatureValue/reference.tsv new file mode 100644 index 0000000000..70e6450aa6 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_RL=webanno.custom.Relation|value|BT_de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + +#Text=This is a test . +1-1 0-4 This _ _ +1-2 5-7 is _ _ +1-3 8-9 a _ _ +1-4 10-14 test _ _ +1-5 15-16 . nsubj 1-1 diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithFeatureValue/reference.xmi new file mode 100644 index 0000000000..91165ded61 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithFeatureValue/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithMultipleFeatureValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithMultipleFeatureValues/reference.tsv new file mode 100644 index 0000000000..af3ccfca80 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithMultipleFeatureValues/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_RL=webanno.custom.ComplexRelation|boolValue|integerValue|value|BT_de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + +#Text=This is a test . +1-1 0-4 This _ _ _ _ +1-2 5-7 is _ _ _ _ +1-3 8-9 a _ _ _ _ +1-4 10-14 test _ _ _ _ +1-5 15-16 . true 42 nsubj 1-1 diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithMultipleFeatureValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithMultipleFeatureValues/reference.xmi new file mode 100644 index 0000000000..9e54faa3dc --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithMultipleFeatureValues/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..7008d1d22e --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithoutFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_RL=webanno.custom.Relation|value|BT_de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + +#Text=This is a test . +1-1 0-4 This _ _ +1-2 5-7 is _ _ +1-3 8-9 a _ _ +1-4 10-14 test _ _ +1-5 15-16 . * 1-1 diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..59e6ff0269 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenRelationWithoutFeatureValue/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..32d467bc24 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenWithoutFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This * +1-2 5-7 is _ +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..5f42d2e2e8 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSingleTokenWithoutFeatureValue/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedChain/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedChain/reference.tsv new file mode 100644 index 0000000000..7aef527680 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedChain/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_CH=webanno.custom.SimpleLink|referenceRelation|referenceType + + +#Text=This is a test . +1-1 0-4 This *->1-1|*->2-3 *[1]|*[2] +1-2 5-7 is *->1-2|*->2-2 *[1]|*[2] +1-3 8-9 a *->1-3|*->2-1 *[1]|*[2] +1-4 10-14 test _ _ +1-5 15-16 . _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedChain/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedChain/reference.xmi new file mode 100644 index 0000000000..303ea42493 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedChain/reference.xmi @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.tsv new file mode 100644 index 0000000000..475132155d --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.ComplexLinkHost|ROLE_webanno.custom.ComplexLinkHost:links_webanno.custom.ComplexLinkType|webanno.custom.SimpleSpan|value + + +#Text=This is a test . +1-1 0-4 This _ _[1]|_[2] *|* val1[1]|val2[2] +1-2 5-7 is * _ _ _ +1-3 8-9 a * _ _ _ +1-4 10-14 test _ _ _ _ +1-5 15-16 . _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.xmi new file mode 100644 index 0000000000..978ecfb29f --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutSlotFillers/reference.xmi @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutValues/reference.tsv new file mode 100644 index 0000000000..fc3dda4ccf --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutValues/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.ComplexLinkHost|ROLE_webanno.custom.ComplexLinkHost:links_webanno.custom.ComplexLinkType|webanno.custom.SimpleSpan|value + + +#Text=This is a test . +1-1 0-4 This _ *;*[1]|*;*[2] 1-2;1-3|1-2;1-3 *[1]|*[2] +1-2 5-7 is * _ _ _ +1-3 8-9 a * _ _ _ +1-4 10-14 test _ _ _ _ +1-5 15-16 . _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutValues/reference.xmi new file mode 100644 index 0000000000..1c16158f2b --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedComplexSlotFeatureWithoutValues/reference.xmi @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedNonMultiTokenRelationWithMultipleFeatureValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedNonMultiTokenRelationWithMultipleFeatureValues/reference.tsv new file mode 100644 index 0000000000..3989eb83a6 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedNonMultiTokenRelationWithMultipleFeatureValues/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value +#T_RL=webanno.custom.ComplexRelation|boolValue|integerValue|value|BT_webanno.custom.Span + + +#Text=This is a test . +1-1 0-4 This *[1] _ _ _ _ +1-2 5-7 is *[1] _ _ _ _ +1-3 8-9 a *[2] true|false 42|43 nsubj|obj 1-1[1_2]|1-1[1_2] +1-4 10-14 test *[2] _ _ _ _ +1-5 15-16 . _ _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedNonMultiTokenRelationWithMultipleFeatureValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedNonMultiTokenRelationWithMultipleFeatureValues/reference.xmi new file mode 100644 index 0000000000..c1cc50e520 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedNonMultiTokenRelationWithMultipleFeatureValues/reference.xmi @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSimpleSlotFeatureWithoutValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSimpleSlotFeatureWithoutValues/reference.tsv new file mode 100644 index 0000000000..50715c59d9 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSimpleSlotFeatureWithoutValues/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=This is a test . +1-1 0-4 This _ *;*[1]|*;*[2] 1-2;1-3|1-2;1-3 +1-2 5-7 is * _ _ +1-3 8-9 a * _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSimpleSlotFeatureWithoutValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSimpleSlotFeatureWithoutValues/reference.xmi new file mode 100644 index 0000000000..bdd51683b2 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSimpleSlotFeatureWithoutValues/reference.xmi @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSubMultiTokenSpanWithFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSubMultiTokenSpanWithFeatureValue/reference.tsv new file mode 100644 index 0000000000..867e0f97df --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSubMultiTokenSpanWithFeatureValue/reference.tsv @@ -0,0 +1,16 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=aaaaaa bbbbbb cccccc +1-1 0-6 aaaaaa 2a[1]|2b[2]|7a[3]|7b[4]|1a[5]|1b[6] +1-1.1 3-6 aaa 9a[7]|9b[8]|6a[9]|6b[10]|8a[11]|8b[12] +1-2 7-13 bbbbbb 2a[1]|2b[2]|9a[7]|9b[8]|6a[9]|6b[10]|4a[13]|4b[14] +1-2.1 7-10 bbb 7a[3]|7b[4]|8a[11]|8b[12]|4a[13]|4b[14] +1-2.2 7-7 11a[15]|11b[16] +1-2.3 9-13 bbbb 5a[17]|5b[18] +1-2.4 9-10 b 3a[19]|3b[20] +1-2.5 10-10 10a[21]|10b[22] +1-2.6 13-13 12a[23]|12b[24] +1-3 14-20 cccccc _ +1-3.1 14-17 ccc 9a[7]|9b[8] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSubMultiTokenSpanWithFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSubMultiTokenSpanWithFeatureValue/reference.xmi new file mode 100644 index 0000000000..df112d6479 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testStackedSubMultiTokenSpanWithFeatureValue/reference.xmi @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithFeatureValue/reference.tsv new file mode 100644 index 0000000000..7f57aa15f7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithFeatureValue/reference.tsv @@ -0,0 +1,17 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=aaaaaa bbbbbb cccccc +1-1 0-6 aaaaaa 2[1]|7[2]|1[3] +1-1.1 3-6 aaa 9[4]|6[5]|8[6] +1-2 7-13 bbbbbb 2[1]|9[4]|6[5]|4[7] +1-2.1 7-10 bbb 7[2]|8[6] +1-2.2 7-11 bbbb 4[7] +1-2.3 7-7 11 +1-2.4 9-13 bbbb 5 +1-2.5 9-11 bb 3 +1-2.6 10-10 10 +1-2.7 13-13 12 +1-3 14-20 cccccc _ +1-3.1 14-17 ccc 9[4] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithFeatureValue/reference.xmi new file mode 100644 index 0000000000..83d7f05ec6 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithFeatureValue/reference.xmi @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..c4e2e21639 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This *[1] +1-2 5-7 is _ +1-2.1 5-6 i *[1] +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..4f62492db1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue2/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue2/reference.tsv new file mode 100644 index 0000000000..4d73460cc0 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue2/reference.tsv @@ -0,0 +1,12 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This _ +1-1.1 1-4 his *[1] +1-2 5-7 is _ +1-2.1 5-6 i *[1] +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue2/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue2/reference.xmi new file mode 100644 index 0000000000..5240c409f3 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue2/reference.xmi @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue3/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue3/reference.tsv new file mode 100644 index 0000000000..a23e2fe4ef --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue3/reference.tsv @@ -0,0 +1,14 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This _ +1-1.1 1-4 his *[1] +1-2 5-7 is _ +1-2.1 5-6 i *[1] +1-2.2 6-7 s *[2] +1-3 8-9 a *[2] +1-4 10-14 test _ +1-4.1 10-12 te *[2] +1-5 15-16 . _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue3/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue3/reference.xmi new file mode 100644 index 0000000000..7c297d2fd0 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubMultiTokenSpanWithoutFeatureValue3/reference.xmi @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubtokenChain/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubtokenChain/reference.tsv new file mode 100644 index 0000000000..5bbd59965f --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubtokenChain/reference.tsv @@ -0,0 +1,13 @@ +#FORMAT=WebAnno TSV 3.2 +#T_CH=webanno.custom.SimpleLink|referenceRelation|referenceType + + +#Text=This is a test . +1-1 0-4 This _ _ +1-1.1 1-3 hi *->1-1 *[1] +1-2 5-7 is _ _ +1-2.1 6-6 *->1-2 *[1] +1-3 8-9 a _ _ +1-4 10-14 test _ _ +1-4.1 11-13 es *->1-3 *[1] +1-5 15-16 . _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubtokenChain/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubtokenChain/reference.xmi new file mode 100644 index 0000000000..aa6022b579 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testSubtokenChain/reference.xmi @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenAttachedAnnotationsWithValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenAttachedAnnotationsWithValues/reference.tsv new file mode 100644 index 0000000000..444940a5d0 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenAttachedAnnotationsWithValues/reference.tsv @@ -0,0 +1,13 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures|animacy|aspect|case|definiteness|degree|gender|mood|negative|numType|number|person|possessive|pronType|reflex|tense|transitivity|value|verbForm|voice +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS|PosValue|coarseValue +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma|value +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem|value + + +#Text=This is a test . +1-1 0-4 This * * * * * * * * * * * * * * tense1 * morph * * pos1 * lemma1 stem1 +1-2 5-7 is _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +1-3 8-9 a _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +1-4 10-14 test _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +1-5 15-16 . _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenAttachedAnnotationsWithValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenAttachedAnnotationsWithValues/reference.xmi new file mode 100644 index 0000000000..4211d99e6c --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenAttachedAnnotationsWithValues/reference.xmi @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedBioLookAlike/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedBioLookAlike/reference.tsv new file mode 100644 index 0000000000..4d0bbfac30 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedBioLookAlike/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This B-NOTBIO! +1-2 5-7 is I-NOTBIO! +1-3 8-9 a I-NOTBIO! +1-4 10-14 test I-NOTBIO! +1-5 15-16 . I-NOTBIO! diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedBioLookAlike/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedBioLookAlike/reference.xmi new file mode 100644 index 0000000000..b1c438695a --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedBioLookAlike/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithAsteriskFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithAsteriskFeatureValue/reference.tsv new file mode 100644 index 0000000000..da185dfe1c --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithAsteriskFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This \* +1-2 5-7 is \* +1-3 8-9 a \* +1-4 10-14 test \* +1-5 15-16 . \* diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithAsteriskFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithAsteriskFeatureValue/reference.xmi new file mode 100644 index 0000000000..f6698c72d4 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithAsteriskFeatureValue/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithFeatureValue/reference.tsv new file mode 100644 index 0000000000..7834f61d40 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This NE 0 +1-2 5-7 is NE 1 +1-3 8-9 a NE 2 +1-4 10-14 test NE 3 +1-5 15-16 . NE 4 diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithFeatureValue/reference.xmi new file mode 100644 index 0000000000..b5272a2a10 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithFeatureValue/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithNastyFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithNastyFeatureValue/reference.tsv new file mode 100644 index 0000000000..fcb164eaeb --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithNastyFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity:value +1-2 5-7 is de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity:value +1-3 8-9 a de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity:value +1-4 10-14 test de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity:value +1-5 15-16 . de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity:value diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithNastyFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithNastyFeatureValue/reference.xmi new file mode 100644 index 0000000000..931d15bbdb --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithNastyFeatureValue/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithSpecialSymbolsValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithSpecialSymbolsValue/reference.tsv new file mode 100644 index 0000000000..6b88261bcd --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithSpecialSymbolsValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This #\*'"`´\t:\;{}\|\[ \]()\\§$%?=&\_\n +1-2 5-7 is #\*'"`´\t:\;{}\|\[ \]()\\§$%?=&\_\n +1-3 8-9 a #\*'"`´\t:\;{}\|\[ \]()\\§$%?=&\_\n +1-4 10-14 test #\*'"`´\t:\;{}\|\[ \]()\\§$%?=&\_\n +1-5 15-16 . #\*'"`´\t:\;{}\|\[ \]()\\§$%?=&\_\n diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithSpecialSymbolsValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithSpecialSymbolsValue/reference.xmi new file mode 100644 index 0000000000..cb8364e3a5 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithSpecialSymbolsValue/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithUnderscoreFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithUnderscoreFeatureValue/reference.tsv new file mode 100644 index 0000000000..2f253889ed --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithUnderscoreFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This \_ +1-2 5-7 is \_ +1-3 8-9 a \_ +1-4 10-14 test \_ +1-5 15-16 . \_ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithUnderscoreFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithUnderscoreFeatureValue/reference.xmi new file mode 100644 index 0000000000..056f9b4f4f --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithUnderscoreFeatureValue/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithoutFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithoutFeatureValue/reference.tsv new file mode 100644 index 0000000000..227b96009d --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithoutFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This * +1-2 5-7 is * +1-3 8-9 a * +1-4 10-14 test * +1-5 15-16 . * diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithoutFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithoutFeatureValue/reference.xmi new file mode 100644 index 0000000000..7c4062245b --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedSpanWithoutFeatureValue/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedLookAlike/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedLookAlike/reference.tsv new file mode 100644 index 0000000000..89ac0e3ead --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedLookAlike/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This NOTSTACKED\[0\] +1-2 5-7 is NOTSTACKED\[1\] +1-3 8-9 a NOTSTACKED\[2\] +1-4 10-14 test NOTSTACKED\[3\] +1-5 15-16 . NOTSTACKED\[4\] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedLookAlike/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedLookAlike/reference.xmi new file mode 100644 index 0000000000..5670043dd2 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedLookAlike/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedSpanWithFeatureValue/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedSpanWithFeatureValue/reference.tsv new file mode 100644 index 0000000000..b4ae177992 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedSpanWithFeatureValue/reference.tsv @@ -0,0 +1,10 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This NE[1]|NE[2] +1-2 5-7 is NE[3]|NE[4] +1-3 8-9 a NE[5]|NE[6] +1-4 10-14 test NE[7]|NE[8] +1-5 15-16 . NE[9]|NE[10] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedSpanWithFeatureValue/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedSpanWithFeatureValue/reference.xmi new file mode 100644 index 0000000000..b434b3bd4f --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTokenBoundedStackedSpanWithFeatureValue/reference.xmi @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTwoSentencesWithNoSpaceInBetween/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTwoSentencesWithNoSpaceInBetween/reference.tsv new file mode 100644 index 0000000000..5c6c823506 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTwoSentencesWithNoSpaceInBetween/reference.tsv @@ -0,0 +1,8 @@ +#FORMAT=WebAnno TSV 3.2 + + +#Text=one +1-1 0-3 one + +#Text=two +2-1 3-6 two diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTwoSentencesWithNoSpaceInBetween/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTwoSentencesWithNoSpaceInBetween/reference.xmi new file mode 100644 index 0000000000..1cba323f72 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testTwoSentencesWithNoSpaceInBetween/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.tsv new file mode 100644 index 0000000000..31ca959d59 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.tsv @@ -0,0 +1,11 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.FlexLinkHost|ROLE_webanno.custom.FlexLinkHost:links_webanno.custom.FlexLinkType|uima.tcas.Annotation + + +#Text=This is a test . +1-1 0-4 This _ _ * +1-2 5-7 is * _ _ +1-3 8-9 a * _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.xmi new file mode 100644 index 0000000000..2bccf68653 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testUnsetSlotFeature/reference.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature1/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature1/reference.tsv new file mode 100644 index 0000000000..cd2989bf4e --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature1/reference.tsv @@ -0,0 +1,12 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=This is a test . +1-1 0-4 This _ _ _ +1-1.1 0-0 _ p1;p2 1-2[1];1-2[2] +1-2 5-7 is *[1]|*[2] _ _ +1-3 8-9 a *[1]|*[2] _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature1/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature1/reference.xmi new file mode 100644 index 0000000000..ec186c80de --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature1/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature2/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature2/reference.tsv new file mode 100644 index 0000000000..28d5f8a136 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature2/reference.tsv @@ -0,0 +1,12 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| +#T_SP=webanno.custom.SimpleLinkHost|ROLE_webanno.custom.SimpleLinkHost:links_webanno.custom.LinkType|webanno.custom.SimpleSpan + + +#Text=This is a test . +1-1 0-4 This _ p1;p2 1-2[1];1-3.1 +1-2 5-7 is *[1] _ _ +1-3 8-9 a *[1] _ _ +1-3.1 9-9 * _ _ +1-4 10-14 test _ _ _ +1-5 15-16 . _ _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature2/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature2/reference.xmi new file mode 100644 index 0000000000..f194cbdb3e --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSlotFeature2/reference.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpanBetweenAdjacentTokens/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpanBetweenAdjacentTokens/reference.tsv new file mode 100644 index 0000000000..f48409f67e --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpanBetweenAdjacentTokens/reference.tsv @@ -0,0 +1,8 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| + + +#Text=word. +1-1 0-4 word _ +1-1.1 4-4 * +1-2 4-5 . _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpanBetweenAdjacentTokens/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpanBetweenAdjacentTokens/reference.xmi new file mode 100644 index 0000000000..c31fc75028 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpanBetweenAdjacentTokens/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithFeatureValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithFeatureValues/reference.tsv new file mode 100644 index 0000000000..aa040ab629 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithFeatureValues/reference.tsv @@ -0,0 +1,12 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This _ +1-1.1 0-0 PERSON +1-2 5-7 is _ +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ +1-5.1 16-16 ORG diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithFeatureValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithFeatureValues/reference.xmi new file mode 100644 index 0000000000..dea862f1b7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithFeatureValues/reference.xmi @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatureValues/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatureValues/reference.tsv new file mode 100644 index 0000000000..575a10e0eb --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatureValues/reference.tsv @@ -0,0 +1,12 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Span|value + + +#Text=This is a test . +1-1 0-4 This _ +1-1.1 0-0 * +1-2 5-7 is _ +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ +1-5.1 16-16 * diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatureValues/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatureValues/reference.xmi new file mode 100644 index 0000000000..679557b3eb --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatureValues/reference.xmi @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatures/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatures/reference.tsv new file mode 100644 index 0000000000..e4aff478d8 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatures/reference.tsv @@ -0,0 +1,12 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| + + +#Text=This is a test . +1-1 0-4 This _ +1-1.1 0-0 * +1-2 5-7 is _ +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ +1-5.1 16-16 * diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatures/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatures/reference.xmi new file mode 100644 index 0000000000..2bab5fc08a --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthSpansWithoutFeatures/reference.xmi @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthStackedSpansWithoutFeatures/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthStackedSpansWithoutFeatures/reference.tsv new file mode 100644 index 0000000000..d7db1238da --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthStackedSpansWithoutFeatures/reference.tsv @@ -0,0 +1,12 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.SimpleSpan| + + +#Text=This is a test . +1-1 0-4 This _ +1-1.1 0-0 *[1]|*[2] +1-2 5-7 is _ +1-3 8-9 a _ +1-4 10-14 test _ +1-5 15-16 . _ +1-5.1 16-16 *[3]|*[4] diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthStackedSpansWithoutFeatures/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthStackedSpansWithoutFeatures/reference.xmi new file mode 100644 index 0000000000..a9341d6a03 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testZeroLengthStackedSpansWithoutFeatures/reference.xmi @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3/coref.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3/coref.tsv new file mode 100644 index 0000000000..7657e5d959 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3/coref.tsv @@ -0,0 +1,5599 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|value +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma|value +#T_CH=de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink|referenceType|referenceRelation + + +#Text=_ Verhaftung – +1-1 0-1 _ _ _ _ _ +1-2 2-12 Verhaftung _ Verhaftung _ _ +1-3 13-14 – _ – _ _ + +#Text=_ Gespräch mit Frau Grubach – +2-1 15-16 _ _ _ _ _ +2-2 17-25 Gespräch _ Gespräch _ _ +2-3 26-29 mit _ mit _ _ +2-4 30-34 Frau _ Frau CM[1] COREFERENTIAL->1-1 +2-5 35-42 Grubach PER Grubach CM[1] COREFERENTIAL->1-1 +2-6 43-44 – _ – _ _ + +#Text=_ Dann _ Fräulein Bürstner +3-1 45-46 _ _ _ _ _ +3-2 47-51 Dann _ Dann _ _ +3-3 52-53 _ _ kommt _ _ +3-4 54-62 Fräulein _ Fräulein CM[62] COREFERENTIAL->62-1 +3-5 63-71 Bürstner PER Bürstner CM[62] COREFERENTIAL->62-1 + +#Text=_ Jemand mußte Josef K. verleumdet haben , denn ohne daß er etwas Böses getan hätte , wurde er eines Morgens verhaftet . +4-1 72-73 _ _ _ _ _ +4-2 74-80 Jemand _ Jemand _ _ +4-3 81-86 mußte _ musste _ _ +4-4 87-92 Josef PER[3] Josef CM[89] ANAPHORIC->89-1 +4-5 93-95 K. PER[3] K. CM[89] ANAPHORIC->89-1 +4-6 96-106 verleumdet _ verleumdet _ _ +4-7 107-112 haben _ haben _ _ +4-8 113-114 , _ , _ _ +4-9 115-119 denn _ denn _ _ +4-10 120-124 ohne _ ohne _ _ +4-11 125-128 daß _ dass _ _ +4-12 129-131 er _ er CM[89] ANAPHORIC->89-2 +4-13 132-137 etwas _ etwas _ _ +4-14 138-143 Böses _ Böses _ _ +4-15 144-149 getan _ getan _ _ +4-16 150-155 hätte _ hätte _ _ +4-17 156-157 , _ , _ _ +4-18 158-163 wurde _ wurde _ _ +4-19 164-166 er _ er CM[89] ANAPHORIC->89-3 +4-20 167-172 eines _ eines _ _ +4-21 173-180 Morgens _ Morgens _ _ +4-22 181-190 verhaftet _ verhaftet _ _ +4-23 191-192 . _ . _ _ + +#Text=_ Die Köchin der Frau Grubach , seine Zimmervermieterin , die ihm jeden Tag gegen acht Uhr früh das Frühstück brachte , kam diesmal nicht . +5-1 193-194 _ _ _ _ _ +5-2 195-198 Die _ Die CM[88] COREFERENTIAL->88-1 +5-3 199-205 Köchin _ Köchin CM[88] COREFERENTIAL->88-1 +5-4 206-209 der _ der CM[88] COREFERENTIAL->88-1 +5-5 210-214 Frau _ Frau CM[1]|CM[88] COREFERENTIAL->1-2|COREFERENTIAL->88-1 +5-6 215-222 Grubach PER Grubach CM[1]|CM[88] COREFERENTIAL->1-2|COREFERENTIAL->88-1 +5-7 223-224 , _ , _ _ +5-8 225-230 seine _ seine CM[88]|CM[89] ANAPHORIC->88-2|ANAPHORIC->89-4 +5-9 231-248 Zimmervermieterin _ Zimmervermieterin CM[88] ANAPHORIC->88-2 +5-10 249-250 , _ , _ _ +5-11 251-254 die _ die CM[88] *->88-3 +5-12 255-258 ihm _ ihm CM[89] COREFERENTIAL->89-5 +5-13 259-264 jeden _ jeden _ _ +5-14 265-268 Tag _ Tag _ _ +5-15 269-274 gegen _ gegen _ _ +5-16 275-279 acht _ acht _ _ +5-17 280-283 Uhr _ Uhr _ _ +5-18 284-288 früh _ früh _ _ +5-19 289-292 das _ das _ _ +5-20 293-302 Frühstück _ Frühstück _ _ +5-21 303-310 brachte _ brachte _ _ +5-22 311-312 , _ , _ _ +5-23 313-316 kam _ kam _ _ +5-24 317-324 diesmal _ diesmal _ _ +5-25 325-330 nicht _ nicht _ _ +5-26 331-332 . _ . _ _ + +#Text=_ Das war noch niemals geschehen . +6-1 333-334 _ _ _ _ _ +6-2 335-338 Das _ Das _ _ +6-3 339-342 war _ war _ _ +6-4 343-347 noch _ noch _ _ +6-5 348-355 niemals _ niemals _ _ +6-6 356-365 geschehen _ geschehen _ _ +6-7 366-367 . _ . _ _ + +#Text=_ K. wartete noch ein Weilchen , _ sah von seinem Kopfkissen aus die alte Frau , die ihm gegenüber wohnte und die ihn mit einer an ihr ganz ungewöhnlichen Neugierde beobachtete , dann aber , gleichzeitig befremdet und hungrig , läutete er . +7-1 368-369 _ _ _ _ _ +7-2 370-372 K. PER K. CM[89] ANAPHORIC->89-6 +7-3 373-380 wartete _ wartete _ _ +7-4 381-385 noch _ noch _ _ +7-5 386-389 ein _ ein _ _ +7-6 390-398 Weilchen _ Weilchen _ _ +7-7 399-400 , _ , _ _ +7-8 401-402 _ _ er _ _ +7-9 403-406 sah _ sah _ _ +7-10 407-410 von _ von _ _ +7-11 411-417 seinem _ seinem CM[89] ANAPHORIC->89-7 +7-12 418-428 Kopfkissen _ Kopfkissen _ _ +7-13 429-432 aus _ aus _ _ +7-14 433-436 die _ die CM[2] ANAPHORIC->2-1 +7-15 437-441 alte _ alte CM[2] ANAPHORIC->2-1 +7-16 442-446 Frau _ Frau CM[2] ANAPHORIC->2-1 +7-17 447-448 , _ , _ _ +7-18 449-452 die _ die CM[2] ANAPHORIC->2-2 +7-19 453-456 ihm _ ihm CM[89] ANAPHORIC->89-8 +7-20 457-466 gegenüber _ gegenüber _ _ +7-21 467-473 wohnte _ wohnte _ _ +7-22 474-477 und _ und _ _ +7-23 478-481 die _ die CM[2] ANAPHORIC->2-3 +7-24 482-485 ihn _ ihn CM[89] ANAPHORIC->89-9 +7-25 486-489 mit _ mit _ _ +7-26 490-495 einer _ einer _ _ +7-27 496-498 an _ an _ _ +7-28 499-502 ihr _ ihr CM[2] *->2-4 +7-29 503-507 ganz _ ganz _ _ +7-30 508-522 ungewöhnlichen _ ungewöhnlichen _ _ +7-31 523-532 Neugierde _ Neugierde _ _ +7-32 533-544 beobachtete _ beobachtete _ _ +7-33 545-546 , _ , _ _ +7-34 547-551 dann _ dann _ _ +7-35 552-556 aber _ aber _ _ +7-36 557-558 , _ , _ _ +7-37 559-571 gleichzeitig _ gleichzeitig _ _ +7-38 572-581 befremdet _ befremdet _ _ +7-39 582-585 und _ und _ _ +7-40 586-593 hungrig _ hungrig _ _ +7-41 594-595 , _ , _ _ +7-42 596-603 läutete _ läutete _ _ +7-43 604-606 er _ er CM[89] ANAPHORIC->89-10 +7-44 607-608 . _ . _ _ + +#Text=_ Sofort klopfte es und ein Mann , den er in dieser Wohnung noch niemals gesehen hatte , trat ein . +8-1 609-610 _ _ _ _ _ +8-2 611-617 Sofort _ Sofort _ _ +8-3 618-625 klopfte _ klopfte _ _ +8-4 626-628 es _ es _ _ +8-5 629-632 und _ und _ _ +8-6 633-636 ein _ ein CM[4] ANAPHORIC->4-1 +8-7 637-641 Mann _ Mann CM[4] ANAPHORIC->4-1 +8-8 642-643 , _ , _ _ +8-9 644-647 den _ den CM[4] ANAPHORIC->4-2 +8-10 648-650 er _ er CM[89] COREFERENTIAL->89-11 +8-11 651-653 in _ in _ _ +8-12 654-660 dieser _ dieser _ _ +8-13 661-668 Wohnung _ Wohnung _ _ +8-14 669-673 noch _ noch _ _ +8-15 674-681 niemals _ niemals _ _ +8-16 682-689 gesehen _ gesehen _ _ +8-17 690-695 hatte _ hatte _ _ +8-18 696-697 , _ , _ _ +8-19 698-702 trat _ trat _ _ +8-20 703-706 ein _ ein _ _ +8-21 707-708 . _ . _ _ + +#Text=_ Er war schlank und doch _ _ fest gebaut , er trug ein anliegendes schwarzes Kleid , das , ähnlich den Reiseanzügen , mit verschiedenen Falten , Taschen , Schnallen , Knöpfen und einem Gürtel versehen war und _ infolgedessen , ohne daß man sich darüber klar wurde , wozu es dienen sollte , besonders praktisch erschien . +9-1 709-710 _ _ _ _ _ +9-2 711-713 Er _ Er CM[4] ANAPHORIC->4-3 +9-3 714-717 war _ war _ _ +9-4 718-725 schlank _ schlank _ _ +9-5 726-729 und _ und _ _ +9-6 730-734 doch _ doch _ _ +9-7 735-736 _ _ war _ _ +9-8 737-738 _ _ er _ _ +9-9 739-743 fest _ fest _ _ +9-10 744-750 gebaut _ gebaut _ _ +9-11 751-752 , _ , _ _ +9-12 753-755 er _ er CM[4] ANAPHORIC->4-4 +9-13 756-760 trug _ trug _ _ +9-14 761-764 ein _ ein CM[3] ANAPHORIC->3-1 +9-15 765-776 anliegendes _ anliegendes CM[3] ANAPHORIC->3-1 +9-16 777-786 schwarzes _ schwarzes CM[3] ANAPHORIC->3-1 +9-17 787-792 Kleid _ Kleid CM[3] ANAPHORIC->3-1 +9-18 793-794 , _ , _ _ +9-19 795-798 das _ das CM[3] ANAPHORIC->3-2 +9-20 799-800 , _ , _ _ +9-21 801-808 ähnlich _ ähnlich _ _ +9-22 809-812 den _ den _ _ +9-23 813-825 Reiseanzügen _ Reiseanzügen _ _ +9-24 826-827 , _ , _ _ +9-25 828-831 mit _ mit _ _ +9-26 832-845 verschiedenen _ verschiedenen _ _ +9-27 846-852 Falten _ Falten _ _ +9-28 853-854 , _ , _ _ +9-29 855-862 Taschen _ Taschen _ _ +9-30 863-864 , _ , _ _ +9-31 865-874 Schnallen _ Schnallen _ _ +9-32 875-876 , _ , _ _ +9-33 877-884 Knöpfen _ Knöpfen _ _ +9-34 885-888 und _ und _ _ +9-35 889-894 einem _ einem _ _ +9-36 895-901 Gürtel _ Gürtel _ _ +9-37 902-910 versehen _ versehen _ _ +9-38 911-914 war _ war _ _ +9-39 915-918 und _ und _ _ +9-40 919-920 _ _ das _ _ +9-41 921-934 infolgedessen _ infolgedessen _ _ +9-42 935-936 , _ , _ _ +9-43 937-941 ohne _ ohne _ _ +9-44 942-945 daß _ dass _ _ +9-45 946-949 man _ man _ _ +9-46 950-954 sich _ sich _ _ +9-47 955-962 darüber _ darüber _ _ +9-48 963-967 klar _ klar _ _ +9-49 968-973 wurde _ wurde _ _ +9-50 974-975 , _ , _ _ +9-51 976-980 wozu _ wozu _ _ +9-52 981-983 es _ es CM[3] *->3-3 +9-53 984-990 dienen _ dienen _ _ +9-54 991-997 sollte _ sollte _ _ +9-55 998-999 , _ , _ _ +9-56 1000-1009 besonders _ besonders _ _ +9-57 1010-1019 praktisch _ praktisch _ _ +9-58 1020-1028 erschien _ erschien _ _ +9-59 1029-1030 . _ . _ _ + +#Text=_ " Wer sind Sie ? " fragte K. und saß gleich halb aufrecht im Bett . +10-1 1031-1032 _ _ _ _ _ +10-2 1033-1034 " _ " _ _ +10-3 1035-1038 Wer _ Wer _ _ +10-4 1039-1043 sind _ sind _ _ +10-5 1044-1047 Sie _ Sie CM[4] ANAPHORIC->4-5 +10-6 1048-1049 ? _ ? _ _ +10-7 1050-1051 " _ " _ _ +10-8 1052-1058 fragte _ fragte _ _ +10-9 1059-1061 K. PER K. CM[89] ANAPHORIC->89-12 +10-10 1062-1065 und _ und _ _ +10-11 1066-1069 saß _ saß _ _ +10-12 1070-1076 gleich _ gleich _ _ +10-13 1077-1081 halb _ halb _ _ +10-14 1082-1090 aufrecht _ aufrecht _ _ +10-15 1091-1093 im _ im _ _ +10-16 1094-1098 Bett _ Bett _ _ +10-17 1099-1100 . _ . _ _ + +#Text=_ Der Mann aber ging über die Frage hinweg , als müsse man seine Erscheinung hinnehmen , und _ sagte bloß seinerseits : " Sie haben geläutet ? " +11-1 1101-1102 _ _ _ _ _ +11-2 1103-1106 Der _ Der CM[4] ANAPHORIC->4-6 +11-3 1107-1111 Mann _ Mann CM[4] ANAPHORIC->4-6 +11-4 1112-1116 aber _ aber _ _ +11-5 1117-1121 ging _ ging _ _ +11-6 1122-1126 über _ über _ _ +11-7 1127-1130 die _ die _ _ +11-8 1131-1136 Frage _ Frage _ _ +11-9 1137-1143 hinweg _ hinweg _ _ +11-10 1144-1145 , _ , _ _ +11-11 1146-1149 als _ als _ _ +11-12 1150-1155 müsse _ müsse _ _ +11-13 1156-1159 man _ man _ _ +11-14 1160-1165 seine _ seine CM[4] ANAPHORIC->4-7 +11-15 1166-1177 Erscheinung _ Erscheinung _ _ +11-16 1178-1187 hinnehmen _ hinnehmen _ _ +11-17 1188-1189 , _ , _ _ +11-18 1190-1193 und _ und _ _ +11-19 1194-1195 _ _ er _ _ +11-20 1196-1201 sagte _ sagte _ _ +11-21 1202-1206 bloß _ bloß _ _ +11-22 1207-1218 seinerseits _ seinerseits _ _ +11-23 1219-1220 : _ : _ _ +11-24 1221-1222 " _ " _ _ +11-25 1223-1226 Sie _ Sie CM[89] ANAPHORIC->89-13 +11-26 1227-1232 haben _ haben _ _ +11-27 1233-1241 geläutet _ geläutet _ _ +11-28 1242-1243 ? _ ? _ _ +11-29 1244-1245 " _ " _ _ + +#Text=_ " Anna soll mir das Frühstück bringen " , sagte K. und _ versuchte , zunächst stillschweigend , durch Aufmerksamkeit und Überlegung festzustellen , wer der Mann eigentlich war . +12-1 1246-1247 _ _ _ _ _ +12-2 1248-1249 " _ " _ _ +12-3 1250-1254 Anna _ Anna CM[7] COREFERENTIAL->7-1 +12-4 1255-1259 soll _ soll _ _ +12-5 1260-1263 mir _ mir CM[89] COREFERENTIAL->89-14 +12-6 1264-1267 das _ das CM[8] ANAPHORIC->8-1 +12-7 1268-1277 Frühstück _ Frühstück CM[8] ANAPHORIC->8-1 +12-8 1278-1285 bringen _ bringen _ _ +12-9 1286-1287 " _ " _ _ +12-10 1288-1289 , _ , _ _ +12-11 1290-1295 sagte _ sagte _ _ +12-12 1296-1298 K. PER K. CM[89] ANAPHORIC->89-15 +12-13 1299-1302 und _ und _ _ +12-14 1303-1304 _ _ er _ _ +12-15 1305-1314 versuchte _ versuchte _ _ +12-16 1315-1316 , _ , _ _ +12-17 1317-1325 zunächst _ zunächst _ _ +12-18 1326-1341 stillschweigend _ stillschweigend _ _ +12-19 1342-1343 , _ , _ _ +12-20 1344-1349 durch _ durch _ _ +12-21 1350-1364 Aufmerksamkeit _ Aufmerksamkeit _ _ +12-22 1365-1368 und _ und _ _ +12-23 1369-1379 Überlegung _ Überlegung _ _ +12-24 1380-1393 festzustellen _ festzustellen _ _ +12-25 1394-1395 , _ , _ _ +12-26 1396-1399 wer _ wer _ _ +12-27 1400-1403 der _ der CM[4] ANAPHORIC->4-8 +12-28 1404-1408 Mann _ Mann CM[4] ANAPHORIC->4-8 +12-29 1409-1419 eigentlich _ eigentlich _ _ +12-30 1420-1423 war _ war _ _ +12-31 1424-1425 . _ . _ _ + +#Text=_ Aber dieser setzte sich nicht allzu| lange seinen Blicken aus , sondern er wandte sich zur Tür , die er ein wenig öffnete , um jemandem , der offenbar knapp hinter der Tür stand , zu sagen : " Er will , daß Anna ihm das Frühstück bringt . " +13-1 1426-1427 _ _ _ _ _ +13-2 1428-1432 Aber _ Aber _ _ +13-3 1433-1439 dieser _ dieser CM[4] ANAPHORIC->4-9 +13-4 1440-1446 setzte _ setzte _ _ +13-5 1447-1451 sich _ sich CM[4] ANAPHORIC->4-10 +13-6 1452-1457 nicht _ nicht _ _ +13-7 1458-1464 allzu| _ allzu _ _ +13-8 1465-1470 lange _ lange _ _ +13-9 1471-1477 seinen _ seinen CM[89] ANAPHORIC->89-16 +13-10 1478-1485 Blicken _ Blicken _ _ +13-11 1486-1489 aus _ aus _ _ +13-12 1490-1491 , _ , _ _ +13-13 1492-1499 sondern _ sondern _ _ +13-14 1500-1502 er _ _ CM[4] ANAPHORIC->4-11 +13-15 1503-1509 wandte _ wandte _ _ +13-16 1510-1514 sich _ sich CM[4] ANAPHORIC->4-12 +13-17 1515-1518 zur _ zur CM[5] ANAPHORIC->5-1 +13-18 1519-1522 Tür _ Tür CM[5] ANAPHORIC->5-1 +13-19 1523-1524 , _ , _ _ +13-20 1525-1528 die _ die CM[5] ANAPHORIC->5-2 +13-21 1529-1531 er _ er CM[4] ANAPHORIC->4-13 +13-22 1532-1535 ein _ ein _ _ +13-23 1536-1541 wenig _ wenig _ _ +13-24 1542-1549 öffnete _ öffnete _ _ +13-25 1550-1551 , _ , _ _ +13-26 1552-1554 um _ um _ _ +13-27 1555-1563 jemandem _ jemandem CM[6] ANAPHORIC->6-1 +13-28 1564-1565 , _ , _ _ +13-29 1566-1569 der _ der CM[6] *->6-2 +13-30 1570-1578 offenbar _ offenbar _ _ +13-31 1579-1584 knapp _ knapp _ _ +13-32 1585-1591 hinter _ hinter _ _ +13-33 1592-1595 der _ der CM[5] *->5-3 +13-34 1596-1599 Tür _ Tür CM[5] *->5-3 +13-35 1600-1605 stand _ stand _ _ +13-36 1606-1607 , _ , _ _ +13-37 1608-1610 zu _ zu _ _ +13-38 1611-1616 sagen _ sagen _ _ +13-39 1617-1618 : _ : _ _ +13-40 1619-1620 " _ " _ _ +13-41 1621-1623 Er _ Er CM[89] ANAPHORIC->89-17 +13-42 1624-1628 will _ will _ _ +13-43 1629-1630 , _ , _ _ +13-44 1631-1634 daß _ dass _ _ +13-45 1635-1639 Anna _ Anna CM[7] *->7-2 +13-46 1640-1643 ihm _ ihm CM[89] COREFERENTIAL->89-18 +13-47 1644-1647 das _ das CM[8] *->8-2 +13-48 1648-1657 Frühstück _ Frühstück CM[8] *->8-2 +13-49 1658-1664 bringt _ bringt _ _ +13-50 1665-1666 . _ . _ _ +13-51 1667-1668 " _ " _ _ + +#Text=_ Ein kleines Gelächter im Nebenzimmer folgte , es war nach dem Klang nicht sicher , ob nicht mehrere Personen daran beteiligt waren . +14-1 1669-1670 _ _ _ _ _ +14-2 1671-1674 Ein _ Ein _ _ +14-3 1675-1682 kleines _ kleines _ _ +14-4 1683-1692 Gelächter _ Gelächter _ _ +14-5 1693-1695 im _ im CM[32] COREFERENTIAL->32-1 +14-6 1696-1707 Nebenzimmer _ Nebenzimmer CM[32] COREFERENTIAL->32-1 +14-7 1708-1714 folgte _ folgte _ _ +14-8 1715-1716 , _ , _ _ +14-9 1717-1719 es _ es _ _ +14-10 1720-1723 war _ war _ _ +14-11 1724-1728 nach _ nach _ _ +14-12 1729-1732 dem _ dem _ _ +14-13 1733-1738 Klang _ Klang _ _ +14-14 1739-1744 nicht _ nicht _ _ +14-15 1745-1751 sicher _ sicher _ _ +14-16 1752-1753 , _ , _ _ +14-17 1754-1756 ob _ ob _ _ +14-18 1757-1762 nicht _ nicht _ _ +14-19 1763-1770 mehrere _ mehrere _ _ +14-20 1771-1779 Personen _ Personen _ _ +14-21 1780-1785 daran _ daran _ _ +14-22 1786-1795 beteiligt _ beteiligt _ _ +14-23 1796-1801 waren _ waren _ _ +14-24 1802-1803 . _ . _ _ + +#Text=_ Obwohl der fremde Mann dadurch nichts erfahren haben konnte , was er nicht schon früher gewußt hätte , sagte er nun doch zu K. im Tone einer Meldung : " Es ist unmöglich . " +15-1 1804-1805 _ _ _ _ _ +15-2 1806-1812 Obwohl _ Obwohl _ _ +15-3 1813-1816 der _ der CM[4] ANAPHORIC->4-14 +15-4 1817-1823 fremde _ fremde CM[4] ANAPHORIC->4-14 +15-5 1824-1828 Mann _ Mann CM[4] ANAPHORIC->4-14 +15-6 1829-1836 dadurch _ dadurch _ _ +15-7 1837-1843 nichts _ nichts CM[9] BOUND->9-1 +15-8 1844-1852 erfahren _ erfahren _ _ +15-9 1853-1858 haben _ haben _ _ +15-10 1859-1865 konnte _ konnte _ _ +15-11 1866-1867 , _ , _ _ +15-12 1868-1871 was _ was CM[9] *->9-2 +15-13 1872-1874 er _ er CM[4] ANAPHORIC->4-15 +15-14 1875-1880 nicht _ nicht _ _ +15-15 1881-1886 schon _ schon _ _ +15-16 1887-1893 früher _ früher _ _ +15-17 1894-1900 gewußt _ gewusst _ _ +15-18 1901-1906 hätte _ hätte _ _ +15-19 1907-1908 , _ , _ _ +15-20 1909-1914 sagte _ sagte _ _ +15-21 1915-1917 er _ er CM[4] ANAPHORIC->4-16 +15-22 1918-1921 nun _ nun _ _ +15-23 1922-1926 doch _ doch _ _ +15-24 1927-1929 zu _ zu _ _ +15-25 1930-1932 K. PER K. CM[89] COREFERENTIAL->89-19 +15-26 1933-1935 im _ im _ _ +15-27 1936-1940 Tone _ Tone _ _ +15-28 1941-1946 einer _ einer _ _ +15-29 1947-1954 Meldung _ Meldung _ _ +15-30 1955-1956 : _ : _ _ +15-31 1957-1958 " _ " _ _ +15-32 1959-1961 Es _ Es _ _ +15-33 1962-1965 ist _ ist _ _ +15-34 1966-1975 unmöglich _ unmöglich _ _ +15-35 1976-1977 . _ . _ _ +15-36 1978-1979 " _ " _ _ + +#Text=_ " Das wäre neu " , sagte K. , sprang aus dem Bett und zog rasch seine Hosen an . +16-1 1980-1981 _ _ _ _ _ +16-2 1982-1983 " _ " _ _ +16-3 1984-1987 Das _ Das _ _ +16-4 1988-1992 wäre _ wäre _ _ +16-5 1993-1996 neu _ neu _ _ +16-6 1997-1998 " _ " _ _ +16-7 1999-2000 , _ , _ _ +16-8 2001-2006 sagte _ sagte _ _ +16-9 2007-2009 K. PER K. CM[89] ANAPHORIC->89-20 +16-10 2010-2011 , _ , _ _ +16-11 2012-2018 sprang _ sprang _ _ +16-12 2019-2022 aus _ aus _ _ +16-13 2023-2026 dem _ dem _ _ +16-14 2027-2031 Bett _ Bett _ _ +16-15 2032-2035 und _ und _ _ +16-16 2036-2039 zog _ zog _ _ +16-17 2040-2045 rasch _ rasch _ _ +16-18 2046-2051 seine _ seine CM[89] ANAPHORIC->89-21 +16-19 2052-2057 Hosen _ Hosen _ _ +16-20 2058-2060 an _ an _ _ +16-21 2061-2062 . _ . _ _ + +#Text=_ " Ich will doch sehen , was für Leute im Nebenzimmer sind und wie Frau Grubach diese Störung mir gegenüber verantworten wird . " +17-1 2063-2064 _ _ _ _ _ +17-2 2065-2066 " _ " _ _ +17-3 2067-2070 Ich _ Ich _ _ +17-4 2071-2075 will _ will _ _ +17-5 2076-2080 doch _ doch _ _ +17-6 2081-2086 sehen _ sehen _ _ +17-7 2087-2088 , _ , _ _ +17-8 2089-2092 was _ was _ _ +17-9 2093-2096 für _ für _ _ +17-10 2097-2102 Leute _ Leute _ _ +17-11 2103-2105 im _ im _ _ +17-12 2106-2117 Nebenzimmer _ Nebenzimmer _ _ +17-13 2118-2122 sind _ sind _ _ +17-14 2123-2126 und _ und _ _ +17-15 2127-2130 wie _ wie _ _ +17-16 2131-2135 Frau _ Frau CM[1] COREFERENTIAL->1-3 +17-17 2136-2143 Grubach PER Grubach CM[1] COREFERENTIAL->1-3 +17-18 2144-2149 diese _ diese _ _ +17-19 2150-2157 Störung _ Störung _ _ +17-20 2158-2161 mir _ mir _ _ +17-21 2162-2171 gegenüber _ gegenüber _ _ +17-22 2172-2184 verantworten _ verantworten _ _ +17-23 2185-2189 wird _ wird _ _ +17-24 2190-2191 . _ . _ _ +17-25 2192-2193 " _ " _ _ + +#Text=_ Es fiel ihm zwar gleich ein , daß er das nicht hätte laut sagen müssen und daß er dadurch gewissermaßen ein Beaufsichtigungsrecht des Fremden anerkannte , aber es schien ihm jetzt nicht wichtig . +18-1 2194-2195 _ _ _ _ _ +18-2 2196-2198 Es _ Es _ _ +18-3 2199-2203 fiel _ fiel _ _ +18-4 2204-2207 ihm _ ihm CM[89] ANAPHORIC->89-22 +18-5 2208-2212 zwar _ zwar _ _ +18-6 2213-2219 gleich _ gleich _ _ +18-7 2220-2223 ein _ ein _ _ +18-8 2224-2225 , _ , _ _ +18-9 2226-2229 daß _ dass _ _ +18-10 2230-2232 er _ er CM[89] ANAPHORIC->89-23 +18-11 2233-2236 das _ das _ _ +18-12 2237-2242 nicht _ nicht _ _ +18-13 2243-2248 hätte _ hätte _ _ +18-14 2249-2253 laut _ laut _ _ +18-15 2254-2259 sagen _ sagen _ _ +18-16 2260-2266 müssen _ müssen _ _ +18-17 2267-2270 und _ und _ _ +18-18 2271-2274 daß _ dass _ _ +18-19 2275-2277 er _ er CM[89] ANAPHORIC->89-24 +18-20 2278-2285 dadurch _ dadurch _ _ +18-21 2286-2299 gewissermaßen _ gewissermaßen _ _ +18-22 2300-2303 ein _ ein _ _ +18-23 2304-2325 Beaufsichtigungsrecht _ Beaufsichtigungsrecht _ _ +18-24 2326-2329 des _ des CM[4] ANAPHORIC->4-17 +18-25 2330-2337 Fremden _ Fremden CM[4] ANAPHORIC->4-17 +18-26 2338-2348 anerkannte _ anerkannte _ _ +18-27 2349-2350 , _ , _ _ +18-28 2351-2355 aber _ aber _ _ +18-29 2356-2358 es _ es _ _ +18-30 2359-2365 schien _ schien _ _ +18-31 2366-2369 ihm _ ihm CM[89] COREFERENTIAL->89-25 +18-32 2370-2375 jetzt _ jetzt _ _ +18-33 2376-2381 nicht _ nicht _ _ +18-34 2382-2389 wichtig _ wichtig _ _ +18-35 2390-2391 . _ . _ _ + +#Text=_ Immerhin faßte es der Fremde so auf , denn er sagte : " Wollen Sie nicht lieber hierbleiben ? " +19-1 2392-2393 _ _ _ _ _ +19-2 2394-2402 Immerhin _ Immerhin _ _ +19-3 2403-2408 faßte _ fasste _ _ +19-4 2409-2411 es _ es _ _ +19-5 2412-2415 der _ der CM[4] ANAPHORIC->4-18 +19-6 2416-2422 Fremde _ Fremde CM[4] ANAPHORIC->4-18 +19-7 2423-2425 so _ so _ _ +19-8 2426-2429 auf _ auf _ _ +19-9 2430-2431 , _ , _ _ +19-10 2432-2436 denn _ denn _ _ +19-11 2437-2439 er _ er CM[4] ANAPHORIC->4-19 +19-12 2440-2445 sagte _ sagte _ _ +19-13 2446-2447 : _ : _ _ +19-14 2448-2449 " _ " _ _ +19-15 2450-2456 Wollen _ Wollen _ _ +19-16 2457-2460 Sie _ Sie _ _ +19-17 2461-2466 nicht _ nicht _ _ +19-18 2467-2473 lieber _ lieber _ _ +19-19 2474-2485 hierbleiben _ hierbleiben _ _ +19-20 2486-2487 ? _ ? _ _ +19-21 2488-2489 " _ " _ _ + +#Text=_ " Ich will weder hierbleiben , noch _ _ von Ihnen angesprochen werden , solange Sie sich mir nicht vorstellen . " +20-1 2490-2491 _ _ _ _ _ +20-2 2492-2493 " _ " _ _ +20-3 2494-2497 Ich _ Ich _ _ +20-4 2498-2502 will _ will _ _ +20-5 2503-2508 weder _ weder _ _ +20-6 2509-2520 hierbleiben _ hierbleiben _ _ +20-7 2521-2522 , _ , _ _ +20-8 2523-2527 noch _ noch _ _ +20-9 2528-2529 _ _ will _ _ +20-10 2530-2531 _ _ ich _ _ +20-11 2532-2535 von _ von _ _ +20-12 2536-2541 Ihnen _ Ihnen _ _ +20-13 2542-2554 angesprochen _ angesprochen _ _ +20-14 2555-2561 werden _ werden _ _ +20-15 2562-2563 , _ , _ _ +20-16 2564-2571 solange _ solange _ _ +20-17 2572-2575 Sie _ Sie _ _ +20-18 2576-2580 sich _ sich _ _ +20-19 2581-2584 mir _ mir _ _ +20-20 2585-2590 nicht _ nicht _ _ +20-21 2591-2601 vorstellen _ vorstellen _ _ +20-22 2602-2603 . _ . _ _ +20-23 2604-2605 " _ " _ _ + +#Text=_ " Es war gut gemeint " , sagte der Fremde und _ öffnete nun freiwillig die Tür . +21-1 2606-2607 _ _ _ _ _ +21-2 2608-2609 " _ " _ _ +21-3 2610-2612 Es _ Es _ _ +21-4 2613-2616 war _ war _ _ +21-5 2617-2620 gut _ gut _ _ +21-6 2621-2628 gemeint _ gemeint _ _ +21-7 2629-2630 " _ " _ _ +21-8 2631-2632 , _ , _ _ +21-9 2633-2638 sagte _ sagte _ _ +21-10 2639-2642 der _ der CM[4] *->4-20 +21-11 2643-2649 Fremde _ Fremde CM[4] *->4-20 +21-12 2650-2653 und _ und _ _ +21-13 2654-2655 _ _ er _ _ +21-14 2656-2663 öffnete _ öffnete _ _ +21-15 2664-2667 nun _ nun _ _ +21-16 2668-2678 freiwillig _ freiwillig _ _ +21-17 2679-2682 die _ die _ _ +21-18 2683-2686 Tür _ Tür _ _ +21-19 2687-2688 . _ . _ _ + +#Text=_ Im Nebenzimmer , in das K. langsamer eintrat , als er wollte , sah es auf den ersten Blick fast genau so aus wie am Abend vorher . +22-1 2689-2690 _ _ _ _ _ +22-2 2691-2693 Im _ Im CM[10] ANAPHORIC->10-1 +22-3 2694-2705 Nebenzimmer _ Nebenzimmer CM[10] ANAPHORIC->10-1 +22-4 2706-2707 , _ , _ _ +22-5 2708-2710 in _ in _ _ +22-6 2711-2714 das _ das CM[10] ANAPHORIC->10-2 +22-7 2715-2717 K. PER K. CM[89] ANAPHORIC->89-26 +22-8 2718-2727 langsamer _ langsamer _ _ +22-9 2728-2735 eintrat _ eintrat _ _ +22-10 2736-2737 , _ , _ _ +22-11 2738-2741 als _ als _ _ +22-12 2742-2744 er _ er CM[89] COREFERENTIAL->89-27 +22-13 2745-2751 wollte _ wollte _ _ +22-14 2752-2753 , _ , _ _ +22-15 2754-2757 sah _ sah _ _ +22-16 2758-2760 es _ es _ _ +22-17 2761-2764 auf _ auf _ _ +22-18 2765-2768 den _ den _ _ +22-19 2769-2775 ersten _ ersten _ _ +22-20 2776-2781 Blick _ Blick _ _ +22-21 2782-2786 fast _ fast _ _ +22-22 2787-2792 genau _ genau _ _ +22-23 2793-2795 so _ so _ _ +22-24 2796-2799 aus _ aus _ _ +22-25 2800-2803 wie _ wie _ _ +22-26 2804-2806 am _ am _ _ +22-27 2807-2812 Abend _ Abend _ _ +22-28 2813-2819 vorher _ vorher _ _ +22-29 2820-2821 . _ . _ _ + +#Text=_ Es war das Wohnzimmer der Frau Grubach , vielleicht war in diesem mit Möbeln , Decken , Porzellan und Photographien überfüllten Zimmer heute ein wenig mehr Raum als sonst , man erkannte das nicht gleich , um so weniger , als die Hauptveränderung in der Anwesenheit eines Mannes bestand , der beim offenen Fenster mit einem Buch saß , von dem er jetzt aufblickte . +23-1 2822-2823 _ _ _ _ _ +23-2 2824-2826 Es _ Es CM[10] *->10-3 +23-3 2827-2830 war _ war _ _ +23-4 2831-2834 das _ das CM[11] COREFERENTIAL->11-1 +23-5 2835-2845 Wohnzimmer _ Wohnzimmer CM[11] COREFERENTIAL->11-1 +23-6 2846-2849 der _ der CM[11] COREFERENTIAL->11-1 +23-7 2850-2854 Frau _ Frau CM[1]|CM[11] COREFERENTIAL->1-4|COREFERENTIAL->11-1 +23-8 2855-2862 Grubach PER Grubach CM[1]|CM[11] COREFERENTIAL->1-4|COREFERENTIAL->11-1 +23-9 2863-2864 , _ , _ _ +23-10 2865-2875 vielleicht _ vielleicht _ _ +23-11 2876-2879 war _ war _ _ +23-12 2880-2882 in _ in _ _ +23-13 2883-2889 diesem _ diesem CM[11] *->11-2 +23-14 2890-2893 mit _ mit CM[11] *->11-2 +23-15 2894-2900 Möbeln _ Möbeln CM[11] *->11-2 +23-16 2901-2902 , _ , CM[11] *->11-2 +23-17 2903-2909 Decken _ Decken CM[11] *->11-2 +23-18 2910-2911 , _ , CM[11] *->11-2 +23-19 2912-2921 Porzellan _ Porzellan CM[11] *->11-2 +23-20 2922-2925 und _ und CM[11] *->11-2 +23-21 2926-2939 Photographien _ Photographien CM[11] *->11-2 +23-22 2940-2951 überfüllten _ überfüllten CM[11] *->11-2 +23-23 2952-2958 Zimmer _ Zimmer CM[11] *->11-2 +23-24 2959-2964 heute _ heute _ _ +23-25 2965-2968 ein _ ein _ _ +23-26 2969-2974 wenig _ wenig _ _ +23-27 2975-2979 mehr _ mehr _ _ +23-28 2980-2984 Raum _ Raum _ _ +23-29 2985-2988 als _ als _ _ +23-30 2989-2994 sonst _ sonst _ _ +23-31 2995-2996 , _ , _ _ +23-32 2997-3000 man _ man _ _ +23-33 3001-3009 erkannte _ erkannte _ _ +23-34 3010-3013 das _ das _ _ +23-35 3014-3019 nicht _ nicht _ _ +23-36 3020-3026 gleich _ gleich _ _ +23-37 3027-3028 , _ , _ _ +23-38 3029-3031 um _ um _ _ +23-39 3032-3034 so _ so _ _ +23-40 3035-3042 weniger _ weniger _ _ +23-41 3043-3044 , _ , _ _ +23-42 3045-3048 als _ als _ _ +23-43 3049-3052 die _ die _ _ +23-44 3053-3069 Hauptveränderung _ Hauptveränderung _ _ +23-45 3070-3072 in _ in _ _ +23-46 3073-3076 der _ der _ _ +23-47 3077-3088 Anwesenheit _ Anwesenheit _ _ +23-48 3089-3094 eines _ eines CM[12] ANAPHORIC->12-1 +23-49 3095-3101 Mannes _ Mannes CM[12] ANAPHORIC->12-1 +23-50 3102-3109 bestand _ bestand _ _ +23-51 3110-3111 , _ , _ _ +23-52 3112-3115 der _ der CM[12] ANAPHORIC->12-2 +23-53 3116-3120 beim _ beim CM[15] COREFERENTIAL->15-1 +23-54 3121-3128 offenen _ offenen CM[15] COREFERENTIAL->15-1 +23-55 3129-3136 Fenster _ Fenster CM[15] COREFERENTIAL->15-1 +23-56 3137-3140 mit _ mit _ _ +23-57 3141-3146 einem _ einem CM[13] ANAPHORIC->13-1 +23-58 3147-3151 Buch _ Buch CM[13] ANAPHORIC->13-1 +23-59 3152-3155 saß _ saß _ _ +23-60 3156-3157 , _ , _ _ +23-61 3158-3161 von _ von _ _ +23-62 3162-3165 dem _ dem CM[13] *->13-2 +23-63 3166-3168 er _ er CM[12] *->12-3 +23-64 3169-3174 jetzt _ jetzt _ _ +23-65 3175-3185 aufblickte _ aufblickte _ _ +23-66 3186-3187 . _ . _ _ + +#Text=_ " Sie hätten in Ihrem Zimmer bleiben sollen ! +24-1 3188-3189 _ _ _ _ _ +24-2 3190-3191 " _ " _ _ +24-3 3192-3195 Sie _ Sie _ _ +24-4 3196-3202 hätten _ hätten _ _ +24-5 3203-3205 in _ in _ _ +24-6 3206-3211 Ihrem _ Ihrem _ _ +24-7 3212-3218 Zimmer _ Zimmer _ _ +24-8 3219-3226 bleiben _ bleiben _ _ +24-9 3227-3233 sollen _ sollen _ _ +24-10 3234-3235 ! _ ! _ _ + +#Text=_ Hat es Ihnen denn Franz nicht gesagt ? " +25-1 3236-3237 _ _ _ _ _ +25-2 3238-3241 Hat _ Hat _ _ +25-3 3242-3244 es _ es _ _ +25-4 3245-3250 Ihnen _ Ihnen _ _ +25-5 3251-3255 denn _ denn _ _ +25-6 3256-3261 Franz PER Franz CM[14] COREFERENTIAL->14-1 +25-7 3262-3267 nicht _ nicht _ _ +25-8 3268-3274 gesagt _ gesagt _ _ +25-9 3275-3276 ? _ ? _ _ +25-10 3277-3278 " _ " _ _ + +#Text=_ " Ja , was wollen Sie denn ? " sagte K. und _ sah von der neuen Bekanntschaft zu dem mit Franz Benannten , der in der Tür stehengeblieben war , und dann _ _ wieder zurück . +26-1 3279-3280 _ _ _ _ _ +26-2 3281-3282 " _ " _ _ +26-3 3283-3285 Ja _ Ja _ _ +26-4 3286-3287 , _ , _ _ +26-5 3288-3291 was _ was _ _ +26-6 3292-3298 wollen _ wollen _ _ +26-7 3299-3302 Sie _ Sie _ _ +26-8 3303-3307 denn _ denn _ _ +26-9 3308-3309 ? _ ? _ _ +26-10 3310-3311 " _ " _ _ +26-11 3312-3317 sagte _ sagte _ _ +26-12 3318-3320 K. PER K. CM[89] COREFERENTIAL->89-28 +26-13 3321-3324 und _ und _ _ +26-14 3325-3326 _ _ er _ _ +26-15 3327-3330 sah _ sah _ _ +26-16 3331-3334 von _ von _ _ +26-17 3335-3338 der _ der CM[17] COREFERENTIAL->17-1 +26-18 3339-3344 neuen _ neuen CM[17] COREFERENTIAL->17-1 +26-19 3345-3358 Bekanntschaft _ Bekanntschaft CM[17] COREFERENTIAL->17-1 +26-20 3359-3361 zu _ zu _ _ +26-21 3362-3365 dem _ dem CM[14] ANAPHORIC->14-2 +26-22 3366-3369 mit _ mit CM[14] ANAPHORIC->14-2 +26-23 3370-3375 Franz PER Franz CM[14] ANAPHORIC->14-2 +26-24 3376-3385 Benannten _ Benannten CM[14] ANAPHORIC->14-2 +26-25 3386-3387 , _ , _ _ +26-26 3388-3391 der _ der CM[14] ANAPHORIC->14-3 +26-27 3392-3394 in _ in _ _ +26-28 3395-3398 der _ der _ _ +26-29 3399-3402 Tür _ Tür _ _ +26-30 3403-3418 stehengeblieben _ stehengeblieben _ _ +26-31 3419-3422 war _ war _ _ +26-32 3423-3424 , _ , _ _ +26-33 3425-3428 und _ und _ _ +26-34 3429-3433 dann _ dann _ _ +26-35 3434-3435 _ _ sah _ _ +26-36 3436-3437 _ _ er _ _ +26-37 3438-3444 wieder _ wieder _ _ +26-38 3445-3451 zurück _ zurück _ _ +26-39 3452-3453 . _ . _ _ + +#Text=_ Durch das offene Fenster erblickte man wieder die alte Frau , die mit wahrhaft greisenhafter Neugierde zu dem jetzt gegenüberliegenden Fenster getreten war , um auch weiterhin alles zu sehen . +27-1 3454-3455 _ _ _ _ _ +27-2 3456-3461 Durch _ Durch _ _ +27-3 3462-3465 das _ das CM[15] COREFERENTIAL->15-2 +27-4 3466-3472 offene _ offene CM[15] COREFERENTIAL->15-2 +27-5 3473-3480 Fenster _ Fenster CM[15] COREFERENTIAL->15-2 +27-6 3481-3490 erblickte _ erblickte _ _ +27-7 3491-3494 man _ man _ _ +27-8 3495-3501 wieder _ wieder _ _ +27-9 3502-3505 die _ die CM[48] ANAPHORIC->48-1 +27-10 3506-3510 alte _ alte CM[48] ANAPHORIC->48-1 +27-11 3511-3515 Frau _ Frau CM[48] ANAPHORIC->48-1 +27-12 3516-3517 , _ , _ _ +27-13 3518-3521 die _ die CM[48] COREFERENTIAL->48-2 +27-14 3522-3525 mit _ mit _ _ +27-15 3526-3534 wahrhaft _ wahrhaft _ _ +27-16 3535-3548 greisenhafter _ greisenhafter _ _ +27-17 3549-3558 Neugierde _ Neugierde _ _ +27-18 3559-3561 zu _ zu _ _ +27-19 3562-3565 dem _ dem _ _ +27-20 3566-3571 jetzt _ jetzt _ _ +27-21 3572-3590 gegenüberliegenden _ gegenüberliegenden _ _ +27-22 3591-3598 Fenster _ Fenster _ _ +27-23 3599-3607 getreten _ getreten _ _ +27-24 3608-3611 war _ war _ _ +27-25 3612-3613 , _ , _ _ +27-26 3614-3616 um _ um _ _ +27-27 3617-3621 auch _ auch _ _ +27-28 3622-3631 weiterhin _ weiterhin _ _ +27-29 3632-3637 alles _ alles _ _ +27-30 3638-3640 zu _ zu _ _ +27-31 3641-3646 sehen _ sehen _ _ +27-32 3647-3648 . _ . _ _ + +#Text=_ " Ich will doch Frau Grubach " , sagte K. , _ machte eine Bewegung , als reiße er sich von den zwei Männern los , die aber weit von ihm entfernt standen , und _ wollte weitergehen . +28-1 3649-3650 _ _ _ _ _ +28-2 3651-3652 " _ " _ _ +28-3 3653-3656 Ich _ Ich _ _ +28-4 3657-3661 will _ will _ _ +28-5 3662-3666 doch _ doch _ _ +28-6 3667-3671 Frau _ Frau CM[1] COREFERENTIAL->1-5 +28-7 3672-3679 Grubach PER Grubach CM[1] COREFERENTIAL->1-5 +28-8 3680-3681 " _ " _ _ +28-9 3682-3683 , _ , _ _ +28-10 3684-3689 sagte _ sagte _ _ +28-11 3690-3692 K. PER K. CM[89] ANAPHORIC->89-29 +28-12 3693-3694 , _ , _ _ +28-13 3695-3696 _ _ er _ _ +28-14 3697-3703 machte _ machte _ _ +28-15 3704-3708 eine _ eine _ _ +28-16 3709-3717 Bewegung _ Bewegung _ _ +28-17 3718-3719 , _ , _ _ +28-18 3720-3723 als _ als _ _ +28-19 3724-3729 reiße _ reiße _ _ +28-20 3730-3732 er _ er CM[89] ANAPHORIC->89-30 +28-21 3733-3737 sich _ sich _ _ +28-22 3738-3741 von _ von _ _ +28-23 3742-3745 den _ den CM[16] ANAPHORIC->16-1 +28-24 3746-3750 zwei _ zwei CM[16] ANAPHORIC->16-1 +28-25 3751-3758 Männern _ Männern CM[16] ANAPHORIC->16-1 +28-26 3759-3762 los _ los _ _ +28-27 3763-3764 , _ , _ _ +28-28 3765-3768 die _ die CM[16] *->16-2 +28-29 3769-3773 aber _ aber _ _ +28-30 3774-3778 weit _ weit _ _ +28-31 3779-3782 von _ von _ _ +28-32 3783-3786 ihm _ ihm CM[89] ANAPHORIC->89-31 +28-33 3787-3795 entfernt _ entfernt _ _ +28-34 3796-3803 standen _ standen _ _ +28-35 3804-3805 , _ , _ _ +28-36 3806-3809 und _ und _ _ +28-37 3810-3811 _ _ er _ _ +28-38 3812-3818 wollte _ wollte _ _ +28-39 3819-3830 weitergehen _ weitergehen _ _ +28-40 3831-3832 . _ . _ _ + +#Text=_ " Nein " , sagte der Mann beim Fenster , _ warf das Buch auf ein Tischchen und _ stand auf . +29-1 3833-3834 _ _ _ _ _ +29-2 3835-3836 " _ " _ _ +29-3 3837-3841 Nein _ Nein _ _ +29-4 3842-3843 " _ " _ _ +29-5 3844-3845 , _ , _ _ +29-6 3846-3851 sagte _ sagte _ _ +29-7 3852-3855 der _ der CM[17] ANAPHORIC->17-2 +29-8 3856-3860 Mann _ Mann CM[17] ANAPHORIC->17-2 +29-9 3861-3865 beim _ beim CM[15]|CM[17] COREFERENTIAL->15-3|ANAPHORIC->17-2 +29-10 3866-3873 Fenster _ Fenster CM[15]|CM[17] COREFERENTIAL->15-3|ANAPHORIC->17-2 +29-11 3874-3875 , _ , _ _ +29-12 3876-3877 _ _ er _ _ +29-13 3878-3882 warf _ warf _ _ +29-14 3883-3886 das _ das _ _ +29-15 3887-3891 Buch _ Buch _ _ +29-16 3892-3895 auf _ auf _ _ +29-17 3896-3899 ein _ ein _ _ +29-18 3900-3909 Tischchen _ Tischchen _ _ +29-19 3910-3913 und _ und _ _ +29-20 3914-3915 _ _ er _ _ +29-21 3916-3921 stand _ stand _ _ +29-22 3922-3925 auf _ auf _ _ +29-23 3926-3927 . _ . _ _ + +#Text=_ " Sie dürfen nicht weggehen , Sie sind ja verhaftet . " +30-1 3928-3929 _ _ _ _ _ +30-2 3930-3931 " _ " _ _ +30-3 3932-3935 Sie _ Sie _ _ +30-4 3936-3942 dürfen _ dürfen _ _ +30-5 3943-3948 nicht _ nicht _ _ +30-6 3949-3957 weggehen _ weggehen _ _ +30-7 3958-3959 , _ , _ _ +30-8 3960-3963 Sie _ Sie _ _ +30-9 3964-3968 sind _ sind _ _ +30-10 3969-3971 ja _ ja _ _ +30-11 3972-3981 verhaftet _ verhaftet _ _ +30-12 3982-3983 . _ . _ _ +30-13 3984-3985 " _ " _ _ + +#Text=_ " Es sieht so aus " , sagte K. +31-1 3986-3987 _ _ _ _ _ +31-2 3988-3989 " _ " _ _ +31-3 3990-3992 Es _ Es _ _ +31-4 3993-3998 sieht _ sieht _ _ +31-5 3999-4001 so _ so _ _ +31-6 4002-4005 aus _ aus _ _ +31-7 4006-4007 " _ " _ _ +31-8 4008-4009 , _ , _ _ +31-9 4010-4015 sagte _ sagte _ _ +31-10 4016-4018 K. PER K. CM[89] ANAPHORIC->89-32 + +#Text=_ " Und warum denn ? " _ fragte er dann . +32-1 4019-4020 _ _ _ _ _ +32-2 4021-4022 " _ " _ _ +32-3 4023-4026 Und _ Und _ _ +32-4 4027-4032 warum _ warum _ _ +32-5 4033-4037 denn _ denn _ _ +32-6 4038-4039 ? _ ? _ _ +32-7 4040-4041 " _ " _ _ +32-8 4042-4043 _ _ , _ _ +32-9 4044-4050 fragte _ fragte _ _ +32-10 4051-4053 er _ er CM[89] COREFERENTIAL->89-33 +32-11 4054-4058 dann _ dann _ _ +32-12 4059-4060 . _ . _ _ + +#Text=_ " Wir sind nicht dazu bestellt , Ihnen das zu sagen . +33-1 4061-4062 _ _ _ _ _ +33-2 4063-4064 " _ " _ _ +33-3 4065-4068 Wir _ Wir _ _ +33-4 4069-4073 sind _ sind _ _ +33-5 4074-4079 nicht _ nicht _ _ +33-6 4080-4084 dazu _ dazu _ _ +33-7 4085-4093 bestellt _ bestellt _ _ +33-8 4094-4095 , _ , _ _ +33-9 4096-4101 Ihnen _ Ihnen _ _ +33-10 4102-4105 das _ das _ _ +33-11 4106-4108 zu _ zu _ _ +33-12 4109-4114 sagen _ sagen _ _ +33-13 4115-4116 . _ . _ _ + +#Text=_ Gehen Sie in Ihr Zimmer und warten Sie . +34-1 4117-4118 _ _ _ _ _ +34-2 4119-4124 Gehen _ Gehen _ _ +34-3 4125-4128 Sie _ Sie _ _ +34-4 4129-4131 in _ in _ _ +34-5 4132-4135 Ihr _ Ihr _ _ +34-6 4136-4142 Zimmer _ Zimmer _ _ +34-7 4143-4146 und _ und _ _ +34-8 4147-4153 warten _ warten _ _ +34-9 4154-4157 Sie _ Sie _ _ +34-10 4158-4159 . _ . _ _ + +#Text=_ Das Verfahren ist nun einmal eingeleitet , und Sie werden alles zur richtigen Zeit erfahren . +35-1 4160-4161 _ _ _ _ _ +35-2 4162-4165 Das _ Das _ _ +35-3 4166-4175 Verfahren _ Verfahren _ _ +35-4 4176-4179 ist _ ist _ _ +35-5 4180-4183 nun _ nun _ _ +35-6 4184-4190 einmal _ einmal _ _ +35-7 4191-4202 eingeleitet _ eingeleitet _ _ +35-8 4203-4204 , _ , _ _ +35-9 4205-4208 und _ und _ _ +35-10 4209-4212 Sie _ Sie _ _ +35-11 4213-4219 werden _ werden _ _ +35-12 4220-4225 alles _ alles _ _ +35-13 4226-4229 zur _ zur _ _ +35-14 4230-4239 richtigen _ richtigen _ _ +35-15 4240-4244 Zeit _ Zeit _ _ +35-16 4245-4253 erfahren _ erfahren _ _ +35-17 4254-4255 . _ . _ _ + +#Text=_ Ich gehe über meinen Auftrag hinaus , wenn ich Ihnen so freundschaftlich zurede . +36-1 4256-4257 _ _ _ _ _ +36-2 4258-4261 Ich _ Ich _ _ +36-3 4262-4266 gehe _ gehe _ _ +36-4 4267-4271 über _ über _ _ +36-5 4272-4278 meinen _ meinen _ _ +36-6 4279-4286 Auftrag _ Auftrag _ _ +36-7 4287-4293 hinaus _ hinaus _ _ +36-8 4294-4295 , _ , _ _ +36-9 4296-4300 wenn _ wenn _ _ +36-10 4301-4304 ich _ ich _ _ +36-11 4305-4310 Ihnen _ Ihnen _ _ +36-12 4311-4313 so _ so _ _ +36-13 4314-4330 freundschaftlich _ freundschaftlich _ _ +36-14 4331-4337 zurede _ zurede _ _ +36-15 4338-4339 . _ . _ _ + +#Text=_ Aber ich hoffe , es hört es niemand sonst als Franz , und der ist selbst gegen alle Vorschrift freundlich zu Ihnen . +37-1 4340-4341 _ _ _ _ _ +37-2 4342-4346 Aber _ Aber _ _ +37-3 4347-4350 ich _ ich _ _ +37-4 4351-4356 hoffe _ hoffe _ _ +37-5 4357-4358 , _ , _ _ +37-6 4359-4361 es _ es _ _ +37-7 4362-4366 hört _ hört _ _ +37-8 4367-4369 es _ es _ _ +37-9 4370-4377 niemand _ niemand _ _ +37-10 4378-4383 sonst _ sonst _ _ +37-11 4384-4387 als _ als _ _ +37-12 4388-4393 Franz PER Franz CM[14] ANAPHORIC->14-4 +37-13 4394-4395 , _ , _ _ +37-14 4396-4399 und _ und _ _ +37-15 4400-4403 der _ der CM[14] COREFERENTIAL->14-5 +37-16 4404-4407 ist _ ist _ _ +37-17 4408-4414 selbst _ selbst _ _ +37-18 4415-4420 gegen _ gegen _ _ +37-19 4421-4425 alle _ alle _ _ +37-20 4426-4436 Vorschrift _ Vorschrift _ _ +37-21 4437-4447 freundlich _ freundlich _ _ +37-22 4448-4450 zu _ zu _ _ +37-23 4451-4456 Ihnen _ Ihnen _ _ +37-24 4457-4458 . _ . _ _ + +#Text=_ Wenn Sie auch weiterhin so viel Glück haben wie bei der Bestimmung Ihrer Wächter , dann können Sie zuversichtlich sein . " +38-1 4459-4460 _ _ _ _ _ +38-2 4461-4465 Wenn _ Wenn _ _ +38-3 4466-4469 Sie _ Sie _ _ +38-4 4470-4474 auch _ auch _ _ +38-5 4475-4484 weiterhin _ weiterhin _ _ +38-6 4485-4487 so _ so _ _ +38-7 4488-4492 viel _ viel _ _ +38-8 4493-4498 Glück _ Glück _ _ +38-9 4499-4504 haben _ haben _ _ +38-10 4505-4508 wie _ wie _ _ +38-11 4509-4512 bei _ bei _ _ +38-12 4513-4516 der _ der _ _ +38-13 4517-4527 Bestimmung _ Bestimmung _ _ +38-14 4528-4533 Ihrer _ Ihrer _ _ +38-15 4534-4541 Wächter _ Wächter _ _ +38-16 4542-4543 , _ , _ _ +38-17 4544-4548 dann _ dann _ _ +38-18 4549-4555 können _ können _ _ +38-19 4556-4559 Sie _ Sie _ _ +38-20 4560-4574 zuversichtlich _ zuversichtlich _ _ +38-21 4575-4579 sein _ sein _ _ +38-22 4580-4581 . _ . _ _ +38-23 4582-4583 " _ " _ _ + +#Text=_ K. wollte sich setzen , aber nun sah er , daß im ganzen Zimmer keine Sitzgelegenheit war , außer dem Sessel beim Fenster . +39-1 4584-4585 _ _ _ _ _ +39-2 4586-4588 K. PER K. CM[89] ANAPHORIC->89-34 +39-3 4589-4595 wollte _ wollte _ _ +39-4 4596-4600 sich _ sich _ _ +39-5 4601-4607 setzen _ setzen _ _ +39-6 4608-4609 , _ , _ _ +39-7 4610-4614 aber _ aber _ _ +39-8 4615-4618 nun _ nun _ _ +39-9 4619-4622 sah _ sah _ _ +39-10 4623-4625 er _ er CM[89] ANAPHORIC->89-35 +39-11 4626-4627 , _ , _ _ +39-12 4628-4631 daß _ dass _ _ +39-13 4632-4634 im _ im _ _ +39-14 4635-4641 ganzen _ ganzen _ _ +39-15 4642-4648 Zimmer _ Zimmer _ _ +39-16 4649-4654 keine _ keine _ _ +39-17 4655-4670 Sitzgelegenheit _ Sitzgelegenheit _ _ +39-18 4671-4674 war _ war _ _ +39-19 4675-4676 , _ , _ _ +39-20 4677-4682 außer _ außer _ _ +39-21 4683-4686 dem _ dem _ _ +39-22 4687-4693 Sessel _ Sessel _ _ +39-23 4694-4698 beim _ beim CM[15] *->15-4 +39-24 4699-4706 Fenster _ Fenster CM[15] *->15-4 +39-25 4707-4708 . _ . _ _ + +#Text=_ " Sie werden noch einsehen , wie wahr das alles ist " , sagte Franz und _ ging gleichzeitig mit dem andern Mann auf ihn zu . +40-1 4709-4710 _ _ _ _ _ +40-2 4711-4712 " _ " _ _ +40-3 4713-4716 Sie _ Sie _ _ +40-4 4717-4723 werden _ werden _ _ +40-5 4724-4728 noch _ noch _ _ +40-6 4729-4737 einsehen _ einsehen _ _ +40-7 4738-4739 , _ , _ _ +40-8 4740-4743 wie _ wie _ _ +40-9 4744-4748 wahr _ wahr _ _ +40-10 4749-4752 das _ das _ _ +40-11 4753-4758 alles _ alles _ _ +40-12 4759-4762 ist _ ist _ _ +40-13 4763-4764 " _ " _ _ +40-14 4765-4766 , _ , _ _ +40-15 4767-4772 sagte _ sagte _ _ +40-16 4773-4778 Franz PER Franz CM[14] COREFERENTIAL->14-6 +40-17 4779-4782 und _ und _ _ +40-18 4783-4784 _ _ er _ _ +40-19 4785-4789 ging _ ging _ _ +40-20 4790-4802 gleichzeitig _ gleichzeitig _ _ +40-21 4803-4806 mit _ mit CM[17] ANAPHORIC->17-3 +40-22 4807-4810 dem _ dem CM[17] ANAPHORIC->17-3 +40-23 4811-4817 andern _ andern CM[17] ANAPHORIC->17-3 +40-24 4818-4822 Mann _ Mann CM[17] ANAPHORIC->17-3 +40-25 4823-4826 auf _ auf _ _ +40-26 4827-4830 ihn _ ihn CM[89] ANAPHORIC->89-36 +40-27 4831-4833 zu _ zu _ _ +40-28 4834-4835 . _ . _ _ + +#Text=_ Besonders der letztere überragte K. bedeutend und _ klopfte ihm öfters auf die Schulter . +41-1 4836-4837 _ _ _ _ _ +41-2 4838-4847 Besonders _ Besonders _ _ +41-3 4848-4851 der _ der CM[17] COREFERENTIAL->17-4 +41-4 4852-4860 letztere _ letztere CM[17] COREFERENTIAL->17-4 +41-5 4861-4870 überragte _ überragte _ _ +41-6 4871-4873 K. PER K. CM[89] ANAPHORIC->89-37 +41-7 4874-4883 bedeutend _ bedeutend _ _ +41-8 4884-4887 und _ und _ _ +41-9 4888-4889 _ _ er _ _ +41-10 4890-4897 klopfte _ klopfte _ _ +41-11 4898-4901 ihm _ ihm CM[89] ANAPHORIC->89-38 +41-12 4902-4908 öfters _ öfters _ _ +41-13 4909-4912 auf _ auf _ _ +41-14 4913-4916 die _ die _ _ +41-15 4917-4925 Schulter _ Schulter _ _ +41-16 4926-4927 . _ . _ _ + +#Text=_ Beide prüften K.s Nachthemd und _ sagten , daß er jetzt ein viel schlechteres Hemd werde anziehen müssen , daß sie aber dieses Hemd wie auch seine übrige Wäsche aufbewahren und , wenn seine Sache günstig ausfallen sollte , _ _ ihm wieder zurückgeben würden . +42-1 4928-4929 _ _ _ _ _ +42-2 4930-4935 Beide _ Beide CM[19] ANAPHORIC->19-1 +42-3 4936-4943 prüften _ prüften _ _ +42-4 4944-4947 K.s PER K.s CM[18]|CM[89] ANAPHORIC->18-1|ANAPHORIC->89-39 +42-5 4948-4957 Nachthemd _ Nachthemd CM[18] ANAPHORIC->18-1 +42-6 4958-4961 und _ und _ _ +42-7 4962-4963 _ _ sie _ _ +42-8 4964-4970 sagten _ sagten _ _ +42-9 4971-4972 , _ , _ _ +42-10 4973-4976 daß _ dass _ _ +42-11 4977-4979 er _ er CM[89] ANAPHORIC->89-40 +42-12 4980-4985 jetzt _ jetzt _ _ +42-13 4986-4989 ein _ ein _ _ +42-14 4990-4994 viel _ viel _ _ +42-15 4995-5007 schlechteres _ schlechteres _ _ +42-16 5008-5012 Hemd _ Hemd _ _ +42-17 5013-5018 werde _ werde _ _ +42-18 5019-5027 anziehen _ anziehen _ _ +42-19 5028-5034 müssen _ müssen _ _ +42-20 5035-5036 , _ , _ _ +42-21 5037-5040 daß _ dass _ _ +42-22 5041-5044 sie _ sie CM[19] ANAPHORIC->19-2 +42-23 5045-5049 aber _ aber _ _ +42-24 5050-5056 dieses _ dieses CM[18] *->18-2 +42-25 5057-5061 Hemd _ Hemd CM[18] *->18-2 +42-26 5062-5065 wie _ wie _ _ +42-27 5066-5070 auch _ auch _ _ +42-28 5071-5076 seine _ seine _ _ +42-29 5077-5083 übrige _ übrige _ _ +42-30 5084-5090 Wäsche _ Wäsche _ _ +42-31 5091-5102 aufbewahren _ aufbewahren _ _ +42-32 5103-5106 und _ und _ _ +42-33 5107-5108 , _ , _ _ +42-34 5109-5113 wenn _ wenn _ _ +42-35 5114-5119 seine _ seine CM[89] ANAPHORIC->89-41 +42-36 5120-5125 Sache _ Sache _ _ +42-37 5126-5133 günstig _ günstig _ _ +42-38 5134-5143 ausfallen _ ausfallen _ _ +42-39 5144-5150 sollte _ sollte _ _ +42-40 5151-5152 , _ , _ _ +42-41 5153-5154 _ _ sie _ _ +42-42 5155-5156 _ _ es _ _ +42-43 5157-5160 ihm _ ihm CM[89] ANAPHORIC->89-42 +42-44 5161-5167 wieder _ wieder _ _ +42-45 5168-5179 zurückgeben _ zurückgeben _ _ +42-46 5180-5186 würden _ würden _ _ +42-47 5187-5188 . _ . _ _ + +#Text=_ " Es ist besser , Sie geben die Sachen uns als ins Depot " , sagten sie , " denn im Depot kommen öfters Unterschleife vor und außerdem verkauft man dort alle Sachen nach einer gewissen Zeit , ohne Rücksicht , ob das betreffende Verfahren zu Ende ist oder nicht . +43-1 5189-5190 _ _ _ _ _ +43-2 5191-5192 " _ " _ _ +43-3 5193-5195 Es _ Es _ _ +43-4 5196-5199 ist _ ist _ _ +43-5 5200-5206 besser _ besser _ _ +43-6 5207-5208 , _ , _ _ +43-7 5209-5212 Sie _ Sie _ _ +43-8 5213-5218 geben _ geben _ _ +43-9 5219-5222 die _ die _ _ +43-10 5223-5229 Sachen _ Sachen _ _ +43-11 5230-5233 uns _ uns _ _ +43-12 5234-5237 als _ als _ _ +43-13 5238-5241 ins _ ins CM[20] COREFERENTIAL->20-1 +43-14 5242-5247 Depot _ Depot CM[20] COREFERENTIAL->20-1 +43-15 5248-5249 " _ " _ _ +43-16 5250-5251 , _ , _ _ +43-17 5252-5258 sagten _ sagten _ _ +43-18 5259-5262 sie _ sie CM[19] *->19-3 +43-19 5263-5264 , _ , _ _ +43-20 5265-5266 " _ " _ _ +43-21 5267-5271 denn _ denn _ _ +43-22 5272-5274 im _ im CM[20] COREFERENTIAL->20-2 +43-23 5275-5280 Depot _ Depot CM[20] COREFERENTIAL->20-2 +43-24 5281-5287 kommen _ kommen _ _ +43-25 5288-5294 öfters _ öfters _ _ +43-26 5295-5308 Unterschleife _ Unterschleife _ _ +43-27 5309-5312 vor _ vor _ _ +43-28 5313-5316 und _ und _ _ +43-29 5317-5325 außerdem _ außerdem _ _ +43-30 5326-5334 verkauft _ verkauft _ _ +43-31 5335-5338 man _ man _ _ +43-32 5339-5343 dort _ dort _ _ +43-33 5344-5348 alle _ alle _ _ +43-34 5349-5355 Sachen _ Sachen _ _ +43-35 5356-5360 nach _ nach _ _ +43-36 5361-5366 einer _ einer _ _ +43-37 5367-5375 gewissen _ gewissen _ _ +43-38 5376-5380 Zeit _ Zeit _ _ +43-39 5381-5382 , _ , _ _ +43-40 5383-5387 ohne _ ohne _ _ +43-41 5388-5397 Rücksicht _ Rücksicht _ _ +43-42 5398-5399 , _ , _ _ +43-43 5400-5402 ob _ ob _ _ +43-44 5403-5406 das _ das _ _ +43-45 5407-5418 betreffende _ betreffende _ _ +43-46 5419-5428 Verfahren _ Verfahren _ _ +43-47 5429-5431 zu _ zu _ _ +43-48 5432-5436 Ende _ Ende _ _ +43-49 5437-5440 ist _ ist _ _ +43-50 5441-5445 oder _ oder _ _ +43-51 5446-5451 nicht _ nicht _ _ +43-52 5452-5453 . _ . _ _ + +#Text=_ Und wie lange dauern doch derartige Prozesse , besonders in letzter Zeit ! +44-1 5454-5455 _ _ _ _ _ +44-2 5456-5459 Und _ Und _ _ +44-3 5460-5463 wie _ wie _ _ +44-4 5464-5469 lange _ lange _ _ +44-5 5470-5476 dauern _ dauern _ _ +44-6 5477-5481 doch _ doch _ _ +44-7 5482-5491 derartige _ derartige _ _ +44-8 5492-5500 Prozesse _ Prozesse _ _ +44-9 5501-5502 , _ , _ _ +44-10 5503-5512 besonders _ besonders _ _ +44-11 5513-5515 in _ in _ _ +44-12 5516-5523 letzter _ letzter _ _ +44-13 5524-5528 Zeit _ Zeit _ _ +44-14 5529-5530 ! _ ! _ _ + +#Text=_ Sie bekämen dann schließlich allerdings vom Depot den Erlös , aber dieser Erlös ist erstens an sich schon gering , denn beim Verkauf entscheidet nicht die Höhe des Angebotes , sondern die Höhe der Bestechung , und weiter verringern sich solche Erlöse erfahrungsgemäß , wenn sie von Hand zu Hand und von Jahr zu Jahr weitergegeben werden . " +45-1 5531-5532 _ _ _ _ _ +45-2 5533-5536 Sie _ Sie _ _ +45-3 5537-5544 bekämen _ bekämen _ _ +45-4 5545-5549 dann _ dann _ _ +45-5 5550-5561 schließlich _ schließlich _ _ +45-6 5562-5572 allerdings _ allerdings _ _ +45-7 5573-5576 vom _ vom CM[20] *->20-3 +45-8 5577-5582 Depot _ Depot CM[20] *->20-3 +45-9 5583-5586 den _ den CM[21] ANAPHORIC->21-1 +45-10 5587-5592 Erlös _ Erlös CM[21] ANAPHORIC->21-1 +45-11 5593-5594 , _ , _ _ +45-12 5595-5599 aber _ aber _ _ +45-13 5600-5606 dieser _ dieser CM[21] AMPLIFICATION->21-2 +45-14 5607-5612 Erlös _ Erlös CM[21] AMPLIFICATION->21-2 +45-15 5613-5616 ist _ ist _ _ +45-16 5617-5624 erstens _ erstens _ _ +45-17 5625-5627 an _ an _ _ +45-18 5628-5632 sich _ sich _ _ +45-19 5633-5638 schon _ schon _ _ +45-20 5639-5645 gering _ gering _ _ +45-21 5646-5647 , _ , _ _ +45-22 5648-5652 denn _ denn _ _ +45-23 5653-5657 beim _ beim _ _ +45-24 5658-5665 Verkauf _ Verkauf _ _ +45-25 5666-5677 entscheidet _ entscheidet _ _ +45-26 5678-5683 nicht _ nicht _ _ +45-27 5684-5687 die _ die _ _ +45-28 5688-5692 Höhe _ Höhe _ _ +45-29 5693-5696 des _ des _ _ +45-30 5697-5706 Angebotes _ Angebotes _ _ +45-31 5707-5708 , _ , _ _ +45-32 5709-5716 sondern _ sondern _ _ +45-33 5717-5720 die _ die _ _ +45-34 5721-5725 Höhe _ Höhe _ _ +45-35 5726-5729 der _ der _ _ +45-36 5730-5740 Bestechung _ Bestechung _ _ +45-37 5741-5742 , _ , _ _ +45-38 5743-5746 und _ und _ _ +45-39 5747-5753 weiter _ weiter _ _ +45-40 5754-5764 verringern _ verringern _ _ +45-41 5765-5769 sich _ sich _ _ +45-42 5770-5776 solche _ solche CM[21] ANAPHORIC->21-3 +45-43 5777-5783 Erlöse _ Erlöse CM[21] ANAPHORIC->21-3 +45-44 5784-5799 erfahrungsgemäß _ erfahrungsgemäß _ _ +45-45 5800-5801 , _ , _ _ +45-46 5802-5806 wenn _ wenn _ _ +45-47 5807-5810 sie _ sie CM[21] *->21-4 +45-48 5811-5814 von _ von _ _ +45-49 5815-5819 Hand _ Hand _ _ +45-50 5820-5822 zu _ zu _ _ +45-51 5823-5827 Hand _ Hand _ _ +45-52 5828-5831 und _ und _ _ +45-53 5832-5835 von _ von _ _ +45-54 5836-5840 Jahr _ Jahr _ _ +45-55 5841-5843 zu _ zu _ _ +45-56 5844-5848 Jahr _ Jahr _ _ +45-57 5849-5862 weitergegeben _ weitergegeben _ _ +45-58 5863-5869 werden _ werden _ _ +45-59 5870-5871 . _ . _ _ +45-60 5872-5873 " _ " _ _ + +#Text=_ K. achtete auf diese Reden kaum , das Verfügungsrecht über seine Sachen , das er vielleicht noch besaß , schätzte er nicht hoch ein , viel wichtiger war es ihm , Klarheit über seine Lage zu bekommen ; in Gegenwart dieser Leute konnte er aber nicht einmal nachdenken , immer wieder stieß der Bauch des zweiten Wächters – es konnten ja nur Wächter sein – förmlich freundschaftlich an ihn , sah er aber auf , dann erblickte er ein zu diesem dicken Körper gar nicht passendes trockenes , knochiges Gesicht mit starker , seitlich gedrehter Nase , das sich über ihn hinweg mit dem anderen Wächter verständigte . +46-1 5874-5875 _ _ _ _ _ +46-2 5876-5878 K. PER K. CM[89] ANAPHORIC->89-43 +46-3 5879-5886 achtete _ achtete _ _ +46-4 5887-5890 auf _ auf _ _ +46-5 5891-5896 diese _ diese _ _ +46-6 5897-5902 Reden _ Reden _ _ +46-7 5903-5907 kaum _ kaum _ _ +46-8 5908-5909 , _ , _ _ +46-9 5910-5913 das _ das CM[22] ANAPHORIC->22-1 +46-10 5914-5929 Verfügungsrecht _ Verfügungsrecht CM[22] ANAPHORIC->22-1 +46-11 5930-5934 über _ über CM[22] ANAPHORIC->22-1 +46-12 5935-5940 seine _ seine CM[22]|CM[89] ANAPHORIC->22-1|ANAPHORIC->89-44 +46-13 5941-5947 Sachen _ Sachen CM[22] ANAPHORIC->22-1 +46-14 5948-5949 , _ , _ _ +46-15 5950-5953 das _ das CM[22] *->22-2 +46-16 5954-5956 er _ er CM[89] ANAPHORIC->89-45 +46-17 5957-5967 vielleicht _ vielleicht _ _ +46-18 5968-5972 noch _ noch _ _ +46-19 5973-5978 besaß _ besaß _ _ +46-20 5979-5980 , _ , _ _ +46-21 5981-5989 schätzte _ schätzte _ _ +46-22 5990-5992 er _ er CM[89] ANAPHORIC->89-46 +46-23 5993-5998 nicht _ nicht _ _ +46-24 5999-6003 hoch _ hoch _ _ +46-25 6004-6007 ein _ ein _ _ +46-26 6008-6009 , _ , _ _ +46-27 6010-6014 viel _ viel _ _ +46-28 6015-6024 wichtiger _ wichtiger _ _ +46-29 6025-6028 war _ war _ _ +46-30 6029-6031 es _ es _ _ +46-31 6032-6035 ihm _ ihm CM[89] ANAPHORIC->89-47 +46-32 6036-6037 , _ , _ _ +46-33 6038-6046 Klarheit _ Klarheit _ _ +46-34 6047-6051 über _ über _ _ +46-35 6052-6057 seine _ seine CM[89] ANAPHORIC->89-48 +46-36 6058-6062 Lage _ Lage _ _ +46-37 6063-6065 zu _ zu _ _ +46-38 6066-6074 bekommen _ bekommen _ _ +46-39 6075-6076 ; _ \; _ _ +46-40 6077-6079 in _ in _ _ +46-41 6080-6089 Gegenwart _ Gegenwart _ _ +46-42 6090-6096 dieser _ dieser _ _ +46-43 6097-6102 Leute _ Leute _ _ +46-44 6103-6109 konnte _ konnte _ _ +46-45 6110-6112 er _ er CM[89] ANAPHORIC->89-49 +46-46 6113-6117 aber _ aber _ _ +46-47 6118-6123 nicht _ nicht _ _ +46-48 6124-6130 einmal _ einmal _ _ +46-49 6131-6141 nachdenken _ nachdenken _ _ +46-50 6142-6143 , _ , _ _ +46-51 6144-6149 immer _ immer _ _ +46-52 6150-6156 wieder _ wieder _ _ +46-53 6157-6162 stieß _ stieß _ _ +46-54 6163-6166 der _ der _ _ +46-55 6167-6172 Bauch _ Bauch _ _ +46-56 6173-6176 des _ des CM[17] *->17-5 +46-57 6177-6184 zweiten _ zweiten CM[17] *->17-5 +46-58 6185-6193 Wächters _ Wächters CM[17] *->17-5 +46-59 6194-6195 – _ – _ _ +46-60 6196-6198 es _ es _ _ +46-61 6199-6206 konnten _ konnten _ _ +46-62 6207-6209 ja _ ja _ _ +46-63 6210-6213 nur _ nur _ _ +46-64 6214-6221 Wächter _ Wächter _ _ +46-65 6222-6226 sein _ sein _ _ +46-66 6227-6228 – _ – _ _ +46-67 6229-6237 förmlich _ förmlich _ _ +46-68 6238-6254 freundschaftlich _ freundschaftlich _ _ +46-69 6255-6257 an _ an _ _ +46-70 6258-6261 ihn _ ihn CM[89] AMPLIFICATION->89-50 +46-71 6262-6263 , _ , _ _ +46-72 6264-6267 sah _ sah _ _ +46-73 6268-6270 er _ er CM[89] ANAPHORIC->89-51 +46-74 6271-6275 aber _ aber _ _ +46-75 6276-6279 auf _ auf _ _ +46-76 6280-6281 , _ , _ _ +46-77 6282-6286 dann _ dann _ _ +46-78 6287-6296 erblickte _ erblickte _ _ +46-79 6297-6299 er _ er CM[89] ANAPHORIC->89-52 +46-80 6300-6303 ein _ ein CM[23] ANAPHORIC->23-1 +46-81 6304-6306 zu _ zu CM[23] ANAPHORIC->23-1 +46-82 6307-6313 diesem _ diesem CM[23] ANAPHORIC->23-1 +46-83 6314-6320 dicken _ dicken CM[23] ANAPHORIC->23-1 +46-84 6321-6327 Körper _ Körper CM[23] ANAPHORIC->23-1 +46-85 6328-6331 gar _ gar CM[23] ANAPHORIC->23-1 +46-86 6332-6337 nicht _ nicht CM[23] ANAPHORIC->23-1 +46-87 6338-6347 passendes _ passendes CM[23] ANAPHORIC->23-1 +46-88 6348-6357 trockenes _ trockenes CM[23] ANAPHORIC->23-1 +46-89 6358-6359 , _ , CM[23] ANAPHORIC->23-1 +46-90 6360-6369 knochiges _ knochiges CM[23] ANAPHORIC->23-1 +46-91 6370-6377 Gesicht _ Gesicht CM[23] ANAPHORIC->23-1 +46-92 6378-6381 mit _ mit CM[23] ANAPHORIC->23-1 +46-93 6382-6389 starker _ starker CM[23] ANAPHORIC->23-1 +46-94 6390-6391 , _ , CM[23] ANAPHORIC->23-1 +46-95 6392-6400 seitlich _ seitlich CM[23] ANAPHORIC->23-1 +46-96 6401-6410 gedrehter _ gedrehter CM[23] ANAPHORIC->23-1 +46-97 6411-6415 Nase _ Nase CM[23] ANAPHORIC->23-1 +46-98 6416-6417 , _ , _ _ +46-99 6418-6421 das _ das CM[23] *->23-2 +46-100 6422-6426 sich _ sich _ _ +46-101 6427-6431 über _ über _ _ +46-102 6432-6435 ihn _ ihn CM[89] ANAPHORIC->89-53 +46-103 6436-6442 hinweg _ hinweg _ _ +46-104 6443-6446 mit _ mit _ _ +46-105 6447-6450 dem _ dem CM[14] COREFERENTIAL->14-7 +46-106 6451-6458 anderen _ anderen CM[14] COREFERENTIAL->14-7 +46-107 6459-6466 Wächter _ Wächter CM[14] COREFERENTIAL->14-7 +46-108 6467-6479 verständigte _ verständigte _ _ +46-109 6480-6481 . _ . _ _ + +#Text=_ Was waren denn das für Menschen ? +47-1 6482-6483 _ _ _ _ _ +47-2 6484-6487 Was _ Was _ _ +47-3 6488-6493 waren _ waren _ _ +47-4 6494-6498 denn _ denn _ _ +47-5 6499-6502 das _ das CM[24] ANAPHORIC->24-1 +47-6 6503-6506 für _ für _ _ +47-7 6507-6515 Menschen _ Menschen _ _ +47-8 6516-6517 ? _ ? _ _ + +#Text=_ Wovon sprachen sie ? +48-1 6518-6519 _ _ _ _ _ +48-2 6520-6525 Wovon _ Wovon _ _ +48-3 6526-6534 sprachen _ sprachen _ _ +48-4 6535-6538 sie _ sie CM[24] ANAPHORIC->24-2 +48-5 6539-6540 ? _ ? _ _ + +#Text=_ Welcher Behörde gehörten sie an ? +49-1 6541-6542 _ _ _ _ _ +49-2 6543-6550 Welcher _ Welcher _ _ +49-3 6551-6558 Behörde _ Behörde _ _ +49-4 6559-6567 gehörten _ gehörten _ _ +49-5 6568-6571 sie _ sie CM[24] ANAPHORIC->24-3 +49-6 6572-6574 an _ an _ _ +49-7 6575-6576 ? _ ? _ _ + +#Text=_ K. lebte doch in einem Rechtsstaat , überall herrschte Friede , alle Gesetze bestanden aufrecht , wer wagte , ihn in seiner Wohnung zu überfallen ? +50-1 6577-6578 _ _ _ _ _ +50-2 6579-6581 K. PER K. CM[89] ANAPHORIC->89-54 +50-3 6582-6587 lebte _ lebte _ _ +50-4 6588-6592 doch _ doch _ _ +50-5 6593-6595 in _ in _ _ +50-6 6596-6601 einem _ einem _ _ +50-7 6602-6613 Rechtsstaat _ Rechtsstaat _ _ +50-8 6614-6615 , _ , _ _ +50-9 6616-6623 überall _ überall _ _ +50-10 6624-6633 herrschte _ herrschte _ _ +50-11 6634-6640 Friede _ Friede _ _ +50-12 6641-6642 , _ , _ _ +50-13 6643-6647 alle _ alle _ _ +50-14 6648-6655 Gesetze _ Gesetze _ _ +50-15 6656-6665 bestanden _ bestanden _ _ +50-16 6666-6674 aufrecht _ aufrecht _ _ +50-17 6675-6676 , _ , _ _ +50-18 6677-6680 wer _ wer _ _ +50-19 6681-6686 wagte _ wagte _ _ +50-20 6687-6688 , _ , _ _ +50-21 6689-6692 ihn _ ihn CM[89] ANAPHORIC->89-55 +50-22 6693-6695 in _ in _ _ +50-23 6696-6702 seiner _ seiner CM[89] ANAPHORIC->89-56 +50-24 6703-6710 Wohnung _ Wohnung _ _ +50-25 6711-6713 zu _ zu _ _ +50-26 6714-6724 überfallen _ überfallen _ _ +50-27 6725-6726 ? _ ? _ _ + +#Text=_ Er neigte stets dazu , alles möglichst leicht zu nehmen , das Schlimmste erst beim Eintritt des Schlimmsten zu glauben , keine Vorsorge für die Zukunft zu treffen , selbst wenn alles drohte . +51-1 6727-6728 _ _ _ _ _ +51-2 6729-6731 Er _ Er CM[89] ANAPHORIC->89-57 +51-3 6732-6738 neigte _ neigte _ _ +51-4 6739-6744 stets _ stets _ _ +51-5 6745-6749 dazu _ dazu _ _ +51-6 6750-6751 , _ , _ _ +51-7 6752-6757 alles _ alles _ _ +51-8 6758-6767 möglichst _ möglichst _ _ +51-9 6768-6774 leicht _ leicht _ _ +51-10 6775-6777 zu _ zu _ _ +51-11 6778-6784 nehmen _ nehmen _ _ +51-12 6785-6786 , _ , _ _ +51-13 6787-6790 das _ das CM[25] COREFERENTIAL->25-1 +51-14 6791-6801 Schlimmste _ Schlimmste CM[25] COREFERENTIAL->25-1 +51-15 6802-6806 erst _ erst _ _ +51-16 6807-6811 beim _ beim _ _ +51-17 6812-6820 Eintritt _ Eintritt _ _ +51-18 6821-6824 des _ des CM[25] *->25-2 +51-19 6825-6836 Schlimmsten _ Schlimmsten CM[25] *->25-2 +51-20 6837-6839 zu _ zu _ _ +51-21 6840-6847 glauben _ glauben _ _ +51-22 6848-6849 , _ , _ _ +51-23 6850-6855 keine _ keine _ _ +51-24 6856-6864 Vorsorge _ Vorsorge _ _ +51-25 6865-6868 für _ für _ _ +51-26 6869-6872 die _ die _ _ +51-27 6873-6880 Zukunft _ Zukunft _ _ +51-28 6881-6883 zu _ zu _ _ +51-29 6884-6891 treffen _ treffen _ _ +51-30 6892-6893 , _ , _ _ +51-31 6894-6900 selbst _ selbst _ _ +51-32 6901-6905 wenn _ wenn _ _ +51-33 6906-6911 alles _ alles _ _ +51-34 6912-6918 drohte _ drohte _ _ +51-35 6919-6920 . _ . _ _ + +#Text=_ Hier schien ihm das aber nicht richtig , man konnte zwar das Ganze als Spaß ansehen , als einen groben Spaß , den ihm aus unbekannten Gründen , vielleicht weil heute sein dreißigster Geburtstag war , die Kollegen in der Bank veranstaltet hatten , es war natürlich möglich , vielleicht brauchte er nur auf irgendeine Weise den Wächtern ins Gesicht zu lachen , und sie würden mitlachen , vielleicht waren es Dienstmänner von der Straßenecke , sie sahen ihnen nicht unähnlich – trotzdem war er diesmal , förmlich schon seit dem ersten Anblick des Wächters Franz , entschlossen , nicht den geringsten Vorteil , den er vielleicht gegenüber diesen Leuten besaß , aus der Hand zu geben . +52-1 6921-6922 _ _ _ _ _ +52-2 6923-6927 Hier _ Hier _ _ +52-3 6928-6934 schien _ schien _ _ +52-4 6935-6938 ihm _ ihm CM[89] ANAPHORIC->89-58 +52-5 6939-6942 das _ das _ _ +52-6 6943-6947 aber _ aber _ _ +52-7 6948-6953 nicht _ nicht _ _ +52-8 6954-6961 richtig _ richtig _ _ +52-9 6962-6963 , _ , _ _ +52-10 6964-6967 man _ man _ _ +52-11 6968-6974 konnte _ konnte _ _ +52-12 6975-6979 zwar _ zwar _ _ +52-13 6980-6983 das _ das _ _ +52-14 6984-6989 Ganze _ Ganze _ _ +52-15 6990-6993 als _ als _ _ +52-16 6994-6998 Spaß _ Spaß _ _ +52-17 6999-7006 ansehen _ ansehen _ _ +52-18 7007-7008 , _ , _ _ +52-19 7009-7012 als _ als _ _ +52-20 7013-7018 einen _ einen CM[26] ANAPHORIC->26-1 +52-21 7019-7025 groben _ groben CM[26] ANAPHORIC->26-1 +52-22 7026-7030 Spaß _ Spaß CM[26] ANAPHORIC->26-1 +52-23 7031-7032 , _ , _ _ +52-24 7033-7036 den _ den CM[26] *->26-2 +52-25 7037-7040 ihm _ ihm CM[89] ANAPHORIC->89-59 +52-26 7041-7044 aus _ aus _ _ +52-27 7045-7056 unbekannten _ unbekannten _ _ +52-28 7057-7064 Gründen _ Gründen _ _ +52-29 7065-7066 , _ , _ _ +52-30 7067-7077 vielleicht _ vielleicht _ _ +52-31 7078-7082 weil _ weil _ _ +52-32 7083-7088 heute _ heute _ _ +52-33 7089-7093 sein _ sein CM[89] ANAPHORIC->89-60 +52-34 7094-7105 dreißigster _ dreißigster _ _ +52-35 7106-7116 Geburtstag _ Geburtstag _ _ +52-36 7117-7120 war _ war _ _ +52-37 7121-7122 , _ , _ _ +52-38 7123-7126 die _ die _ _ +52-39 7127-7135 Kollegen _ Kollegen _ _ +52-40 7136-7138 in _ in _ _ +52-41 7139-7142 der _ der _ _ +52-42 7143-7147 Bank _ Bank _ _ +52-43 7148-7160 veranstaltet _ veranstaltet _ _ +52-44 7161-7167 hatten _ hatten _ _ +52-45 7168-7169 , _ , _ _ +52-46 7170-7172 es _ es _ _ +52-47 7173-7176 war _ war _ _ +52-48 7177-7186 natürlich _ natürlich _ _ +52-49 7187-7194 möglich _ möglich _ _ +52-50 7195-7196 , _ , _ _ +52-51 7197-7207 vielleicht _ vielleicht _ _ +52-52 7208-7216 brauchte _ brauchte _ _ +52-53 7217-7219 er _ er CM[89] ANAPHORIC->89-61 +52-54 7220-7223 nur _ nur _ _ +52-55 7224-7227 auf _ auf _ _ +52-56 7228-7238 irgendeine _ irgendeine _ _ +52-57 7239-7244 Weise _ Weise _ _ +52-58 7245-7248 den _ den CM[24] ANAPHORIC->24-4 +52-59 7249-7257 Wächtern _ Wächtern CM[24] ANAPHORIC->24-4 +52-60 7258-7261 ins _ ins _ _ +52-61 7262-7269 Gesicht _ Gesicht _ _ +52-62 7270-7272 zu _ zu _ _ +52-63 7273-7279 lachen _ lachen _ _ +52-64 7280-7281 , _ , _ _ +52-65 7282-7285 und _ und _ _ +52-66 7286-7289 sie _ sie CM[24] ANAPHORIC->24-5 +52-67 7290-7296 würden _ würden _ _ +52-68 7297-7306 mitlachen _ mitlachen _ _ +52-69 7307-7308 , _ , _ _ +52-70 7309-7319 vielleicht _ vielleicht _ _ +52-71 7320-7325 waren _ waren _ _ +52-72 7326-7328 es _ es CM[24] ANAPHORIC->24-6 +52-73 7329-7341 Dienstmänner _ Dienstmänner CM[27] ANAPHORIC->27-1 +52-74 7342-7345 von _ von _ _ +52-75 7346-7349 der _ der _ _ +52-76 7350-7361 Straßenecke _ Straßenecke _ _ +52-77 7362-7363 , _ , _ _ +52-78 7364-7367 sie _ sie CM[24] ANAPHORIC->24-7 +52-79 7368-7373 sahen _ sahen _ _ +52-80 7374-7379 ihnen _ ihnen CM[27] *->27-2 +52-81 7380-7385 nicht _ nicht _ _ +52-82 7386-7395 unähnlich _ unähnlich _ _ +52-83 7396-7397 – _ – _ _ +52-84 7398-7406 trotzdem _ trotzdem _ _ +52-85 7407-7410 war _ war _ _ +52-86 7411-7413 er _ er CM[89] ANAPHORIC->89-62 +52-87 7414-7421 diesmal _ diesmal _ _ +52-88 7422-7423 , _ , _ _ +52-89 7424-7432 förmlich _ förmlich _ _ +52-90 7433-7438 schon _ schon _ _ +52-91 7439-7443 seit _ seit _ _ +52-92 7444-7447 dem _ dem _ _ +52-93 7448-7454 ersten _ ersten _ _ +52-94 7455-7462 Anblick _ Anblick _ _ +52-95 7463-7466 des _ des CM[14] COREFERENTIAL->14-8 +52-96 7467-7475 Wächters _ Wächters CM[14] COREFERENTIAL->14-8 +52-97 7476-7481 Franz PER Franz CM[14] COREFERENTIAL->14-8 +52-98 7482-7483 , _ , _ _ +52-99 7484-7496 entschlossen _ entschlossen _ _ +52-100 7497-7498 , _ , _ _ +52-101 7499-7504 nicht _ nicht _ _ +52-102 7505-7508 den _ den CM[28] ANAPHORIC->28-1 +52-103 7509-7519 geringsten _ geringsten CM[28] ANAPHORIC->28-1 +52-104 7520-7527 Vorteil _ Vorteil CM[28] ANAPHORIC->28-1 +52-105 7528-7529 , _ , _ _ +52-106 7530-7533 den _ den CM[28] *->28-2 +52-107 7534-7536 er _ er CM[89] ANAPHORIC->89-63 +52-108 7537-7547 vielleicht _ vielleicht _ _ +52-109 7548-7557 gegenüber _ gegenüber _ _ +52-110 7558-7564 diesen _ diesen CM[24] COREFERENTIAL->24-8 +52-111 7565-7571 Leuten _ Leuten CM[24] COREFERENTIAL->24-8 +52-112 7572-7577 besaß _ besaß _ _ +52-113 7578-7579 , _ , _ _ +52-114 7580-7583 aus _ aus _ _ +52-115 7584-7587 der _ der _ _ +52-116 7588-7592 Hand _ Hand _ _ +52-117 7593-7595 zu _ zu _ _ +52-118 7596-7601 geben _ geben _ _ +52-119 7602-7603 . _ . _ _ + +#Text=_ Darin , daß man später sagen würde , er habe keinen Spaß verstanden , sah K. eine ganz geringe Gefahr , wohl aber erinnerte er sich – ohne daß es sonst seine Gewohnheit gewesen wäre , aus Erfahrungen zu lernen – an einige , an sich unbedeutende Fälle , in denen er zum Unterschied von seinen Freunden mit Bewußtsein , ohne das geringste Gefühl für die möglichen Folgen , sich unvorsichtig benommen hatte und dafür durch das Ergebnis gestraft worden war . +53-1 7604-7605 _ _ _ _ _ +53-2 7606-7611 Darin _ Darin _ _ +53-3 7612-7613 , _ , _ _ +53-4 7614-7617 daß _ dass _ _ +53-5 7618-7621 man _ man _ _ +53-6 7622-7628 später _ später _ _ +53-7 7629-7634 sagen _ sagen _ _ +53-8 7635-7640 würde _ würde _ _ +53-9 7641-7642 , _ , _ _ +53-10 7643-7645 er _ er CM[89] ANAPHORIC->89-64 +53-11 7646-7650 habe _ habe _ _ +53-12 7651-7657 keinen _ keinen _ _ +53-13 7658-7662 Spaß _ Spaß _ _ +53-14 7663-7673 verstanden _ verstanden _ _ +53-15 7674-7675 , _ , _ _ +53-16 7676-7679 sah _ sah _ _ +53-17 7680-7682 K. PER K. CM[89] ANAPHORIC->89-65 +53-18 7683-7687 eine _ eine _ _ +53-19 7688-7692 ganz _ ganz _ _ +53-20 7693-7700 geringe _ geringe _ _ +53-21 7701-7707 Gefahr _ Gefahr _ _ +53-22 7708-7709 , _ , _ _ +53-23 7710-7714 wohl _ wohl _ _ +53-24 7715-7719 aber _ aber _ _ +53-25 7720-7729 erinnerte _ erinnerte _ _ +53-26 7730-7732 er _ er CM[89] ANAPHORIC->89-66 +53-27 7733-7737 sich _ sich _ _ +53-28 7738-7739 – _ – _ _ +53-29 7740-7744 ohne _ ohne _ _ +53-30 7745-7748 daß _ dass _ _ +53-31 7749-7751 es _ es _ _ +53-32 7752-7757 sonst _ sonst _ _ +53-33 7758-7763 seine _ seine CM[89] ANAPHORIC->89-67 +53-34 7764-7774 Gewohnheit _ Gewohnheit _ _ +53-35 7775-7782 gewesen _ gewesen _ _ +53-36 7783-7787 wäre _ wäre _ _ +53-37 7788-7789 , _ , _ _ +53-38 7790-7793 aus _ aus _ _ +53-39 7794-7805 Erfahrungen _ Erfahrungen _ _ +53-40 7806-7808 zu _ zu _ _ +53-41 7809-7815 lernen _ lernen _ _ +53-42 7816-7817 – _ – _ _ +53-43 7818-7820 an _ an _ _ +53-44 7821-7827 einige _ einige CM[29] ANAPHORIC->29-1 +53-45 7828-7829 , _ , CM[29] ANAPHORIC->29-1 +53-46 7830-7832 an _ an CM[29] ANAPHORIC->29-1 +53-47 7833-7837 sich _ sich CM[29] ANAPHORIC->29-1 +53-48 7838-7850 unbedeutende _ unbedeutende CM[29] ANAPHORIC->29-1 +53-49 7851-7856 Fälle _ Fälle CM[29] ANAPHORIC->29-1 +53-50 7857-7858 , _ , _ _ +53-51 7859-7861 in _ in _ _ +53-52 7862-7867 denen _ denen CM[29] *->29-2 +53-53 7868-7870 er _ er CM[89] ANAPHORIC->89-68 +53-54 7871-7874 zum _ zum _ _ +53-55 7875-7886 Unterschied _ Unterschied _ _ +53-56 7887-7890 von _ von _ _ +53-57 7891-7897 seinen _ seinen CM[89] ANAPHORIC->89-69 +53-58 7898-7906 Freunden _ Freunden _ _ +53-59 7907-7910 mit _ mit _ _ +53-60 7911-7921 Bewußtsein _ Bewusstsein _ _ +53-61 7922-7923 , _ , _ _ +53-62 7924-7928 ohne _ ohne _ _ +53-63 7929-7932 das _ das _ _ +53-64 7933-7942 geringste _ geringste _ _ +53-65 7943-7949 Gefühl _ Gefühl _ _ +53-66 7950-7953 für _ für _ _ +53-67 7954-7957 die _ die _ _ +53-68 7958-7967 möglichen _ möglichen _ _ +53-69 7968-7974 Folgen _ Folgen _ _ +53-70 7975-7976 , _ , _ _ +53-71 7977-7981 sich _ sich _ _ +53-72 7982-7994 unvorsichtig _ unvorsichtig _ _ +53-73 7995-8003 benommen _ benommen _ _ +53-74 8004-8009 hatte _ hatte _ _ +53-75 8010-8013 und _ und _ _ +53-76 8014-8019 dafür _ dafür _ _ +53-77 8020-8025 durch _ durch _ _ +53-78 8026-8029 das _ das _ _ +53-79 8030-8038 Ergebnis _ Ergebnis _ _ +53-80 8039-8047 gestraft _ gestraft _ _ +53-81 8048-8054 worden _ worden _ _ +53-82 8055-8058 war _ war _ _ +53-83 8059-8060 . _ . _ _ + +#Text=_ Es sollte nicht wieder geschehen , zumindest nicht diesmal ; war es eine Komödie , so wollte er mitspielen . +54-1 8061-8062 _ _ _ _ _ +54-2 8063-8065 Es _ Es _ _ +54-3 8066-8072 sollte _ sollte _ _ +54-4 8073-8078 nicht _ nicht _ _ +54-5 8079-8085 wieder _ wieder _ _ +54-6 8086-8095 geschehen _ geschehen _ _ +54-7 8096-8097 , _ , _ _ +54-8 8098-8107 zumindest _ zumindest _ _ +54-9 8108-8113 nicht _ nicht _ _ +54-10 8114-8121 diesmal _ diesmal _ _ +54-11 8122-8123 ; _ \; _ _ +54-12 8124-8127 war _ war _ _ +54-13 8128-8130 es _ es _ _ +54-14 8131-8135 eine _ eine _ _ +54-15 8136-8143 Komödie _ Komödie _ _ +54-16 8144-8145 , _ , _ _ +54-17 8146-8148 so _ so _ _ +54-18 8149-8155 wollte _ wollte _ _ +54-19 8156-8158 er _ er CM[89] ANAPHORIC->89-70 +54-20 8159-8169 mitspielen _ mitspielen _ _ +54-21 8170-8171 . _ . _ _ + +#Text=_ Noch war er frei . +55-1 8172-8173 _ _ _ _ _ +55-2 8174-8178 Noch _ Noch _ _ +55-3 8179-8182 war _ war _ _ +55-4 8183-8185 er _ er CM[89] ANAPHORIC->89-71 +55-5 8186-8190 frei _ frei _ _ +55-6 8191-8192 . _ . _ _ + +#Text=_ " Erlauben Sie " , sagte er und _ ging eilig zwischen den Wächtern durch in sein Zimmer . +56-1 8193-8194 _ _ _ _ _ +56-2 8195-8196 " _ " _ _ +56-3 8197-8205 Erlauben _ Erlauben _ _ +56-4 8206-8209 Sie _ Sie _ _ +56-5 8210-8211 " _ " _ _ +56-6 8212-8213 , _ , _ _ +56-7 8214-8219 sagte _ sagte _ _ +56-8 8220-8222 er _ er CM[89] ANAPHORIC->89-72 +56-9 8223-8226 und _ und _ _ +56-10 8227-8228 _ _ er _ _ +56-11 8229-8233 ging _ ging _ _ +56-12 8234-8239 eilig _ eilig _ _ +56-13 8240-8248 zwischen _ zwischen _ _ +56-14 8249-8252 den _ den _ _ +56-15 8253-8261 Wächtern _ Wächtern _ _ +56-16 8262-8267 durch _ durch _ _ +56-17 8268-8270 in _ in _ _ +56-18 8271-8275 sein _ sein CM[71]|CM[89] COREFERENTIAL->71-1|ANAPHORIC->89-73 +56-19 8276-8282 Zimmer _ Zimmer CM[71] COREFERENTIAL->71-1 +56-20 8283-8284 . _ . _ _ + +#Text=_ " Er scheint vernünftig zu sein " , hörte er hinter sich sagen . +57-1 8285-8286 _ _ _ _ _ +57-2 8287-8288 " _ " _ _ +57-3 8289-8291 Er _ Er CM[89] ANAPHORIC->89-74 +57-4 8292-8299 scheint _ scheint _ _ +57-5 8300-8310 vernünftig _ vernünftig _ _ +57-6 8311-8313 zu _ zu _ _ +57-7 8314-8318 sein _ sein _ _ +57-8 8319-8320 " _ " _ _ +57-9 8321-8322 , _ , _ _ +57-10 8323-8328 hörte _ hörte _ _ +57-11 8329-8331 er _ er CM[89] ANAPHORIC->89-75 +57-12 8332-8338 hinter _ hinter _ _ +57-13 8339-8343 sich _ sich CM[89] ANAPHORIC->89-76 +57-14 8344-8349 sagen _ sagen _ _ +57-15 8350-8351 . _ . _ _ + +#Text=_ In seinem Zimmer riß er gleich die Schubladen des Schreibtischs auf , es lag dort alles in großer Ordnung , aber gerade die Legitimationspapiere , die er suchte , konnte er in der Aufregung nicht gleich finden . +58-1 8352-8353 _ _ _ _ _ +58-2 8354-8356 In _ In _ _ +58-3 8357-8363 seinem _ seinem CM[71]|CM[89] COREFERENTIAL->71-2|ANAPHORIC->89-77 +58-4 8364-8370 Zimmer _ Zimmer CM[71] COREFERENTIAL->71-2 +58-5 8371-8374 riß _ riss _ _ +58-6 8375-8377 er _ er CM[89] ANAPHORIC->89-78 +58-7 8378-8384 gleich _ gleich _ _ +58-8 8385-8388 die _ die _ _ +58-9 8389-8399 Schubladen _ Schubladen _ _ +58-10 8400-8403 des _ des _ _ +58-11 8404-8417 Schreibtischs _ Schreibtischs _ _ +58-12 8418-8421 auf _ auf _ _ +58-13 8422-8423 , _ , _ _ +58-14 8424-8426 es _ es _ _ +58-15 8427-8430 lag _ lag _ _ +58-16 8431-8435 dort _ dort _ _ +58-17 8436-8441 alles _ alles _ _ +58-18 8442-8444 in _ in _ _ +58-19 8445-8451 großer _ großer _ _ +58-20 8452-8459 Ordnung _ Ordnung _ _ +58-21 8460-8461 , _ , _ _ +58-22 8462-8466 aber _ aber _ _ +58-23 8467-8473 gerade _ gerade _ _ +58-24 8474-8477 die _ die CM[30] ANAPHORIC->30-1 +58-25 8478-8498 Legitimationspapiere _ Legitimationspapiere CM[30] ANAPHORIC->30-1 +58-26 8499-8500 , _ , _ _ +58-27 8501-8504 die _ die CM[30] *->30-2 +58-28 8505-8507 er _ er CM[89] ANAPHORIC->89-79 +58-29 8508-8514 suchte _ suchte _ _ +58-30 8515-8516 , _ , _ _ +58-31 8517-8523 konnte _ konnte _ _ +58-32 8524-8526 er _ er CM[89] ANAPHORIC->89-80 +58-33 8527-8529 in _ in _ _ +58-34 8530-8533 der _ der _ _ +58-35 8534-8543 Aufregung _ Aufregung _ _ +58-36 8544-8549 nicht _ nicht _ _ +58-37 8550-8556 gleich _ gleich _ _ +58-38 8557-8563 finden _ finden _ _ +58-39 8564-8565 . _ . _ _ + +#Text=_ Schließlich fand er seine Radfahrlegitimation und _ wollte schon mit ihr zu den Wächtern gehen , dann aber schien ihm das Papier zu geringfügig und er suchte weiter , bis er den Geburtsschein fand . +59-1 8566-8567 _ _ _ _ _ +59-2 8568-8579 Schließlich _ Schließlich _ _ +59-3 8580-8584 fand _ fand _ _ +59-4 8585-8587 er _ er CM[89] ANAPHORIC->89-81 +59-5 8588-8593 seine _ seine CM[31]|CM[89] ANAPHORIC->31-1|ANAPHORIC->89-82 +59-6 8594-8613 Radfahrlegitimation _ Radfahrlegitimation CM[31] ANAPHORIC->31-1 +59-7 8614-8617 und _ und _ _ +59-8 8618-8619 _ _ er _ _ +59-9 8620-8626 wollte _ wollte _ _ +59-10 8627-8632 schon _ schon _ _ +59-11 8633-8636 mit _ mit _ _ +59-12 8637-8640 ihr _ ihr CM[31] COREFERENTIAL->31-2 +59-13 8641-8643 zu _ zu _ _ +59-14 8644-8647 den _ den CM[24] COREFERENTIAL->24-9 +59-15 8648-8656 Wächtern _ Wächtern CM[24] COREFERENTIAL->24-9 +59-16 8657-8662 gehen _ gehen _ _ +59-17 8663-8664 , _ , _ _ +59-18 8665-8669 dann _ dann _ _ +59-19 8670-8674 aber _ aber _ _ +59-20 8675-8681 schien _ schien _ _ +59-21 8682-8685 ihm _ ihm CM[84]|CM[89] ANAPHORIC->84-1|ANAPHORIC->89-83 +59-22 8686-8689 das _ das CM[31] *->31-3 +59-23 8690-8696 Papier _ Papier CM[31] *->31-3 +59-24 8697-8699 zu _ zu _ _ +59-25 8700-8711 geringfügig _ geringfügig _ _ +59-26 8712-8715 und _ und _ _ +59-27 8716-8718 er _ er CM[84]|CM[89] ANAPHORIC->84-2|ANAPHORIC->89-84 +59-28 8719-8725 suchte _ suchte _ _ +59-29 8726-8732 weiter _ weiter _ _ +59-30 8733-8734 , _ , _ _ +59-31 8735-8738 bis _ bis _ _ +59-32 8739-8741 er _ er CM[84]|CM[89] ANAPHORIC->84-3|ANAPHORIC->89-85 +59-33 8742-8745 den _ den CM[85] COREFERENTIAL->85-1 +59-34 8746-8759 Geburtsschein _ Geburtsschein CM[85] COREFERENTIAL->85-1 +59-35 8760-8764 fand _ fand _ _ +59-36 8765-8766 . _ . _ _ + +#Text=_ Als er wieder in das Nebenzimmer zurückkam , öffnete sich gerade die gegenüberliegende Tür und Frau Grubach wollte dort eintreten . +60-1 8767-8768 _ _ _ _ _ +60-2 8769-8772 Als _ Als _ _ +60-3 8773-8775 er _ er CM[84]|CM[89] COREFERENTIAL->84-4|COREFERENTIAL->89-86 +60-4 8776-8782 wieder _ wieder _ _ +60-5 8783-8785 in _ in _ _ +60-6 8786-8789 das _ das CM[32] *->32-2 +60-7 8790-8801 Nebenzimmer _ Nebenzimmer CM[32] *->32-2 +60-8 8802-8811 zurückkam _ zurückkam _ _ +60-9 8812-8813 , _ , _ _ +60-10 8814-8821 öffnete _ öffnete _ _ +60-11 8822-8826 sich _ sich _ _ +60-12 8827-8833 gerade _ gerade _ _ +60-13 8834-8837 die _ die _ _ +60-14 8838-8855 gegenüberliegende _ gegenüberliegende _ _ +60-15 8856-8859 Tür _ Tür _ _ +60-16 8860-8863 und _ und _ _ +60-17 8864-8868 Frau _ Frau CM[1] ANAPHORIC->1-6 +60-18 8869-8876 Grubach PER Grubach CM[1] ANAPHORIC->1-6 +60-19 8877-8883 wollte _ wollte _ _ +60-20 8884-8888 dort _ dort _ _ +60-21 8889-8898 eintreten _ eintreten _ _ +60-22 8899-8900 . _ . _ _ + +#Text=_ Man sah sie nur einen Augenblick , denn kaum hatte sie K. erkannt , als sie offenbar verlegen wurde , _ um Verzeihung bat , _ verschwand und _ äußerst vorsichtig die Tür schloß . +61-1 8901-8902 _ _ _ _ _ +61-2 8903-8906 Man _ Man _ _ +61-3 8907-8910 sah _ sah _ _ +61-4 8911-8914 sie _ sie CM[1] ANAPHORIC->1-7 +61-5 8915-8918 nur _ nur _ _ +61-6 8919-8924 einen _ einen _ _ +61-7 8925-8935 Augenblick _ Augenblick _ _ +61-8 8936-8937 , _ , _ _ +61-9 8938-8942 denn _ denn _ _ +61-10 8943-8947 kaum _ kaum _ _ +61-11 8948-8953 hatte _ hatte _ _ +61-12 8954-8957 sie _ sie CM[1] ANAPHORIC->1-8 +61-13 8958-8960 K. PER K. CM[84]|CM[89] COREFERENTIAL->84-5|COREFERENTIAL->89-87 +61-14 8961-8968 erkannt _ erkannt _ _ +61-15 8969-8970 , _ , _ _ +61-16 8971-8974 als _ als _ _ +61-17 8975-8978 sie _ sie CM[1] ANAPHORIC->1-9 +61-18 8979-8987 offenbar _ offenbar _ _ +61-19 8988-8996 verlegen _ verlegen _ _ +61-20 8997-9002 wurde _ wurde _ _ +61-21 9003-9004 , _ , _ _ +61-22 9005-9006 _ _ sie _ _ +61-23 9007-9009 um _ um _ _ +61-24 9010-9020 Verzeihung _ Verzeihung _ _ +61-25 9021-9024 bat _ bat _ _ +61-26 9025-9026 , _ , _ _ +61-27 9027-9028 _ _ sie _ _ +61-28 9029-9039 verschwand _ verschwand _ _ +61-29 9040-9043 und _ und _ _ +61-30 9044-9045 _ _ sie _ _ +61-31 9046-9053 äußerst _ äußerst _ _ +61-32 9054-9064 vorsichtig _ vorsichtig _ _ +61-33 9065-9068 die _ die _ _ +61-34 9069-9072 Tür _ Tür _ _ +61-35 9073-9079 schloß _ schloss _ _ +61-36 9080-9081 . _ . _ _ + +#Text=_ " Kommen Sie doch herein " , hatte K. gerade noch sagen können . +62-1 9082-9083 _ _ _ _ _ +62-2 9084-9085 " _ " _ _ +62-3 9086-9092 Kommen _ Kommen _ _ +62-4 9093-9096 Sie _ Sie _ _ +62-5 9097-9101 doch _ doch _ _ +62-6 9102-9108 herein _ herein _ _ +62-7 9109-9110 " _ " _ _ +62-8 9111-9112 , _ , _ _ +62-9 9113-9118 hatte _ hatte _ _ +62-10 9119-9121 K. PER K. CM[84]|CM[89] ANAPHORIC->84-6|ANAPHORIC->89-88 +62-11 9122-9128 gerade _ gerade _ _ +62-12 9129-9133 noch _ noch _ _ +62-13 9134-9139 sagen _ sagen _ _ +62-14 9140-9146 können _ können _ _ +62-15 9147-9148 . _ . _ _ + +#Text=_ Nun aber stand er mit seinen Papieren in der Mitte des Zimmers , _ sah noch auf die Tür hin , die sich nicht wieder öffnete , und _ wurde erst durch einen Anruf der Wächter aufgeschreckt , die bei dem Tischchen am offenen Fenster saßen und , wie K. jetzt erkannte , _ sein Frühstück verzehrten . +63-1 9149-9150 _ _ _ _ _ +63-2 9151-9154 Nun _ Nun _ _ +63-3 9155-9159 aber _ aber _ _ +63-4 9160-9165 stand _ stand _ _ +63-5 9166-9168 er _ er CM[84]|CM[89] ANAPHORIC->84-7|ANAPHORIC->89-89 +63-6 9169-9172 mit _ mit _ _ +63-7 9173-9179 seinen _ seinen CM[84]|CM[89] COREFERENTIAL->84-8|COREFERENTIAL->89-90 +63-8 9180-9188 Papieren _ Papieren _ _ +63-9 9189-9191 in _ in _ _ +63-10 9192-9195 der _ der _ _ +63-11 9196-9201 Mitte _ Mitte _ _ +63-12 9202-9205 des _ des _ _ +63-13 9206-9213 Zimmers _ Zimmers _ _ +63-14 9214-9215 , _ , _ _ +63-15 9216-9217 _ _ er _ _ +63-16 9218-9221 sah _ sah _ _ +63-17 9222-9226 noch _ noch _ _ +63-18 9227-9230 auf _ auf _ _ +63-19 9231-9234 die _ die CM[33] ANAPHORIC->33-1 +63-20 9235-9238 Tür _ Tür CM[33] ANAPHORIC->33-1 +63-21 9239-9242 hin _ hin _ _ +63-22 9243-9244 , _ , _ _ +63-23 9245-9248 die _ die CM[33] *->33-2 +63-24 9249-9253 sich _ sich _ _ +63-25 9254-9259 nicht _ nicht _ _ +63-26 9260-9266 wieder _ wieder _ _ +63-27 9267-9274 öffnete _ öffnete _ _ +63-28 9275-9276 , _ , _ _ +63-29 9277-9280 und _ und _ _ +63-30 9281-9282 _ _ er _ _ +63-31 9283-9288 wurde _ wurde _ _ +63-32 9289-9293 erst _ erst _ _ +63-33 9294-9299 durch _ durch _ _ +63-34 9300-9305 einen _ einen _ _ +63-35 9306-9311 Anruf _ Anruf _ _ +63-36 9312-9315 der _ der CM[24] ANAPHORIC->24-10 +63-37 9316-9323 Wächter _ Wächter CM[24] ANAPHORIC->24-10 +63-38 9324-9337 aufgeschreckt _ aufgeschreckt _ _ +63-39 9338-9339 , _ , _ _ +63-40 9340-9343 die _ die CM[24] COREFERENTIAL->24-11 +63-41 9344-9347 bei _ bei _ _ +63-42 9348-9351 dem _ dem _ _ +63-43 9352-9361 Tischchen _ Tischchen _ _ +63-44 9362-9364 am _ am _ _ +63-45 9365-9372 offenen _ offenen _ _ +63-46 9373-9380 Fenster _ Fenster _ _ +63-47 9381-9386 saßen _ saßen _ _ +63-48 9387-9390 und _ und _ _ +63-49 9391-9392 , _ , _ _ +63-50 9393-9396 wie _ wie _ _ +63-51 9397-9399 K. PER K. CM[84]|CM[89] ANAPHORIC->84-9|ANAPHORIC->89-91 +63-52 9400-9405 jetzt _ jetzt _ _ +63-53 9406-9414 erkannte _ erkannte _ _ +63-54 9415-9416 , _ , _ _ +63-55 9417-9418 _ _ die _ _ +63-56 9419-9423 sein _ sein CM[84]|CM[87]|CM[89] ANAPHORIC->84-10|COREFERENTIAL->87-1|ANAPHORIC->89-92 +63-57 9424-9433 Frühstück _ Frühstück CM[87] COREFERENTIAL->87-1 +63-58 9434-9444 verzehrten _ verzehrten _ _ +63-59 9445-9446 . _ . _ _ + +#Text=_ " Warum ist sie nicht eingetreten ? " fragte er . +64-1 9447-9448 _ _ _ _ _ +64-2 9449-9450 " _ " _ _ +64-3 9451-9456 Warum _ Warum _ _ +64-4 9457-9460 ist _ ist _ _ +64-5 9461-9464 sie _ sie CM[1] ANAPHORIC->1-10 +64-6 9465-9470 nicht _ nicht _ _ +64-7 9471-9482 eingetreten _ eingetreten _ _ +64-8 9483-9484 ? _ ? _ _ +64-9 9485-9486 " _ " _ _ +64-10 9487-9493 fragte _ fragte _ _ +64-11 9494-9496 er _ er CM[84]|CM[89] COREFERENTIAL->84-11|COREFERENTIAL->89-93 +64-12 9497-9498 . _ . _ _ + +#Text=_ " Sie darf _ nicht " , sagte der große Wächter . +65-1 9499-9500 _ _ _ _ _ +65-2 9501-9502 " _ " _ _ +65-3 9503-9506 Sie _ Sie CM[1] COREFERENTIAL->1-11 +65-4 9507-9511 darf _ darf _ _ +65-5 9512-9513 _ _ es _ _ +65-6 9514-9519 nicht _ nicht _ _ +65-7 9520-9521 " _ " _ _ +65-8 9522-9523 , _ , _ _ +65-9 9524-9529 sagte _ sagte _ _ +65-10 9530-9533 der _ der CM[34] COREFERENTIAL->34-1 +65-11 9534-9539 große _ große CM[34] COREFERENTIAL->34-1 +65-12 9540-9547 Wächter _ Wächter CM[34] COREFERENTIAL->34-1 +65-13 9548-9549 . _ . _ _ + +#Text=_ " Sie sind doch verhaftet . " +66-1 9550-9551 _ _ _ _ _ +66-2 9552-9553 " _ " _ _ +66-3 9554-9557 Sie _ Sie _ _ +66-4 9558-9562 sind _ sind _ _ +66-5 9563-9567 doch _ doch _ _ +66-6 9568-9577 verhaftet _ verhaftet _ _ +66-7 9578-9579 . _ . _ _ +66-8 9580-9581 " _ " _ _ + +#Text=_ " Wie kann ich denn verhaftet sein ? +67-1 9582-9583 _ _ _ _ _ +67-2 9584-9585 " _ " _ _ +67-3 9586-9589 Wie _ Wie _ _ +67-4 9590-9594 kann _ kann _ _ +67-5 9595-9598 ich _ ich _ _ +67-6 9599-9603 denn _ denn _ _ +67-7 9604-9613 verhaftet _ verhaftet _ _ +67-8 9614-9618 sein _ sein _ _ +67-9 9619-9620 ? _ ? _ _ + +#Text=_ Und gar auf diese Weise _ _ ? " +68-1 9621-9622 _ _ _ _ _ +68-2 9623-9626 Und _ Und _ _ +68-3 9627-9630 gar _ gar _ _ +68-4 9631-9634 auf _ auf _ _ +68-5 9635-9640 diese _ diese _ _ +68-6 9641-9646 Weise _ Weise _ _ +68-7 9647-9648 _ _ verhaftet _ _ +68-8 9649-9650 _ _ sein _ _ +68-9 9651-9652 ? _ ? _ _ +68-10 9653-9654 " _ " _ _ + +#Text=_ " Nun fangen Sie also wieder an " , sagte der Wächter und _ tauchte ein Butterbrot ins Honigfäßchen . +69-1 9655-9656 _ _ _ _ _ +69-2 9657-9658 " _ " _ _ +69-3 9659-9662 Nun _ Nun _ _ +69-4 9663-9669 fangen _ fangen _ _ +69-5 9670-9673 Sie _ Sie _ _ +69-6 9674-9678 also _ also _ _ +69-7 9679-9685 wieder _ wieder _ _ +69-8 9686-9688 an _ an _ _ +69-9 9689-9690 " _ " _ _ +69-10 9691-9692 , _ , _ _ +69-11 9693-9698 sagte _ sagte _ _ +69-12 9699-9702 der _ der CM[34] COREFERENTIAL->34-2 +69-13 9703-9710 Wächter _ Wächter CM[34] COREFERENTIAL->34-2 +69-14 9711-9714 und _ und _ _ +69-15 9715-9716 _ _ er _ _ +69-16 9717-9724 tauchte _ tauchte _ _ +69-17 9725-9728 ein _ ein _ _ +69-18 9729-9739 Butterbrot _ Butterbrot _ _ +69-19 9740-9743 ins _ ins _ _ +69-20 9744-9756 Honigfäßchen _ Honigfässchen _ _ +69-21 9757-9758 . _ . _ _ + +#Text=_ " Solche Fragen beantworten wir nicht . " +70-1 9759-9760 _ _ _ _ _ +70-2 9761-9762 " _ " _ _ +70-3 9763-9769 Solche _ Solche CM[35] ANAPHORIC->35-1 +70-4 9770-9776 Fragen _ Fragen CM[35] ANAPHORIC->35-1 +70-5 9777-9788 beantworten _ beantworten _ _ +70-6 9789-9792 wir _ wir _ _ +70-7 9793-9798 nicht _ nicht _ _ +70-8 9799-9800 . _ . _ _ +70-9 9801-9802 " _ " _ _ + +#Text=_ " Sie werden sie beantworten müssen " , sagte K. +71-1 9803-9804 _ _ _ _ _ +71-2 9805-9806 " _ " _ _ +71-3 9807-9810 Sie _ Sie _ _ +71-4 9811-9817 werden _ werden _ _ +71-5 9818-9821 sie _ sie CM[35] *->35-2 +71-6 9822-9833 beantworten _ beantworten _ _ +71-7 9834-9840 müssen _ müssen _ _ +71-8 9841-9842 " _ " _ _ +71-9 9843-9844 , _ , _ _ +71-10 9845-9850 sagte _ sagte _ _ +71-11 9851-9853 K. PER K. CM[84]|CM[89] COREFERENTIAL->84-12|COREFERENTIAL->89-94 + +#Text=_ " Hier sind meine Legitimationspapiere , zeigen Sie mir jetzt die Ihrigen und _ _ _ vor allem den Verhaftbefehl . " +72-1 9854-9855 _ _ _ _ _ +72-2 9856-9857 " _ " _ _ +72-3 9858-9862 Hier _ Hier _ _ +72-4 9863-9867 sind _ sind _ _ +72-5 9868-9873 meine _ meine CM[85] COREFERENTIAL->85-2 +72-6 9874-9894 Legitimationspapiere _ Legitimationspapiere CM[85] COREFERENTIAL->85-2 +72-7 9895-9896 , _ , _ _ +72-8 9897-9903 zeigen _ zeigen _ _ +72-9 9904-9907 Sie _ Sie _ _ +72-10 9908-9911 mir _ mir _ _ +72-11 9912-9917 jetzt _ jetzt _ _ +72-12 9918-9921 die _ die _ _ +72-13 9922-9929 Ihrigen _ Ihrigen _ _ +72-14 9930-9933 und _ und _ _ +72-15 9934-9935 _ _ zeigen _ _ +72-16 9936-9937 _ _ Sie _ _ +72-17 9938-9939 _ _ mir _ _ +72-18 9940-9943 vor _ vor _ _ +72-19 9944-9949 allem _ allem _ _ +72-20 9950-9953 den _ den _ _ +72-21 9954-9967 Verhaftbefehl _ Verhaftbefehl _ _ +72-22 9968-9969 . _ . _ _ +72-23 9970-9971 " _ " _ _ + +#Text=_ " Du lieber Himmel ! " sagte der Wächter . +73-1 9972-9973 _ _ _ _ _ +73-2 9974-9975 " _ " _ _ +73-3 9976-9978 Du _ Du _ _ +73-4 9979-9985 lieber _ lieber _ _ +73-5 9986-9992 Himmel _ Himmel _ _ +73-6 9993-9994 ! _ ! _ _ +73-7 9995-9996 " _ " _ _ +73-8 9997-10002 sagte _ sagte _ _ +73-9 10003-10006 der _ der CM[34] COREFERENTIAL->34-3 +73-10 10007-10014 Wächter _ Wächter CM[34] COREFERENTIAL->34-3 +73-11 10015-10016 . _ . _ _ + +#Text=_ " Daß Sie sich in Ihre Lage nicht fügen können und daß Sie es darauf angelegt zu haben scheinen , uns , die wir Ihnen jetzt wahrscheinlich von allen Ihren Mitmenschen am nächsten stehen , nutzlos zu reizen ! " +74-1 10017-10018 _ _ _ _ _ +74-2 10019-10020 " _ " _ _ +74-3 10021-10024 Daß _ dass _ _ +74-4 10025-10028 Sie _ Sie _ _ +74-5 10029-10033 sich _ sich _ _ +74-6 10034-10036 in _ in _ _ +74-7 10037-10041 Ihre _ Ihre _ _ +74-8 10042-10046 Lage _ Lage _ _ +74-9 10047-10052 nicht _ nicht _ _ +74-10 10053-10058 fügen _ fügen _ _ +74-11 10059-10065 können _ können _ _ +74-12 10066-10069 und _ und _ _ +74-13 10070-10073 daß _ dass _ _ +74-14 10074-10077 Sie _ Sie _ _ +74-15 10078-10080 es _ es _ _ +74-16 10081-10087 darauf _ darauf _ _ +74-17 10088-10096 angelegt _ angelegt _ _ +74-18 10097-10099 zu _ zu _ _ +74-19 10100-10105 haben _ haben _ _ +74-20 10106-10114 scheinen _ scheinen _ _ +74-21 10115-10116 , _ , _ _ +74-22 10117-10120 uns _ uns _ _ +74-23 10121-10122 , _ , _ _ +74-24 10123-10126 die _ die _ _ +74-25 10127-10130 wir _ wir _ _ +74-26 10131-10136 Ihnen _ Ihnen _ _ +74-27 10137-10142 jetzt _ jetzt _ _ +74-28 10143-10157 wahrscheinlich _ wahrscheinlich _ _ +74-29 10158-10161 von _ von _ _ +74-30 10162-10167 allen _ allen _ _ +74-31 10168-10173 Ihren _ Ihren _ _ +74-32 10174-10185 Mitmenschen _ Mitmenschen _ _ +74-33 10186-10188 am _ am _ _ +74-34 10189-10197 nächsten _ nächsten _ _ +74-35 10198-10204 stehen _ stehen _ _ +74-36 10205-10206 , _ , _ _ +74-37 10207-10214 nutzlos _ nutzlos _ _ +74-38 10215-10217 zu _ zu _ _ +74-39 10218-10224 reizen _ reizen _ _ +74-40 10225-10226 ! _ ! _ _ +74-41 10227-10228 " _ " _ _ + +#Text=_ " Es ist so , glauben Sie es doch " , sagte Franz , _ führte die Kaffeetasse , die er in der Hand hielt , nicht zum Mund , sondern _ sah K. mit einem langen , wahrscheinlich bedeutungsvollen , aber unverständlichen Blick an . +75-1 10229-10230 _ _ _ _ _ +75-2 10231-10232 " _ " _ _ +75-3 10233-10235 Es _ Es _ _ +75-4 10236-10239 ist _ ist _ _ +75-5 10240-10242 so _ so _ _ +75-6 10243-10244 , _ , _ _ +75-7 10245-10252 glauben _ glauben _ _ +75-8 10253-10256 Sie _ Sie _ _ +75-9 10257-10259 es _ es _ _ +75-10 10260-10264 doch _ doch _ _ +75-11 10265-10266 " _ " _ _ +75-12 10267-10268 , _ , _ _ +75-13 10269-10274 sagte _ sagte _ _ +75-14 10275-10280 Franz PER Franz CM[14] ANAPHORIC->14-9 +75-15 10281-10282 , _ , _ _ +75-16 10283-10284 _ _ er _ _ +75-17 10285-10291 führte _ führte _ _ +75-18 10292-10295 die _ die CM[36] ANAPHORIC->36-1 +75-19 10296-10307 Kaffeetasse _ Kaffeetasse CM[36] ANAPHORIC->36-1 +75-20 10308-10309 , _ , _ _ +75-21 10310-10313 die _ die CM[36] *->36-2 +75-22 10314-10316 er _ er CM[14] COREFERENTIAL->14-10 +75-23 10317-10319 in _ in _ _ +75-24 10320-10323 der _ der _ _ +75-25 10324-10328 Hand _ Hand _ _ +75-26 10329-10334 hielt _ hielt _ _ +75-27 10335-10336 , _ , _ _ +75-28 10337-10342 nicht _ nicht _ _ +75-29 10343-10346 zum _ zum _ _ +75-30 10347-10351 Mund _ Mund _ _ +75-31 10352-10353 , _ , _ _ +75-32 10354-10361 sondern _ sondern _ _ +75-33 10362-10363 _ _ er _ _ +75-34 10364-10367 sah _ sah _ _ +75-35 10368-10370 K. PER K. CM[84]|CM[89] COREFERENTIAL->84-13|COREFERENTIAL->89-95 +75-36 10371-10374 mit _ mit _ _ +75-37 10375-10380 einem _ einem _ _ +75-38 10381-10387 langen _ langen _ _ +75-39 10388-10389 , _ , _ _ +75-40 10390-10404 wahrscheinlich _ wahrscheinlich _ _ +75-41 10405-10421 bedeutungsvollen _ bedeutungsvollen _ _ +75-42 10422-10423 , _ , _ _ +75-43 10424-10428 aber _ aber _ _ +75-44 10429-10445 unverständlichen _ unverständlichen _ _ +75-45 10446-10451 Blick _ Blick _ _ +75-46 10452-10454 an _ an _ _ +75-47 10455-10456 . _ . _ _ + +#Text=_ K. ließ sich , ohne es zu wollen , in ein Zwiegespräch der Blicke mit Franz ein , _ schlug dann aber doch auf seine Papiere und _ sagte : " Hier sind meine Legitimationspapiere . " +76-1 10457-10458 _ _ _ _ _ +76-2 10459-10461 K. PER K. CM[84]|CM[89] ANAPHORIC->84-14|ANAPHORIC->89-96 +76-3 10462-10466 ließ _ ließ _ _ +76-4 10467-10471 sich _ sich _ _ +76-5 10472-10473 , _ , _ _ +76-6 10474-10478 ohne _ ohne _ _ +76-7 10479-10481 es _ es _ _ +76-8 10482-10484 zu _ zu _ _ +76-9 10485-10491 wollen _ wollen _ _ +76-10 10492-10493 , _ , _ _ +76-11 10494-10496 in _ in _ _ +76-12 10497-10500 ein _ ein _ _ +76-13 10501-10513 Zwiegespräch _ Zwiegespräch _ _ +76-14 10514-10517 der _ der _ _ +76-15 10518-10524 Blicke _ Blicke _ _ +76-16 10525-10528 mit _ mit _ _ +76-17 10529-10534 Franz PER Franz CM[14] COREFERENTIAL->14-11 +76-18 10535-10538 ein _ ein _ _ +76-19 10539-10540 , _ , _ _ +76-20 10541-10542 _ _ er _ _ +76-21 10543-10549 schlug _ schlug _ _ +76-22 10550-10554 dann _ dann _ _ +76-23 10555-10559 aber _ aber _ _ +76-24 10560-10564 doch _ doch _ _ +76-25 10565-10568 auf _ auf _ _ +76-26 10569-10574 seine _ seine CM[84]|CM[85]|CM[89] COREFERENTIAL->84-15|COREFERENTIAL->85-3|COREFERENTIAL->89-97 +76-27 10575-10582 Papiere _ Papiere CM[85] COREFERENTIAL->85-3 +76-28 10583-10586 und _ und _ _ +76-29 10587-10588 _ _ er _ _ +76-30 10589-10594 sagte _ sagte _ _ +76-31 10595-10596 : _ : _ _ +76-32 10597-10598 " _ " _ _ +76-33 10599-10603 Hier _ Hier _ _ +76-34 10604-10608 sind _ sind _ _ +76-35 10609-10614 meine _ meine CM[85] ANAPHORIC->85-4 +76-36 10615-10635 Legitimationspapiere _ Legitimationspapiere CM[85] ANAPHORIC->85-4 +76-37 10636-10637 . _ . _ _ +76-38 10638-10639 " _ " _ _ + +#Text=_ " Was kümmern uns denn die ? " rief nun schon der große Wächter . +77-1 10640-10641 _ _ _ _ _ +77-2 10642-10643 " _ " _ _ +77-3 10644-10647 Was _ Was _ _ +77-4 10648-10655 kümmern _ kümmern _ _ +77-5 10656-10659 uns _ uns _ _ +77-6 10660-10664 denn _ denn _ _ +77-7 10665-10668 die _ die CM[85] *->85-5 +77-8 10669-10670 ? _ ? _ _ +77-9 10671-10672 " _ " _ _ +77-10 10673-10677 rief _ rief _ _ +77-11 10678-10681 nun _ nun _ _ +77-12 10682-10687 schon _ schon _ _ +77-13 10688-10691 der _ der CM[34] COREFERENTIAL->34-4 +77-14 10692-10697 große _ große CM[34] COREFERENTIAL->34-4 +77-15 10698-10705 Wächter _ Wächter CM[34] COREFERENTIAL->34-4 +77-16 10706-10707 . _ . _ _ + +#Text=_ " Sie führen sich ärger auf als ein Kind . +78-1 10708-10709 _ _ _ _ _ +78-2 10710-10711 " _ " _ _ +78-3 10712-10715 Sie _ Sie _ _ +78-4 10716-10722 führen _ führen _ _ +78-5 10723-10727 sich _ sich _ _ +78-6 10728-10733 ärger _ ärger _ _ +78-7 10734-10737 auf _ auf _ _ +78-8 10738-10741 als _ als _ _ +78-9 10742-10745 ein _ ein _ _ +78-10 10746-10750 Kind _ Kind _ _ +78-11 10751-10752 . _ . _ _ + +#Text=_ Was wollen Sie denn ? +79-1 10753-10754 _ _ _ _ _ +79-2 10755-10758 Was _ Was _ _ +79-3 10759-10765 wollen _ wollen _ _ +79-4 10766-10769 Sie _ Sie _ _ +79-5 10770-10774 denn _ denn _ _ +79-6 10775-10776 ? _ ? _ _ + +#Text=_ Wollen Sie Ihren großen , verfluchten Prozeß dadurch zu einem raschen Ende bringen , daß Sie mit uns , den Wächtern , über Legitimation und Verhaftbefehl diskutieren ? +80-1 10777-10778 _ _ _ _ _ +80-2 10779-10785 Wollen _ Wollen _ _ +80-3 10786-10789 Sie _ Sie _ _ +80-4 10790-10795 Ihren _ Ihren _ _ +80-5 10796-10802 großen _ großen _ _ +80-6 10803-10804 , _ , _ _ +80-7 10805-10816 verfluchten _ verfluchten _ _ +80-8 10817-10823 Prozeß _ Prozess _ _ +80-9 10824-10831 dadurch _ dadurch _ _ +80-10 10832-10834 zu _ zu _ _ +80-11 10835-10840 einem _ einem _ _ +80-12 10841-10848 raschen _ raschen _ _ +80-13 10849-10853 Ende _ Ende _ _ +80-14 10854-10861 bringen _ bringen _ _ +80-15 10862-10863 , _ , _ _ +80-16 10864-10867 daß _ dass _ _ +80-17 10868-10871 Sie _ Sie _ _ +80-18 10872-10875 mit _ mit _ _ +80-19 10876-10879 uns _ uns _ _ +80-20 10880-10881 , _ , _ _ +80-21 10882-10885 den _ den _ _ +80-22 10886-10894 Wächtern _ Wächtern _ _ +80-23 10895-10896 , _ , _ _ +80-24 10897-10901 über _ über _ _ +80-25 10902-10914 Legitimation _ Legitimation _ _ +80-26 10915-10918 und _ und _ _ +80-27 10919-10932 Verhaftbefehl _ Verhaftbefehl _ _ +80-28 10933-10944 diskutieren _ diskutieren _ _ +80-29 10945-10946 ? _ ? _ _ + +#Text=_ Wir sind niedrige Angestellte , die sich in einem Legitimationspapier kaum auskennen und die mit Ihrer Sache nichts anderes zu tun haben , als daß sie zehn Stunden täglich bei Ihnen Wache halten und dafür bezahlt werden . +81-1 10947-10948 _ _ _ _ _ +81-2 10949-10952 Wir _ Wir _ _ +81-3 10953-10957 sind _ sind _ _ +81-4 10958-10966 niedrige _ niedrige CM[37] ANAPHORIC->37-1 +81-5 10967-10978 Angestellte _ Angestellte CM[37] ANAPHORIC->37-1 +81-6 10979-10980 , _ , _ _ +81-7 10981-10984 die _ die CM[37] ANAPHORIC->37-2 +81-8 10985-10989 sich _ sich _ _ +81-9 10990-10992 in _ in _ _ +81-10 10993-10998 einem _ einem _ _ +81-11 10999-11018 Legitimationspapier _ Legitimationspapier _ _ +81-12 11019-11023 kaum _ kaum _ _ +81-13 11024-11033 auskennen _ auskennen _ _ +81-14 11034-11037 und _ und _ _ +81-15 11038-11041 die _ die CM[37] ANAPHORIC->37-3 +81-16 11042-11045 mit _ mit _ _ +81-17 11046-11051 Ihrer _ Ihrer _ _ +81-18 11052-11057 Sache _ Sache _ _ +81-19 11058-11064 nichts _ nichts _ _ +81-20 11065-11072 anderes _ anderes _ _ +81-21 11073-11075 zu _ zu _ _ +81-22 11076-11079 tun _ tun _ _ +81-23 11080-11085 haben _ haben _ _ +81-24 11086-11087 , _ , _ _ +81-25 11088-11091 als _ als _ _ +81-26 11092-11095 daß _ dass _ _ +81-27 11096-11099 sie _ sie CM[37] *->37-4 +81-28 11100-11104 zehn _ zehn _ _ +81-29 11105-11112 Stunden _ Stunden _ _ +81-30 11113-11120 täglich _ täglich _ _ +81-31 11121-11124 bei _ bei _ _ +81-32 11125-11130 Ihnen _ Ihnen _ _ +81-33 11131-11136 Wache _ Wache _ _ +81-34 11137-11143 halten _ halten _ _ +81-35 11144-11147 und _ und _ _ +81-36 11148-11153 dafür _ dafür _ _ +81-37 11154-11161 bezahlt _ bezahlt _ _ +81-38 11162-11168 werden _ werden _ _ +81-39 11169-11170 . _ . _ _ + +#Text=_ Das ist alles , was wir sind , trotzdem aber sind wir fähig , einzusehen , daß die hohen Behörden , in deren Dienst wir stehen , ehe sie eine solche Verhaftung verfügen , sich sehr genau über die Gründe der Verhaftung und die Person des Verhafteten unterrichten . +82-1 11171-11172 _ _ _ _ _ +82-2 11173-11176 Das _ Das _ _ +82-3 11177-11180 ist _ ist _ _ +82-4 11181-11186 alles _ alles CM[38] BOUND->38-1 +82-5 11187-11188 , _ , _ _ +82-6 11189-11192 was _ was CM[38] *->38-2 +82-7 11193-11196 wir _ wir _ _ +82-8 11197-11201 sind _ sind _ _ +82-9 11202-11203 , _ , _ _ +82-10 11204-11212 trotzdem _ trotzdem _ _ +82-11 11213-11217 aber _ aber _ _ +82-12 11218-11222 sind _ sind _ _ +82-13 11223-11226 wir _ wir _ _ +82-14 11227-11232 fähig _ fähig _ _ +82-15 11233-11234 , _ , _ _ +82-16 11235-11245 einzusehen _ einzusehen _ _ +82-17 11246-11247 , _ , _ _ +82-18 11248-11251 daß _ dass _ _ +82-19 11252-11255 die _ die CM[39] ANAPHORIC->39-1 +82-20 11256-11261 hohen _ hohen CM[39] ANAPHORIC->39-1 +82-21 11262-11270 Behörden _ Behörden CM[39] ANAPHORIC->39-1 +82-22 11271-11272 , _ , _ _ +82-23 11273-11275 in _ in _ _ +82-24 11276-11281 deren _ deren CM[39] ANAPHORIC->39-2 +82-25 11282-11288 Dienst _ Dienst _ _ +82-26 11289-11292 wir _ wir _ _ +82-27 11293-11299 stehen _ stehen _ _ +82-28 11300-11301 , _ , _ _ +82-29 11302-11305 ehe _ ehe _ _ +82-30 11306-11309 sie _ sie CM[39] *->39-3 +82-31 11310-11314 eine _ eine CM[40] COREFERENTIAL->40-1 +82-32 11315-11321 solche _ solche CM[40] COREFERENTIAL->40-1 +82-33 11322-11332 Verhaftung _ Verhaftung CM[40] COREFERENTIAL->40-1 +82-34 11333-11341 verfügen _ verfügen _ _ +82-35 11342-11343 , _ , _ _ +82-36 11344-11348 sich _ sich _ _ +82-37 11349-11353 sehr _ sehr _ _ +82-38 11354-11359 genau _ genau _ _ +82-39 11360-11364 über _ über _ _ +82-40 11365-11368 die _ die _ _ +82-41 11369-11375 Gründe _ Gründe _ _ +82-42 11376-11379 der _ der CM[40] *->40-2 +82-43 11380-11390 Verhaftung _ Verhaftung CM[40] *->40-2 +82-44 11391-11394 und _ und _ _ +82-45 11395-11398 die _ die _ _ +82-46 11399-11405 Person _ Person _ _ +82-47 11406-11409 des _ des _ _ +82-48 11410-11421 Verhafteten _ Verhafteten _ _ +82-49 11422-11434 unterrichten _ unterrichten _ _ +82-50 11435-11436 . _ . _ _ + +#Text=_ Es gibt darin keinen Irrtum . +83-1 11437-11438 _ _ _ _ _ +83-2 11439-11441 Es _ Es _ _ +83-3 11442-11446 gibt _ gibt _ _ +83-4 11447-11452 darin _ darin _ _ +83-5 11453-11459 keinen _ keinen _ _ +83-6 11460-11466 Irrtum _ Irrtum _ _ +83-7 11467-11468 . _ . _ _ + +#Text=_ Unsere Behörde , soweit ich sie kenne , und ich kenne nur die niedrigsten Grade , sucht doch nicht etwa die Schuld in der Bevölkerung , sondern _ wird , wie es im Gesetz heißt , von der Schuld angezogen und _ muß uns Wächter ausschicken . +84-1 11469-11470 _ _ _ _ _ +84-2 11471-11477 Unsere _ Unsere CM[41] ANAPHORIC->41-1 +84-3 11478-11485 Behörde _ Behörde CM[41] ANAPHORIC->41-1 +84-4 11486-11487 , _ , _ _ +84-5 11488-11494 soweit _ soweit _ _ +84-6 11495-11498 ich _ ich _ _ +84-7 11499-11502 sie _ sie CM[41] *->41-2 +84-8 11503-11508 kenne _ kenne _ _ +84-9 11509-11510 , _ , _ _ +84-10 11511-11514 und _ und _ _ +84-11 11515-11518 ich _ ich _ _ +84-12 11519-11524 kenne _ kenne _ _ +84-13 11525-11528 nur _ nur _ _ +84-14 11529-11532 die _ die _ _ +84-15 11533-11544 niedrigsten _ niedrigsten _ _ +84-16 11545-11550 Grade _ Grade _ _ +84-17 11551-11552 , _ , _ _ +84-18 11553-11558 sucht _ sucht _ _ +84-19 11559-11563 doch _ doch _ _ +84-20 11564-11569 nicht _ nicht _ _ +84-21 11570-11574 etwa _ etwa _ _ +84-22 11575-11578 die _ die CM[42] ANAPHORIC->42-1 +84-23 11579-11585 Schuld _ Schuld CM[42] ANAPHORIC->42-1 +84-24 11586-11588 in _ in _ _ +84-25 11589-11592 der _ der _ _ +84-26 11593-11604 Bevölkerung _ Bevölkerung _ _ +84-27 11605-11606 , _ , _ _ +84-28 11607-11614 sondern _ sondern _ _ +84-29 11615-11616 _ _ sie _ _ +84-30 11617-11621 wird _ wird _ _ +84-31 11622-11623 , _ , _ _ +84-32 11624-11627 wie _ wie _ _ +84-33 11628-11630 es _ es _ _ +84-34 11631-11633 im _ im CM[43] ANAPHORIC->43-1 +84-35 11634-11640 Gesetz _ Gesetz CM[43] ANAPHORIC->43-1 +84-36 11641-11646 heißt _ heißt _ _ +84-37 11647-11648 , _ , _ _ +84-38 11649-11652 von _ von _ _ +84-39 11653-11656 der _ der CM[42] *->42-2 +84-40 11657-11663 Schuld _ Schuld CM[42] *->42-2 +84-41 11664-11673 angezogen _ angezogen _ _ +84-42 11674-11677 und _ und _ _ +84-43 11678-11679 _ _ sie _ _ +84-44 11680-11683 muß _ muss _ _ +84-45 11684-11687 uns _ uns _ _ +84-46 11688-11695 Wächter _ Wächter _ _ +84-47 11696-11707 ausschicken _ ausschicken _ _ +84-48 11708-11709 . _ . _ _ + +#Text=_ Das ist Gesetz . +85-1 11710-11711 _ _ _ _ _ +85-2 11712-11715 Das _ Das _ _ +85-3 11716-11719 ist _ ist _ _ +85-4 11720-11726 Gesetz _ Gesetz _ _ +85-5 11727-11728 . _ . _ _ + +#Text=_ Wo gäbe es da einen Irrtum ? " +86-1 11729-11730 _ _ _ _ _ +86-2 11731-11733 Wo _ Wo _ _ +86-3 11734-11738 gäbe _ gäbe _ _ +86-4 11739-11741 es _ es _ _ +86-5 11742-11744 da _ da _ _ +86-6 11745-11750 einen _ einen _ _ +86-7 11751-11757 Irrtum _ Irrtum _ _ +86-8 11758-11759 ? _ ? _ _ +86-9 11760-11761 " _ " _ _ + +#Text=_ " Dieses Gesetz kenne ich nicht " , sagte K. " +87-1 11762-11763 _ _ _ _ _ +87-2 11764-11765 " _ " _ _ +87-3 11766-11772 Dieses _ Dieses CM[43] COREFERENTIAL->43-2 +87-4 11773-11779 Gesetz _ Gesetz CM[43] COREFERENTIAL->43-2 +87-5 11780-11785 kenne _ kenne _ _ +87-6 11786-11789 ich _ ich _ _ +87-7 11790-11795 nicht _ nicht _ _ +87-8 11796-11797 " _ " _ _ +87-9 11798-11799 , _ , _ _ +87-10 11800-11805 sagte _ sagte _ _ +87-11 11806-11808 K. PER K. CM[84]|CM[89] ANAPHORIC->84-16|ANAPHORIC->89-98 +87-12 11809-11810 " _ " _ _ + +#Text=_ _ _ Desto schlimmer für Sie " , sagte der Wächter . +88-1 11811-11812 _ _ _ _ _ +88-2 11813-11814 _ _ Das _ _ +88-3 11815-11816 _ _ ist _ _ +88-4 11817-11822 Desto _ desto _ _ +88-5 11823-11832 schlimmer _ schlimmer _ _ +88-6 11833-11836 für _ für _ _ +88-7 11837-11840 Sie _ Sie _ _ +88-8 11841-11842 " _ " _ _ +88-9 11843-11844 , _ , _ _ +88-10 11845-11850 sagte _ sagte _ _ +88-11 11851-11854 der _ der CM[34] COREFERENTIAL->34-5 +88-12 11855-11862 Wächter _ Wächter CM[34] COREFERENTIAL->34-5 +88-13 11863-11864 . _ . _ _ + +#Text=_ " Es besteht wohl auch nur in Ihren Köpfen " , sagte K. , er wollte sich irgendwie in die Gedanken der Wächter einschleichen , sie zu seinen Gunsten wenden oder sich dort einbürgern . +89-1 11865-11866 _ _ _ _ _ +89-2 11867-11868 " _ " _ _ +89-3 11869-11871 Es _ Es _ _ +89-4 11872-11879 besteht _ besteht _ _ +89-5 11880-11884 wohl _ wohl _ _ +89-6 11885-11889 auch _ auch _ _ +89-7 11890-11893 nur _ nur _ _ +89-8 11894-11896 in _ in _ _ +89-9 11897-11902 Ihren _ Ihren _ _ +89-10 11903-11909 Köpfen _ Köpfen _ _ +89-11 11910-11911 " _ " _ _ +89-12 11912-11913 , _ , _ _ +89-13 11914-11919 sagte _ sagte _ _ +89-14 11920-11922 K. PER K. CM[84]|CM[89] ANAPHORIC->84-17|ANAPHORIC->89-99 +89-15 11923-11924 , _ , _ _ +89-16 11925-11927 er _ er CM[84]|CM[89] ANAPHORIC->84-18|ANAPHORIC->89-100 +89-17 11928-11934 wollte _ wollte _ _ +89-18 11935-11939 sich _ sich _ _ +89-19 11940-11949 irgendwie _ irgendwie _ _ +89-20 11950-11952 in _ in _ _ +89-21 11953-11956 die _ die CM[44] ANAPHORIC->44-1 +89-22 11957-11965 Gedanken _ Gedanken CM[44] ANAPHORIC->44-1 +89-23 11966-11969 der _ der CM[24]|CM[44] COREFERENTIAL->24-12|ANAPHORIC->44-1 +89-24 11970-11977 Wächter _ Wächter CM[24]|CM[44] COREFERENTIAL->24-12|ANAPHORIC->44-1 +89-25 11978-11991 einschleichen _ einschleichen _ _ +89-26 11992-11993 , _ , _ _ +89-27 11994-11997 sie _ sie CM[44] *->44-2 +89-28 11998-12000 zu _ zu _ _ +89-29 12001-12007 seinen _ seinen CM[84]|CM[89] ANAPHORIC->84-19|ANAPHORIC->89-101 +89-30 12008-12015 Gunsten _ Gunsten _ _ +89-31 12016-12022 wenden _ wenden _ _ +89-32 12023-12027 oder _ oder _ _ +89-33 12028-12032 sich _ sich CM[84]|CM[89] ANAPHORIC->84-20|ANAPHORIC->89-102 +89-34 12033-12037 dort _ dort _ _ +89-35 12038-12048 einbürgern _ einbürgern _ _ +89-36 12049-12050 . _ . _ _ + +#Text=_ Aber der Wächter sagte nur abweisend : " Sie werden es zu fühlen bekommen . " +90-1 12051-12052 _ _ _ _ _ +90-2 12053-12057 Aber _ Aber _ _ +90-3 12058-12061 der _ der _ _ +90-4 12062-12069 Wächter _ Wächter _ _ +90-5 12070-12075 sagte _ sagte _ _ +90-6 12076-12079 nur _ nur _ _ +90-7 12080-12089 abweisend _ abweisend _ _ +90-8 12090-12091 : _ : _ _ +90-9 12092-12093 " _ " _ _ +90-10 12094-12097 Sie _ Sie _ _ +90-11 12098-12104 werden _ werden _ _ +90-12 12105-12107 es _ es _ _ +90-13 12108-12110 zu _ zu _ _ +90-14 12111-12117 fühlen _ fühlen _ _ +90-15 12118-12126 bekommen _ bekommen _ _ +90-16 12127-12128 . _ . _ _ +90-17 12129-12130 " _ " _ _ + +#Text=_ Franz mischte sich ein und _ sagte : " Sieh , Willem , er gibt zu , er kenne das Gesetz nicht , und _ behauptet gleichzeitig , schuldlos zu sein . " +91-1 12131-12132 _ _ _ _ _ +91-2 12133-12138 Franz PER Franz CM[14] COREFERENTIAL->14-12 +91-3 12139-12146 mischte _ mischte _ _ +91-4 12147-12151 sich _ sich _ _ +91-5 12152-12155 ein _ ein _ _ +91-6 12156-12159 und _ und _ _ +91-7 12160-12161 _ _ er _ _ +91-8 12162-12167 sagte _ sagte _ _ +91-9 12168-12169 : _ : _ _ +91-10 12170-12171 " _ " _ _ +91-11 12172-12176 Sieh _ Sieh _ _ +91-12 12177-12178 , _ , _ _ +91-13 12179-12185 Willem PER Willem CM[34] ANAPHORIC->34-6 +91-14 12186-12187 , _ , _ _ +91-15 12188-12190 er _ er CM[84]|CM[89] ANAPHORIC->84-21|ANAPHORIC->89-103 +91-16 12191-12195 gibt _ gibt _ _ +91-17 12196-12198 zu _ zu _ _ +91-18 12199-12200 , _ , _ _ +91-19 12201-12203 er _ er CM[84]|CM[89] ANAPHORIC->84-22|ANAPHORIC->89-104 +91-20 12204-12209 kenne _ kenne _ _ +91-21 12210-12213 das _ das CM[43] *->43-3 +91-22 12214-12220 Gesetz _ Gesetz CM[43] *->43-3 +91-23 12221-12226 nicht _ nicht _ _ +91-24 12227-12228 , _ , _ _ +91-25 12229-12232 und _ und _ _ +91-26 12233-12234 _ _ er _ _ +91-27 12235-12244 behauptet _ behauptet _ _ +91-28 12245-12257 gleichzeitig _ gleichzeitig _ _ +91-29 12258-12259 , _ , _ _ +91-30 12260-12269 schuldlos _ schuldlos _ _ +91-31 12270-12272 zu _ zu _ _ +91-32 12273-12277 sein _ sein _ _ +91-33 12278-12279 . _ . _ _ +91-34 12280-12281 " _ " _ _ + +#Text=_ " Du hast ganz recht , aber ihm kann man nichts begreiflich machen " , sagte der andere . +92-1 12282-12283 _ _ _ _ _ +92-2 12284-12285 " _ " _ _ +92-3 12286-12288 Du _ Du _ _ +92-4 12289-12293 hast _ hast _ _ +92-5 12294-12298 ganz _ ganz _ _ +92-6 12299-12304 recht _ Recht _ _ +92-7 12305-12306 , _ , _ _ +92-8 12307-12311 aber _ aber _ _ +92-9 12312-12315 ihm _ ihm CM[84]|CM[89] COREFERENTIAL->84-23|COREFERENTIAL->89-105 +92-10 12316-12320 kann _ kann _ _ +92-11 12321-12324 man _ man _ _ +92-12 12325-12331 nichts _ nichts _ _ +92-13 12332-12343 begreiflich _ begreiflich _ _ +92-14 12344-12350 machen _ machen _ _ +92-15 12351-12352 " _ " _ _ +92-16 12353-12354 , _ , _ _ +92-17 12355-12360 sagte _ sagte _ _ +92-18 12361-12364 der _ der CM[34] COREFERENTIAL->34-7 +92-19 12365-12371 andere _ andere CM[34] COREFERENTIAL->34-7 +92-20 12372-12373 . _ . _ _ + +#Text=_ K. antwortete nichts mehr ; muß ich , dachte er , durch das Geschwätz dieser niedrigsten Organe – sie geben selbst zu , es zu sein – mich noch mehr verwirren lassen ? +93-1 12374-12375 _ _ _ _ _ +93-2 12376-12378 K. PER K. CM[84]|CM[89] ANAPHORIC->84-24|ANAPHORIC->89-106 +93-3 12379-12389 antwortete _ antwortete _ _ +93-4 12390-12396 nichts _ nichts _ _ +93-5 12397-12401 mehr _ mehr _ _ +93-6 12402-12403 ; _ \; _ _ +93-7 12404-12407 muß _ muss _ _ +93-8 12408-12411 ich _ ich _ _ +93-9 12412-12413 , _ , _ _ +93-10 12414-12420 dachte _ dachte _ _ +93-11 12421-12423 er _ er CM[84]|CM[89] ANAPHORIC->84-25|ANAPHORIC->89-107 +93-12 12424-12425 , _ , _ _ +93-13 12426-12431 durch _ durch _ _ +93-14 12432-12435 das _ das _ _ +93-15 12436-12445 Geschwätz _ Geschwätz _ _ +93-16 12446-12452 dieser _ dieser CM[24] ANAPHORIC->24-13 +93-17 12453-12464 niedrigsten _ niedrigsten CM[24] ANAPHORIC->24-13 +93-18 12465-12471 Organe _ Organe CM[24] ANAPHORIC->24-13 +93-19 12472-12473 – _ – _ _ +93-20 12474-12477 sie _ sie CM[24] ANAPHORIC->24-14 +93-21 12478-12483 geben _ geben _ _ +93-22 12484-12490 selbst _ selbst _ _ +93-23 12491-12493 zu _ zu _ _ +93-24 12494-12495 , _ , _ _ +93-25 12496-12498 es _ es _ _ +93-26 12499-12501 zu _ zu _ _ +93-27 12502-12506 sein _ sein _ _ +93-28 12507-12508 – _ – _ _ +93-29 12509-12513 mich _ mich _ _ +93-30 12514-12518 noch _ noch _ _ +93-31 12519-12523 mehr _ mehr _ _ +93-32 12524-12533 verwirren _ verwirren _ _ +93-33 12534-12540 lassen _ lassen _ _ +93-34 12541-12542 ? _ ? _ _ + +#Text=_ Sie reden doch jedenfalls von Dingen , die sie gar nicht verstehen . +94-1 12543-12544 _ _ _ _ _ +94-2 12545-12548 Sie _ Sie CM[24] ANAPHORIC->24-15 +94-3 12549-12554 reden _ reden _ _ +94-4 12555-12559 doch _ doch _ _ +94-5 12560-12570 jedenfalls _ jedenfalls _ _ +94-6 12571-12574 von _ von _ _ +94-7 12575-12581 Dingen _ Dingen CM[45] ANAPHORIC->45-1 +94-8 12582-12583 , _ , _ _ +94-9 12584-12587 die _ die CM[45] *->45-2 +94-10 12588-12591 sie _ sie CM[24] ANAPHORIC->24-16 +94-11 12592-12595 gar _ gar _ _ +94-12 12596-12601 nicht _ nicht _ _ +94-13 12602-12611 verstehen _ verstehen _ _ +94-14 12612-12613 . _ . _ _ + +#Text=_ Ihre Sicherheit ist nur durch ihre Dummheit möglich . +95-1 12614-12615 _ _ _ _ _ +95-2 12616-12620 Ihre _ Ihre _ _ +95-3 12621-12631 Sicherheit _ Sicherheit _ _ +95-4 12632-12635 ist _ ist _ _ +95-5 12636-12639 nur _ nur _ _ +95-6 12640-12645 durch _ durch _ _ +95-7 12646-12650 ihre _ ihre _ _ +95-8 12651-12659 Dummheit _ Dummheit _ _ +95-9 12660-12667 möglich _ möglich _ _ +95-10 12668-12669 . _ . _ _ + +#Text=_ Ein paar Worte , die ich mit einem mir ebenbürtigen Menschen sprechen werde , werden alles unvergleichlich klarer machen als die längsten Reden mit diesen . +96-1 12670-12671 _ _ _ _ _ +96-2 12672-12675 Ein _ Ein CM[46] ANAPHORIC->46-1 +96-3 12676-12680 paar _ paar CM[46] ANAPHORIC->46-1 +96-4 12681-12686 Worte _ Worte CM[46] ANAPHORIC->46-1 +96-5 12687-12688 , _ , _ _ +96-6 12689-12692 die _ die CM[46] *->46-2 +96-7 12693-12696 ich _ ich _ _ +96-8 12697-12700 mit _ mit _ _ +96-9 12701-12706 einem _ einem _ _ +96-10 12707-12710 mir _ mir _ _ +96-11 12711-12723 ebenbürtigen _ ebenbürtigen _ _ +96-12 12724-12732 Menschen _ Menschen _ _ +96-13 12733-12741 sprechen _ sprechen _ _ +96-14 12742-12747 werde _ werde _ _ +96-15 12748-12749 , _ , _ _ +96-16 12750-12756 werden _ werden _ _ +96-17 12757-12762 alles _ alles _ _ +96-18 12763-12778 unvergleichlich _ unvergleichlich _ _ +96-19 12779-12785 klarer _ klarer _ _ +96-20 12786-12792 machen _ machen _ _ +96-21 12793-12796 als _ als _ _ +96-22 12797-12800 die _ die _ _ +96-23 12801-12809 längsten _ längsten _ _ +96-24 12810-12815 Reden _ Reden _ _ +96-25 12816-12819 mit _ mit _ _ +96-26 12820-12826 diesen _ diesen CM[24] ANAPHORIC->24-17 +96-27 12827-12828 . _ . _ _ + +#Text=_ Er ging einige Male in dem freien Raum des Zimmers auf und ab , drüben sah er die alte Frau , die einen noch viel älteren Greis zum Fenster gezerrt hatte , den sie umschlungen hielt . +97-1 12829-12830 _ _ _ _ _ +97-2 12831-12833 Er _ Er CM[84]|CM[89] ANAPHORIC->84-26|ANAPHORIC->89-108 +97-3 12834-12838 ging _ ging _ _ +97-4 12839-12845 einige _ einige _ _ +97-5 12846-12850 Male _ Male _ _ +97-6 12851-12853 in _ in _ _ +97-7 12854-12857 dem _ dem _ _ +97-8 12858-12864 freien _ freien _ _ +97-9 12865-12869 Raum _ Raum _ _ +97-10 12870-12873 des _ des CM[71] COREFERENTIAL->71-3 +97-11 12874-12881 Zimmers _ Zimmers CM[71] COREFERENTIAL->71-3 +97-12 12882-12885 auf _ auf _ _ +97-13 12886-12889 und _ und _ _ +97-14 12890-12892 ab _ ab _ _ +97-15 12893-12894 , _ , _ _ +97-16 12895-12901 drüben _ drüben _ _ +97-17 12902-12905 sah _ sah _ _ +97-18 12906-12908 er _ er CM[84]|CM[89] ANAPHORIC->84-27|ANAPHORIC->89-109 +97-19 12909-12912 die _ die CM[48] ANAPHORIC->48-3 +97-20 12913-12917 alte _ alte CM[48] ANAPHORIC->48-3 +97-21 12918-12922 Frau _ Frau CM[48] ANAPHORIC->48-3 +97-22 12923-12924 , _ , _ _ +97-23 12925-12928 die _ die CM[48] ANAPHORIC->48-4 +97-24 12929-12934 einen _ einen CM[47] ANAPHORIC->47-1 +97-25 12935-12939 noch _ noch CM[47] ANAPHORIC->47-1 +97-26 12940-12944 viel _ viel CM[47] ANAPHORIC->47-1 +97-27 12945-12952 älteren _ älteren CM[47] ANAPHORIC->47-1 +97-28 12953-12958 Greis _ Greis CM[47] ANAPHORIC->47-1 +97-29 12959-12962 zum _ zum _ _ +97-30 12963-12970 Fenster _ Fenster _ _ +97-31 12971-12978 gezerrt _ gezerrt _ _ +97-32 12979-12984 hatte _ hatte _ _ +97-33 12985-12986 , _ , _ _ +97-34 12987-12990 den _ den CM[47] *->47-2 +97-35 12991-12994 sie _ sie CM[48] *->48-5 +97-36 12995-13006 umschlungen _ umschlungen _ _ +97-37 13007-13012 hielt _ hielt _ _ +97-38 13013-13014 . _ . _ _ + +#Text=_ K. mußte dieser Schaustellung ein Ende machen : " Führen Sie mich zu Ihrem Vorgesetzten " , sagte er . +98-1 13015-13016 _ _ _ _ _ +98-2 13017-13019 K. PER K. CM[84]|CM[89] ANAPHORIC->84-28|ANAPHORIC->89-110 +98-3 13020-13025 mußte _ musste _ _ +98-4 13026-13032 dieser _ dieser _ _ +98-5 13033-13046 Schaustellung _ Schaustellung _ _ +98-6 13047-13050 ein _ ein _ _ +98-7 13051-13055 Ende _ Ende _ _ +98-8 13056-13062 machen _ machen _ _ +98-9 13063-13064 : _ : _ _ +98-10 13065-13066 " _ " _ _ +98-11 13067-13073 Führen _ Führen _ _ +98-12 13074-13077 Sie _ Sie _ _ +98-13 13078-13082 mich _ mich _ _ +98-14 13083-13085 zu _ zu _ _ +98-15 13086-13091 Ihrem _ Ihrem CM[49] ANAPHORIC->49-1 +98-16 13092-13104 Vorgesetzten _ Vorgesetzten CM[49] ANAPHORIC->49-1 +98-17 13105-13106 " _ " _ _ +98-18 13107-13108 , _ , _ _ +98-19 13109-13114 sagte _ sagte _ _ +98-20 13115-13117 er _ er CM[84]|CM[89] ANAPHORIC->84-29|ANAPHORIC->89-111 +98-21 13118-13119 . _ . _ _ + +#Text=_ " Wenn er es wünscht ; nicht früher " , sagte der Wächter , der Willem genannt worden war . +99-1 13120-13121 _ _ _ _ _ +99-2 13122-13123 " _ " _ _ +99-3 13124-13128 Wenn _ Wenn _ _ +99-4 13129-13131 er _ er CM[49] *->49-2 +99-5 13132-13134 es _ es _ _ +99-6 13135-13142 wünscht _ wünscht _ _ +99-7 13143-13144 ; _ \; _ _ +99-8 13145-13150 nicht _ nicht _ _ +99-9 13151-13157 früher _ früher _ _ +99-10 13158-13159 " _ " _ _ +99-11 13160-13161 , _ , _ _ +99-12 13162-13167 sagte _ sagte _ _ +99-13 13168-13171 der _ der CM[34] ANAPHORIC->34-8 +99-14 13172-13179 Wächter _ Wächter CM[34] ANAPHORIC->34-8 +99-15 13180-13181 , _ , _ _ +99-16 13182-13185 der _ der CM[34] ANAPHORIC->34-9 +99-17 13186-13192 Willem PER Willem _ _ +99-18 13193-13200 genannt _ genannt _ _ +99-19 13201-13207 worden _ worden _ _ +99-20 13208-13211 war _ war _ _ +99-21 13212-13213 . _ . _ _ + +#Text=_ " Und nun rate ich Ihnen " , fügte er hinzu , " in Ihr Zimmer zu gehen , sich ruhig zu verhalten und darauf zu warten , was über Sie verfügt werden wird . +100-1 13214-13215 _ _ _ _ _ +100-2 13216-13217 " _ " _ _ +100-3 13218-13221 Und _ Und _ _ +100-4 13222-13225 nun _ nun _ _ +100-5 13226-13230 rate _ rate _ _ +100-6 13231-13234 ich _ ich _ _ +100-7 13235-13240 Ihnen _ Ihnen _ _ +100-8 13241-13242 " _ " _ _ +100-9 13243-13244 , _ , _ _ +100-10 13245-13250 fügte _ fügte _ _ +100-11 13251-13253 er _ er CM[34] COREFERENTIAL->34-10 +100-12 13254-13259 hinzu _ hinzu _ _ +100-13 13260-13261 , _ , _ _ +100-14 13262-13263 " _ " _ _ +100-15 13264-13266 in _ in _ _ +100-16 13267-13270 Ihr _ Ihr _ _ +100-17 13271-13277 Zimmer _ Zimmer _ _ +100-18 13278-13280 zu _ zu _ _ +100-19 13281-13286 gehen _ gehen _ _ +100-20 13287-13288 , _ , _ _ +100-21 13289-13293 sich _ sich _ _ +100-22 13294-13299 ruhig _ ruhig _ _ +100-23 13300-13302 zu _ zu _ _ +100-24 13303-13312 verhalten _ verhalten _ _ +100-25 13313-13316 und _ und _ _ +100-26 13317-13323 darauf _ darauf _ _ +100-27 13324-13326 zu _ zu _ _ +100-28 13327-13333 warten _ warten _ _ +100-29 13334-13335 , _ , _ _ +100-30 13336-13339 was _ was _ _ +100-31 13340-13344 über _ über _ _ +100-32 13345-13348 Sie _ Sie _ _ +100-33 13349-13356 verfügt _ verfügt _ _ +100-34 13357-13363 werden _ werden _ _ +100-35 13364-13368 wird _ wird _ _ +100-36 13369-13370 . _ . _ _ + +#Text=_ Wir raten Ihnen , zerstreuen Sie sich nicht durch nutzlose Gedanken , sondern sammeln Sie sich , es werden große Anforderungen an Sie gestellt werden . +101-1 13371-13372 _ _ _ _ _ +101-2 13373-13376 Wir _ Wir _ _ +101-3 13377-13382 raten _ raten _ _ +101-4 13383-13388 Ihnen _ Ihnen _ _ +101-5 13389-13390 , _ , _ _ +101-6 13391-13401 zerstreuen _ zerstreuen _ _ +101-7 13402-13405 Sie _ Sie _ _ +101-8 13406-13410 sich _ sich _ _ +101-9 13411-13416 nicht _ nicht _ _ +101-10 13417-13422 durch _ durch _ _ +101-11 13423-13431 nutzlose _ nutzlose _ _ +101-12 13432-13440 Gedanken _ Gedanken _ _ +101-13 13441-13442 , _ , _ _ +101-14 13443-13450 sondern _ sondern _ _ +101-15 13451-13458 sammeln _ sammeln _ _ +101-16 13459-13462 Sie _ Sie _ _ +101-17 13463-13467 sich _ sich _ _ +101-18 13468-13469 , _ , _ _ +101-19 13470-13472 es _ es _ _ +101-20 13473-13479 werden _ werden _ _ +101-21 13480-13485 große _ große _ _ +101-22 13486-13499 Anforderungen _ Anforderungen _ _ +101-23 13500-13502 an _ an _ _ +101-24 13503-13506 Sie _ Sie _ _ +101-25 13507-13515 gestellt _ gestellt _ _ +101-26 13516-13522 werden _ werden _ _ +101-27 13523-13524 . _ . _ _ + +#Text=_ Sie haben uns nicht so behandelt , wie es unser Entgegenkommen verdient hätte , Sie haben vergessen , daß wir , mögen wir auch sein was immer _ _ , zumindest jetzt Ihnen gegenüber freie Männer sind , das ist kein kleines Übergewicht . +102-1 13525-13526 _ _ _ _ _ +102-2 13527-13530 Sie _ Sie _ _ +102-3 13531-13536 haben _ haben _ _ +102-4 13537-13540 uns _ uns _ _ +102-5 13541-13546 nicht _ nicht _ _ +102-6 13547-13549 so _ so _ _ +102-7 13550-13559 behandelt _ behandelt _ _ +102-8 13560-13561 , _ , _ _ +102-9 13562-13565 wie _ wie _ _ +102-10 13566-13568 es _ es _ _ +102-11 13569-13574 unser _ unser _ _ +102-12 13575-13589 Entgegenkommen _ Entgegenkommen _ _ +102-13 13590-13598 verdient _ verdient _ _ +102-14 13599-13604 hätte _ hätte _ _ +102-15 13605-13606 , _ , _ _ +102-16 13607-13610 Sie _ Sie _ _ +102-17 13611-13616 haben _ haben _ _ +102-18 13617-13626 vergessen _ vergessen _ _ +102-19 13627-13628 , _ , _ _ +102-20 13629-13632 daß _ dass _ _ +102-21 13633-13636 wir _ wir _ _ +102-22 13637-13638 , _ , _ _ +102-23 13639-13644 mögen _ mögen _ _ +102-24 13645-13648 wir _ wir _ _ +102-25 13649-13653 auch _ auch _ _ +102-26 13654-13658 sein _ sein _ _ +102-27 13659-13662 was _ was _ _ +102-28 13663-13668 immer _ immer _ _ +102-29 13669-13670 _ _ wir _ _ +102-30 13671-13672 _ _ sind _ _ +102-31 13673-13674 , _ , _ _ +102-32 13675-13684 zumindest _ zumindest _ _ +102-33 13685-13690 jetzt _ jetzt _ _ +102-34 13691-13696 Ihnen _ Ihnen _ _ +102-35 13697-13706 gegenüber _ gegenüber _ _ +102-36 13707-13712 freie _ freie _ _ +102-37 13713-13719 Männer _ Männer _ _ +102-38 13720-13724 sind _ sind _ _ +102-39 13725-13726 , _ , _ _ +102-40 13727-13730 das _ das _ _ +102-41 13731-13734 ist _ ist _ _ +102-42 13735-13739 kein _ kein _ _ +102-43 13740-13747 kleines _ kleines _ _ +102-44 13748-13759 Übergewicht _ Übergewicht _ _ +102-45 13760-13761 . _ . _ _ + +#Text=_ Trotzdem sind wir bereit , falls Sie Geld haben , Ihnen ein kleines Frühstück aus dem Kaffeehaus drüben zu bringen . " +103-1 13762-13763 _ _ _ _ _ +103-2 13764-13772 Trotzdem _ Trotzdem _ _ +103-3 13773-13777 sind _ sind _ _ +103-4 13778-13781 wir _ wir _ _ +103-5 13782-13788 bereit _ bereit _ _ +103-6 13789-13790 , _ , _ _ +103-7 13791-13796 falls _ falls _ _ +103-8 13797-13800 Sie _ Sie _ _ +103-9 13801-13805 Geld _ Geld _ _ +103-10 13806-13811 haben _ haben _ _ +103-11 13812-13813 , _ , _ _ +103-12 13814-13819 Ihnen _ Ihnen _ _ +103-13 13820-13823 ein _ ein _ _ +103-14 13824-13831 kleines _ kleines _ _ +103-15 13832-13841 Frühstück _ Frühstück _ _ +103-16 13842-13845 aus _ aus CM[86] COREFERENTIAL->86-1 +103-17 13846-13849 dem _ dem CM[86] COREFERENTIAL->86-1 +103-18 13850-13860 Kaffeehaus _ Kaffeehaus CM[86] COREFERENTIAL->86-1 +103-19 13861-13867 drüben _ drüben CM[86] COREFERENTIAL->86-1 +103-20 13868-13870 zu _ zu _ _ +103-21 13871-13878 bringen _ bringen _ _ +103-22 13879-13880 . _ . _ _ +103-23 13881-13882 " _ " _ _ + +#Text=_ Ohne auf dieses Angebot zu antworten , stand K. ein Weilchen lang still . +104-1 13883-13884 _ _ _ _ _ +104-2 13885-13889 Ohne _ Ohne _ _ +104-3 13890-13893 auf _ auf _ _ +104-4 13894-13900 dieses _ dieses _ _ +104-5 13901-13908 Angebot _ Angebot _ _ +104-6 13909-13911 zu _ zu _ _ +104-7 13912-13921 antworten _ antworten _ _ +104-8 13922-13923 , _ , _ _ +104-9 13924-13929 stand _ stand _ _ +104-10 13930-13932 K. PER K. CM[84]|CM[89] ANAPHORIC->84-30|ANAPHORIC->89-112 +104-11 13933-13936 ein _ ein _ _ +104-12 13937-13945 Weilchen _ Weilchen _ _ +104-13 13946-13950 lang _ lang _ _ +104-14 13951-13956 still _ still _ _ +104-15 13957-13958 . _ . _ _ + +#Text=_ Vielleicht würden ihn die beiden , wenn er die Tür des folgenden Zimmers oder gar die Tür des Vorzimmers öffnete , gar nicht zu hindern wagen , vielleicht wäre es die einfachste Lösung des Ganzen , daß er es auf die Spitze trieb . +105-1 13959-13960 _ _ _ _ _ +105-2 13961-13971 Vielleicht _ Vielleicht _ _ +105-3 13972-13978 würden _ würden _ _ +105-4 13979-13982 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-31|ANAPHORIC->89-113 +105-5 13983-13986 die _ die CM[24] ANAPHORIC->24-18 +105-6 13987-13993 beiden _ beiden CM[24] ANAPHORIC->24-18 +105-7 13994-13995 , _ , _ _ +105-8 13996-14000 wenn _ wenn _ _ +105-9 14001-14003 er _ er CM[84]|CM[89] ANAPHORIC->84-32|ANAPHORIC->89-114 +105-10 14004-14007 die _ die _ _ +105-11 14008-14011 Tür _ Tür _ _ +105-12 14012-14015 des _ des _ _ +105-13 14016-14025 folgenden _ folgenden _ _ +105-14 14026-14033 Zimmers _ Zimmers _ _ +105-15 14034-14038 oder _ oder _ _ +105-16 14039-14042 gar _ gar _ _ +105-17 14043-14046 die _ die _ _ +105-18 14047-14050 Tür _ Tür _ _ +105-19 14051-14054 des _ des _ _ +105-20 14055-14065 Vorzimmers _ Vorzimmers _ _ +105-21 14066-14073 öffnete _ öffnete _ _ +105-22 14074-14075 , _ , _ _ +105-23 14076-14079 gar _ gar _ _ +105-24 14080-14085 nicht _ nicht _ _ +105-25 14086-14088 zu _ zu _ _ +105-26 14089-14096 hindern _ hindern _ _ +105-27 14097-14102 wagen _ wagen _ _ +105-28 14103-14104 , _ , _ _ +105-29 14105-14115 vielleicht _ vielleicht _ _ +105-30 14116-14120 wäre _ wäre _ _ +105-31 14121-14123 es _ es _ _ +105-32 14124-14127 die _ die _ _ +105-33 14128-14138 einfachste _ einfachste _ _ +105-34 14139-14145 Lösung _ Lösung _ _ +105-35 14146-14149 des _ des _ _ +105-36 14150-14156 Ganzen _ Ganzen _ _ +105-37 14157-14158 , _ , _ _ +105-38 14159-14162 daß _ dass _ _ +105-39 14163-14165 er _ er CM[84]|CM[89] ANAPHORIC->84-33|ANAPHORIC->89-115 +105-40 14166-14168 es _ es _ _ +105-41 14169-14172 auf _ auf _ _ +105-42 14173-14176 die _ die _ _ +105-43 14177-14183 Spitze _ Spitze _ _ +105-44 14184-14189 trieb _ trieb _ _ +105-45 14190-14191 . _ . _ _ + +#Text=_ Aber vielleicht würden sie ihn doch packen und , war er einmal niedergeworfen , so war auch alle Überlegenheit verloren , die er jetzt ihnen gegenüber in gewisser Hinsicht doch wahrte . +106-1 14192-14193 _ _ _ _ _ +106-2 14194-14198 Aber _ Aber _ _ +106-3 14199-14209 vielleicht _ vielleicht _ _ +106-4 14210-14216 würden _ würden _ _ +106-5 14217-14220 sie _ sie CM[24] ANAPHORIC->24-19 +106-6 14221-14224 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-34|ANAPHORIC->89-116 +106-7 14225-14229 doch _ doch _ _ +106-8 14230-14236 packen _ packen _ _ +106-9 14237-14240 und _ und _ _ +106-10 14241-14242 , _ , _ _ +106-11 14243-14246 war _ war _ _ +106-12 14247-14249 er _ er CM[84]|CM[89] ANAPHORIC->84-35|ANAPHORIC->89-117 +106-13 14250-14256 einmal _ einmal _ _ +106-14 14257-14271 niedergeworfen _ niedergeworfen _ _ +106-15 14272-14273 , _ , _ _ +106-16 14274-14276 so _ so _ _ +106-17 14277-14280 war _ war _ _ +106-18 14281-14285 auch _ auch _ _ +106-19 14286-14290 alle _ alle CM[50] ANAPHORIC->50-1 +106-20 14291-14304 Überlegenheit _ Überlegenheit CM[50] ANAPHORIC->50-1 +106-21 14305-14313 verloren _ verloren _ _ +106-22 14314-14315 , _ , _ _ +106-23 14316-14319 die _ die CM[50] *->50-2 +106-24 14320-14322 er _ er CM[84]|CM[89] ANAPHORIC->84-36|ANAPHORIC->89-118 +106-25 14323-14328 jetzt _ jetzt _ _ +106-26 14329-14334 ihnen _ ihnen CM[24] COREFERENTIAL->24-20 +106-27 14335-14344 gegenüber _ gegenüber _ _ +106-28 14345-14347 in _ in _ _ +106-29 14348-14356 gewisser _ gewisser _ _ +106-30 14357-14365 Hinsicht _ Hinsicht _ _ +106-31 14366-14370 doch _ doch _ _ +106-32 14371-14377 wahrte _ wahrte _ _ +106-33 14378-14379 . _ . _ _ + +#Text=_ Deshalb zog er die Sicherheit der Lösung vor , wie sie der natürliche Verlauf bringen mußte , und _ ging in sein Zimmer zurück , ohne daß von seiner Seite oder von Seite der Wächter ein weiteres Wort gefallen wäre . +107-1 14380-14381 _ _ _ _ _ +107-2 14382-14389 Deshalb _ Deshalb _ _ +107-3 14390-14393 zog _ zog _ _ +107-4 14394-14396 er _ er CM[84]|CM[89] ANAPHORIC->84-37|ANAPHORIC->89-119 +107-5 14397-14400 die _ die CM[51] ANAPHORIC->51-1 +107-6 14401-14411 Sicherheit _ Sicherheit CM[51] ANAPHORIC->51-1 +107-7 14412-14415 der _ der CM[51] ANAPHORIC->51-1 +107-8 14416-14422 Lösung _ Lösung CM[51] ANAPHORIC->51-1 +107-9 14423-14426 vor _ vor _ _ +107-10 14427-14428 , _ , _ _ +107-11 14429-14432 wie _ wie _ _ +107-12 14433-14436 sie _ sie CM[51] *->51-2 +107-13 14437-14440 der _ der _ _ +107-14 14441-14451 natürliche _ natürliche _ _ +107-15 14452-14459 Verlauf _ Verlauf _ _ +107-16 14460-14467 bringen _ bringen _ _ +107-17 14468-14473 mußte _ musste _ _ +107-18 14474-14475 , _ , _ _ +107-19 14476-14479 und _ und _ _ +107-20 14480-14481 _ _ er _ _ +107-21 14482-14486 ging _ ging _ _ +107-22 14487-14489 in _ in _ _ +107-23 14490-14494 sein _ sein CM[84]|CM[89] ANAPHORIC->84-38|ANAPHORIC->89-120 +107-24 14495-14501 Zimmer _ Zimmer _ _ +107-25 14502-14508 zurück _ zurück _ _ +107-26 14509-14510 , _ , _ _ +107-27 14511-14515 ohne _ ohne _ _ +107-28 14516-14519 daß _ dass _ _ +107-29 14520-14523 von _ von _ _ +107-30 14524-14530 seiner _ seiner CM[84]|CM[89] ANAPHORIC->84-39|ANAPHORIC->89-121 +107-31 14531-14536 Seite _ Seite _ _ +107-32 14537-14541 oder _ oder _ _ +107-33 14542-14545 von _ von _ _ +107-34 14546-14551 Seite _ Seite _ _ +107-35 14552-14555 der _ der CM[24] COREFERENTIAL->24-21 +107-36 14556-14563 Wächter _ Wächter CM[24] COREFERENTIAL->24-21 +107-37 14564-14567 ein _ ein _ _ +107-38 14568-14576 weiteres _ weiteres _ _ +107-39 14577-14581 Wort _ Wort _ _ +107-40 14582-14590 gefallen _ gefallen _ _ +107-41 14591-14595 wäre _ wäre _ _ +107-42 14596-14597 . _ . _ _ + +#Text=_ Er warf sich auf sein Bett und _ nahm vom Waschtisch einen schönen Apfel , den er sich gestern abend für das Frühstück vorbereitet hatte . +108-1 14598-14599 _ _ _ _ _ +108-2 14600-14602 Er _ Er CM[84]|CM[89] ANAPHORIC->84-40|ANAPHORIC->89-122 +108-3 14603-14607 warf _ warf _ _ +108-4 14608-14612 sich _ sich CM[84]|CM[89] ANAPHORIC->84-41|ANAPHORIC->89-123 +108-5 14613-14616 auf _ auf _ _ +108-6 14617-14621 sein _ sein CM[84]|CM[89] ANAPHORIC->84-42|ANAPHORIC->89-124 +108-7 14622-14626 Bett _ Bett _ _ +108-8 14627-14630 und _ und _ _ +108-9 14631-14632 _ _ er _ _ +108-10 14633-14637 nahm _ nahm _ _ +108-11 14638-14641 vom _ vom _ _ +108-12 14642-14652 Waschtisch _ Waschtisch _ _ +108-13 14653-14658 einen _ einen CM[52] ANAPHORIC->52-1 +108-14 14659-14666 schönen _ schönen CM[52] ANAPHORIC->52-1 +108-15 14667-14672 Apfel _ Apfel CM[52] ANAPHORIC->52-1 +108-16 14673-14674 , _ , _ _ +108-17 14675-14678 den _ den CM[52] *->52-2 +108-18 14679-14681 er _ er CM[84]|CM[89] ANAPHORIC->84-43|ANAPHORIC->89-125 +108-19 14682-14686 sich _ sich CM[84]|CM[89] ANAPHORIC->84-44|ANAPHORIC->89-126 +108-20 14687-14694 gestern _ gestern _ _ +108-21 14695-14700 abend _ Abend _ _ +108-22 14701-14704 für _ für _ _ +108-23 14705-14708 das _ das _ _ +108-24 14709-14718 Frühstück _ Frühstück _ _ +108-25 14719-14730 vorbereitet _ vorbereitet _ _ +108-26 14731-14736 hatte _ hatte _ _ +108-27 14737-14738 . _ . _ _ + +#Text=_ Jetzt war er sein einziges Frühstück und jedenfalls , wie er sich beim ersten großen Bissen versicherte , viel besser , als das Frühstück aus dem schmutzigen Nachtcafé gewesen wäre , das er durch die Gnade der Wächter hätte bekommen können . +109-1 14739-14740 _ _ _ _ _ +109-2 14741-14746 Jetzt _ Jetzt _ _ +109-3 14747-14750 war _ war _ _ +109-4 14751-14753 er _ er CM[84]|CM[89] ANAPHORIC->84-45|ANAPHORIC->89-127 +109-5 14754-14758 sein _ sein _ _ +109-6 14759-14767 einziges _ einziges _ _ +109-7 14768-14777 Frühstück _ Frühstück _ _ +109-8 14778-14781 und _ und _ _ +109-9 14782-14792 jedenfalls _ jedenfalls _ _ +109-10 14793-14794 , _ , _ _ +109-11 14795-14798 wie _ wie _ _ +109-12 14799-14801 er _ er CM[84]|CM[89] ANAPHORIC->84-46|ANAPHORIC->89-128 +109-13 14802-14806 sich _ sich _ _ +109-14 14807-14811 beim _ beim _ _ +109-15 14812-14818 ersten _ ersten _ _ +109-16 14819-14825 großen _ großen _ _ +109-17 14826-14832 Bissen _ Bissen _ _ +109-18 14833-14844 versicherte _ versicherte _ _ +109-19 14845-14846 , _ , _ _ +109-20 14847-14851 viel _ viel _ _ +109-21 14852-14858 besser _ besser _ _ +109-22 14859-14860 , _ , _ _ +109-23 14861-14864 als _ als _ _ +109-24 14865-14868 das _ das CM[53] ANAPHORIC->53-1 +109-25 14869-14878 Frühstück _ Frühstück CM[53] ANAPHORIC->53-1 +109-26 14879-14882 aus _ aus CM[53] ANAPHORIC->53-1 +109-27 14883-14886 dem _ dem CM[53]|CM[86] ANAPHORIC->53-1|*->86-2 +109-28 14887-14898 schmutzigen _ schmutzigen CM[53]|CM[86] ANAPHORIC->53-1|*->86-2 +109-29 14899-14908 Nachtcafé _ Nachtcafé CM[53]|CM[86] ANAPHORIC->53-1|*->86-2 +109-30 14909-14916 gewesen _ gewesen _ _ +109-31 14917-14921 wäre _ wäre _ _ +109-32 14922-14923 , _ , _ _ +109-33 14924-14927 das _ das CM[53] *->53-2 +109-34 14928-14930 er _ er CM[84]|CM[89] ANAPHORIC->84-47|ANAPHORIC->89-129 +109-35 14931-14936 durch _ durch _ _ +109-36 14937-14940 die _ die _ _ +109-37 14941-14946 Gnade _ Gnade _ _ +109-38 14947-14950 der _ der CM[24] COREFERENTIAL->24-22 +109-39 14951-14958 Wächter _ Wächter CM[24] COREFERENTIAL->24-22 +109-40 14959-14964 hätte _ hätte _ _ +109-41 14965-14973 bekommen _ bekommen _ _ +109-42 14974-14980 können _ können _ _ +109-43 14981-14982 . _ . _ _ + +#Text=_ Er fühlte sich wohl und zuversichtlich , in der Bank versäumte er zwar heute vormittag seinen Dienst , aber das war bei der verhältnismäßig hohen Stellung , die er dort einnahm , leicht entschuldigt . +110-1 14983-14984 _ _ _ _ _ +110-2 14985-14987 Er _ Er CM[84]|CM[89] ANAPHORIC->84-48|ANAPHORIC->89-130 +110-3 14988-14994 fühlte _ fühlte _ _ +110-4 14995-14999 sich _ sich _ _ +110-5 15000-15004 wohl _ wohl _ _ +110-6 15005-15008 und _ und _ _ +110-7 15009-15023 zuversichtlich _ zuversichtlich _ _ +110-8 15024-15025 , _ , _ _ +110-9 15026-15028 in _ in _ _ +110-10 15029-15032 der _ der _ _ +110-11 15033-15037 Bank _ Bank _ _ +110-12 15038-15047 versäumte _ versäumte _ _ +110-13 15048-15050 er _ er CM[84]|CM[89] ANAPHORIC->84-49|ANAPHORIC->89-131 +110-14 15051-15055 zwar _ zwar _ _ +110-15 15056-15061 heute _ heute _ _ +110-16 15062-15071 vormittag _ Vormittag _ _ +110-17 15072-15078 seinen _ seinen _ _ +110-18 15079-15085 Dienst _ Dienst _ _ +110-19 15086-15087 , _ , _ _ +110-20 15088-15092 aber _ aber _ _ +110-21 15093-15096 das _ das _ _ +110-22 15097-15100 war _ war _ _ +110-23 15101-15104 bei _ bei _ _ +110-24 15105-15108 der _ der CM[54] ANAPHORIC->54-1 +110-25 15109-15124 verhältnismäßig _ verhältnismäßig CM[54] ANAPHORIC->54-1 +110-26 15125-15130 hohen _ hohen CM[54] ANAPHORIC->54-1 +110-27 15131-15139 Stellung _ Stellung CM[54] ANAPHORIC->54-1 +110-28 15140-15141 , _ , _ _ +110-29 15142-15145 die _ die CM[54] *->54-2 +110-30 15146-15148 er _ er CM[84]|CM[89] ANAPHORIC->84-50|ANAPHORIC->89-132 +110-31 15149-15153 dort _ dort _ _ +110-32 15154-15161 einnahm _ einnahm _ _ +110-33 15162-15163 , _ , _ _ +110-34 15164-15170 leicht _ leicht _ _ +110-35 15171-15183 entschuldigt _ entschuldigt _ _ +110-36 15184-15185 . _ . _ _ + +#Text=_ Sollte er die wirkliche Entschuldigung anführen ? +111-1 15186-15187 _ _ _ _ _ +111-2 15188-15194 Sollte _ Sollte _ _ +111-3 15195-15197 er _ er CM[84]|CM[89] ANAPHORIC->84-51|ANAPHORIC->89-133 +111-4 15198-15201 die _ die _ _ +111-5 15202-15211 wirkliche _ wirkliche _ _ +111-6 15212-15226 Entschuldigung _ Entschuldigung _ _ +111-7 15227-15235 anführen _ anführen _ _ +111-8 15236-15237 ? _ ? _ _ + +#Text=_ Er gedachte _ es zu tun , Würde man ihm nicht glauben , was in diesem Fall begreiflich war , so konnte er Frau Grubach als Zeugin führen oder auch die beiden Alten von drüben , die wohl jetzt auf dem Marsch zum gegenüberliegenden Fenster waren . +112-1 15238-15239 _ _ _ _ _ +112-2 15240-15242 Er _ Er CM[84]|CM[89] ANAPHORIC->84-52|ANAPHORIC->89-134 +112-3 15243-15251 gedachte _ gedachte _ _ +112-4 15252-15253 _ _ , _ _ +112-5 15254-15256 es _ es _ _ +112-6 15257-15259 zu _ zu _ _ +112-7 15260-15263 tun _ tun _ _ +112-8 15264-15265 , _ , _ _ +112-9 15266-15271 Würde _ würde _ _ +112-10 15272-15275 man _ man _ _ +112-11 15276-15279 ihm _ ihm CM[84]|CM[89] ANAPHORIC->84-53|ANAPHORIC->89-135 +112-12 15280-15285 nicht _ nicht _ _ +112-13 15286-15293 glauben _ glauben _ _ +112-14 15294-15295 , _ , _ _ +112-15 15296-15299 was _ was _ _ +112-16 15300-15302 in _ in _ _ +112-17 15303-15309 diesem _ diesem _ _ +112-18 15310-15314 Fall _ Fall _ _ +112-19 15315-15326 begreiflich _ begreiflich _ _ +112-20 15327-15330 war _ war _ _ +112-21 15331-15332 , _ , _ _ +112-22 15333-15335 so _ so _ _ +112-23 15336-15342 konnte _ konnte _ _ +112-24 15343-15345 er _ er CM[84]|CM[89] ANAPHORIC->84-54|ANAPHORIC->89-136 +112-25 15346-15350 Frau _ Frau CM[1] *->1-12 +112-26 15351-15358 Grubach PER Grubach CM[1] *->1-12 +112-27 15359-15362 als _ als _ _ +112-28 15363-15369 Zeugin _ Zeugin _ _ +112-29 15370-15376 führen _ führen _ _ +112-30 15377-15381 oder _ oder _ _ +112-31 15382-15386 auch _ auch _ _ +112-32 15387-15390 die _ die CM[55] ANAPHORIC->55-1 +112-33 15391-15397 beiden _ beiden CM[55] ANAPHORIC->55-1 +112-34 15398-15403 Alten _ Alten CM[55] ANAPHORIC->55-1 +112-35 15404-15407 von _ von CM[55] ANAPHORIC->55-1 +112-36 15408-15414 drüben _ drüben CM[55] ANAPHORIC->55-1 +112-37 15415-15416 , _ , _ _ +112-38 15417-15420 die _ die CM[55] *->55-2 +112-39 15421-15425 wohl _ wohl _ _ +112-40 15426-15431 jetzt _ jetzt _ _ +112-41 15432-15435 auf _ auf _ _ +112-42 15436-15439 dem _ dem _ _ +112-43 15440-15446 Marsch _ Marsch _ _ +112-44 15447-15450 zum _ zum _ _ +112-45 15451-15469 gegenüberliegenden _ gegenüberliegenden _ _ +112-46 15470-15477 Fenster _ Fenster _ _ +112-47 15478-15483 waren _ waren _ _ +112-48 15484-15485 . _ . _ _ + +#Text=_ Es wunderte K. , wenigstens aus dem Gedankengang der Wächter wunderte es ihn , daß sie ihn in das Zimmer getrieben und _ ihn hier allein gelassen hatten , wo er doch zehnfache Möglichkeit hatte , sich umzubringen . +113-1 15486-15487 _ _ _ _ _ +113-2 15488-15490 Es _ Es _ _ +113-3 15491-15499 wunderte _ wunderte _ _ +113-4 15500-15502 K. PER K. CM[84]|CM[89] ANAPHORIC->84-55|ANAPHORIC->89-137 +113-5 15503-15504 , _ , _ _ +113-6 15505-15515 wenigstens _ wenigstens _ _ +113-7 15516-15519 aus _ aus _ _ +113-8 15520-15523 dem _ dem _ _ +113-9 15524-15536 Gedankengang _ Gedankengang _ _ +113-10 15537-15540 der _ der CM[24] ANAPHORIC->24-23 +113-11 15541-15548 Wächter _ Wächter CM[24] ANAPHORIC->24-23 +113-12 15549-15557 wunderte _ wunderte _ _ +113-13 15558-15560 es _ es _ _ +113-14 15561-15564 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-56|ANAPHORIC->89-138 +113-15 15565-15566 , _ , _ _ +113-16 15567-15570 daß _ dass _ _ +113-17 15571-15574 sie _ sie CM[24] ANAPHORIC->24-24 +113-18 15575-15578 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-57|ANAPHORIC->89-139 +113-19 15579-15581 in _ in _ _ +113-20 15582-15585 das _ das CM[71] COREFERENTIAL->71-4 +113-21 15586-15592 Zimmer _ Zimmer CM[71] COREFERENTIAL->71-4 +113-22 15593-15602 getrieben _ getrieben _ _ +113-23 15603-15606 und _ und _ _ +113-24 15607-15608 _ _ sie _ _ +113-25 15609-15612 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-58|ANAPHORIC->89-140 +113-26 15613-15617 hier _ hier _ _ +113-27 15618-15624 allein _ allein _ _ +113-28 15625-15633 gelassen _ gelassen _ _ +113-29 15634-15640 hatten _ hatten _ _ +113-30 15641-15642 , _ , _ _ +113-31 15643-15645 wo _ wo _ _ +113-32 15646-15648 er _ er CM[84]|CM[89] ANAPHORIC->84-59|ANAPHORIC->89-141 +113-33 15649-15653 doch _ doch _ _ +113-34 15654-15663 zehnfache _ zehnfache _ _ +113-35 15664-15675 Möglichkeit _ Möglichkeit _ _ +113-36 15676-15681 hatte _ hatte _ _ +113-37 15682-15683 , _ , _ _ +113-38 15684-15688 sich _ sich CM[84]|CM[89] ANAPHORIC->84-60|ANAPHORIC->89-142 +113-39 15689-15700 umzubringen _ umzubringen _ _ +113-40 15701-15702 . _ . _ _ + +#Text=_ Gleichzeitig allerdings fragte er sich , diesmal aus seinem Gedankengang , was für einen Grund er haben könnte , es zu tun . +114-1 15703-15704 _ _ _ _ _ +114-2 15705-15717 Gleichzeitig _ Gleichzeitig _ _ +114-3 15718-15728 allerdings _ allerdings _ _ +114-4 15729-15735 fragte _ fragte _ _ +114-5 15736-15738 er _ er CM[84]|CM[89] ANAPHORIC->84-61|ANAPHORIC->89-143 +114-6 15739-15743 sich _ sich CM[84]|CM[89] ANAPHORIC->84-62|ANAPHORIC->89-144 +114-7 15744-15745 , _ , _ _ +114-8 15746-15753 diesmal _ diesmal _ _ +114-9 15754-15757 aus _ aus _ _ +114-10 15758-15764 seinem _ seinem CM[84]|CM[89] ANAPHORIC->84-63|ANAPHORIC->89-145 +114-11 15765-15777 Gedankengang _ Gedankengang _ _ +114-12 15778-15779 , _ , _ _ +114-13 15780-15783 was _ was _ _ +114-14 15784-15787 für _ für _ _ +114-15 15788-15793 einen _ einen _ _ +114-16 15794-15799 Grund _ Grund _ _ +114-17 15800-15802 er _ er CM[84]|CM[89] ANAPHORIC->84-64|ANAPHORIC->89-146 +114-18 15803-15808 haben _ haben _ _ +114-19 15809-15815 könnte _ könnte _ _ +114-20 15816-15817 , _ , _ _ +114-21 15818-15820 es _ es _ _ +114-22 15821-15823 zu _ zu _ _ +114-23 15824-15827 tun _ tun _ _ +114-24 15828-15829 . _ . _ _ + +#Text=_ Etwa weil die zwei nebenan saßen und _ sein Frühstück abgefangen hatten ? +115-1 15830-15831 _ _ _ _ _ +115-2 15832-15836 Etwa _ Etwa _ _ +115-3 15837-15841 weil _ weil _ _ +115-4 15842-15845 die _ die CM[24] ANAPHORIC->24-25 +115-5 15846-15850 zwei _ zwei CM[24] ANAPHORIC->24-25 +115-6 15851-15858 nebenan _ nebenan _ _ +115-7 15859-15864 saßen _ saßen _ _ +115-8 15865-15868 und _ und _ _ +115-9 15869-15870 _ _ sie _ _ +115-10 15871-15875 sein _ sein CM[84]|CM[87]|CM[89] ANAPHORIC->84-65|COREFERENTIAL->87-2|ANAPHORIC->89-147 +115-11 15876-15885 Frühstück _ Frühstück CM[87] COREFERENTIAL->87-2 +115-12 15886-15896 abgefangen _ abgefangen _ _ +115-13 15897-15903 hatten _ hatten _ _ +115-14 15904-15905 ? _ ? _ _ + +#Text=_ Es wäre so sinnlos gewesen , sich umzubringen , daß er , selbst wenn er es hätte tun wollen , infolge der Sinnlosigkeit dazu nicht im| stande gewesen wäre . +116-1 15906-15907 _ _ _ _ _ +116-2 15908-15910 Es _ Es _ _ +116-3 15911-15915 wäre _ wäre _ _ +116-4 15916-15918 so _ so _ _ +116-5 15919-15926 sinnlos _ sinnlos _ _ +116-6 15927-15934 gewesen _ gewesen _ _ +116-7 15935-15936 , _ , _ _ +116-8 15937-15941 sich _ sich _ _ +116-9 15942-15953 umzubringen _ umzubringen _ _ +116-10 15954-15955 , _ , _ _ +116-11 15956-15959 daß _ dass _ _ +116-12 15960-15962 er _ er CM[84]|CM[89] ANAPHORIC->84-66|ANAPHORIC->89-148 +116-13 15963-15964 , _ , _ _ +116-14 15965-15971 selbst _ selbst _ _ +116-15 15972-15976 wenn _ wenn _ _ +116-16 15977-15979 er _ er CM[84]|CM[89] ANAPHORIC->84-67|ANAPHORIC->89-149 +116-17 15980-15982 es _ es _ _ +116-18 15983-15988 hätte _ hätte _ _ +116-19 15989-15992 tun _ tun _ _ +116-20 15993-15999 wollen _ wollen _ _ +116-21 16000-16001 , _ , _ _ +116-22 16002-16009 infolge _ infolge _ _ +116-23 16010-16013 der _ der _ _ +116-24 16014-16027 Sinnlosigkeit _ Sinnlosigkeit _ _ +116-25 16028-16032 dazu _ dazu _ _ +116-26 16033-16038 nicht _ nicht _ _ +116-27 16039-16042 im| _ im _ _ +116-28 16043-16049 stande _ Stande _ _ +116-29 16050-16057 gewesen _ gewesen _ _ +116-30 16058-16062 wäre _ wäre _ _ +116-31 16063-16064 . _ . _ _ + +#Text=_ Wäre die geistige Beschränktheit der Wächter nicht so auffallend gewesen , so hätte man annehmen können , daß auch sie , infolge der gleichen Überzeugung , keine Gefahr darin gesehen hätten , ihn allein zu lassen . +117-1 16065-16066 _ _ _ _ _ +117-2 16067-16071 Wäre _ Wäre _ _ +117-3 16072-16075 die _ die _ _ +117-4 16076-16084 geistige _ geistige _ _ +117-5 16085-16099 Beschränktheit _ Beschränktheit _ _ +117-6 16100-16103 der _ der CM[24] ANAPHORIC->24-26 +117-7 16104-16111 Wächter _ Wächter CM[24] ANAPHORIC->24-26 +117-8 16112-16117 nicht _ nicht _ _ +117-9 16118-16120 so _ so _ _ +117-10 16121-16131 auffallend _ auffallend _ _ +117-11 16132-16139 gewesen _ gewesen _ _ +117-12 16140-16141 , _ , _ _ +117-13 16142-16144 so _ so _ _ +117-14 16145-16150 hätte _ hätte _ _ +117-15 16151-16154 man _ man _ _ +117-16 16155-16163 annehmen _ annehmen _ _ +117-17 16164-16170 können _ können _ _ +117-18 16171-16172 , _ , _ _ +117-19 16173-16176 daß _ dass _ _ +117-20 16177-16181 auch _ auch _ _ +117-21 16182-16185 sie _ sie CM[24] ANAPHORIC->24-27 +117-22 16186-16187 , _ , _ _ +117-23 16188-16195 infolge _ infolge _ _ +117-24 16196-16199 der _ der _ _ +117-25 16200-16208 gleichen _ gleichen _ _ +117-26 16209-16220 Überzeugung _ Überzeugung _ _ +117-27 16221-16222 , _ , _ _ +117-28 16223-16228 keine _ keine _ _ +117-29 16229-16235 Gefahr _ Gefahr _ _ +117-30 16236-16241 darin _ darin _ _ +117-31 16242-16249 gesehen _ gesehen _ _ +117-32 16250-16256 hätten _ hätten _ _ +117-33 16257-16258 , _ , _ _ +117-34 16259-16262 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-68|ANAPHORIC->89-150 +117-35 16263-16269 allein _ allein\| _ _ +117-36 16270-16272 zu _ zu\| _ _ +117-37 16273-16279 lassen _ lassen _ _ +117-38 16280-16281 . _ . _ _ + +#Text=_ Sie mochten jetzt , wenn sie wollten , zusehen , wie er zu einem Wandschränkchen ging , in dem er einen guten Schnaps aufbewahrte , wie er ein Gläschen zuerst zum Ersatz des Frühstücks leerte und wie er ein zweites Gläschen dazu bestimmte , sich Mut zu machen , das letztere nur aus Vorsicht für den unwahrscheinlichen Fall , daß es nötig sein sollte . +118-1 16282-16283 _ _ _ _ _ +118-2 16284-16287 Sie _ Sie CM[24] ANAPHORIC->24-28 +118-3 16288-16295 mochten _ mochten _ _ +118-4 16296-16301 jetzt _ jetzt _ _ +118-5 16302-16303 , _ , _ _ +118-6 16304-16308 wenn _ wenn _ _ +118-7 16309-16312 sie _ sie CM[24] COREFERENTIAL->24-29 +118-8 16313-16320 wollten _ wollten _ _ +118-9 16321-16322 , _ , _ _ +118-10 16323-16330 zusehen _ zusehen _ _ +118-11 16331-16332 , _ , _ _ +118-12 16333-16336 wie _ wie _ _ +118-13 16337-16339 er _ er CM[84]|CM[89] ANAPHORIC->84-69|ANAPHORIC->89-151 +118-14 16340-16342 zu _ zu _ _ +118-15 16343-16348 einem _ einem CM[56] ANAPHORIC->56-1 +118-16 16349-16364 Wandschränkchen _ Wandschränkchen CM[56] ANAPHORIC->56-1 +118-17 16365-16369 ging _ ging _ _ +118-18 16370-16371 , _ , _ _ +118-19 16372-16374 in _ in _ _ +118-20 16375-16378 dem _ dem CM[56] COREFERENTIAL->56-2 +118-21 16379-16381 er _ er CM[84]|CM[89] ANAPHORIC->84-70|ANAPHORIC->89-152 +118-22 16382-16387 einen _ einen _ _ +118-23 16388-16393 guten _ guten _ _ +118-24 16394-16401 Schnaps _ Schnaps _ _ +118-25 16402-16413 aufbewahrte _ aufbewahrte _ _ +118-26 16414-16415 , _ , _ _ +118-27 16416-16419 wie _ wie _ _ +118-28 16420-16422 er _ er CM[84]|CM[89] ANAPHORIC->84-71|ANAPHORIC->89-153 +118-29 16423-16426 ein _ ein _ _ +118-30 16427-16435 Gläschen _ Gläschen _ _ +118-31 16436-16442 zuerst _ zuerst _ _ +118-32 16443-16446 zum _ zum _ _ +118-33 16447-16453 Ersatz _ Ersatz _ _ +118-34 16454-16457 des _ des CM[87] *->87-3 +118-35 16458-16468 Frühstücks _ Frühstücks CM[87] *->87-3 +118-36 16469-16475 leerte _ leerte _ _ +118-37 16476-16479 und _ und _ _ +118-38 16480-16483 wie _ wie _ _ +118-39 16484-16486 er _ er CM[84]|CM[89] ANAPHORIC->84-72|ANAPHORIC->89-154 +118-40 16487-16490 ein _ ein _ _ +118-41 16491-16498 zweites _ zweites _ _ +118-42 16499-16507 Gläschen _ Gläschen _ _ +118-43 16508-16512 dazu _ dazu _ _ +118-44 16513-16522 bestimmte _ bestimmte _ _ +118-45 16523-16524 , _ , _ _ +118-46 16525-16529 sich _ sich CM[84]|CM[89] ANAPHORIC->84-73|ANAPHORIC->89-155 +118-47 16530-16533 Mut _ Mut _ _ +118-48 16534-16536 zu _ zu _ _ +118-49 16537-16543 machen _ machen _ _ +118-50 16544-16545 , _ , _ _ +118-51 16546-16549 das _ das _ _ +118-52 16550-16558 letztere _ letztere _ _ +118-53 16559-16562 nur _ nur _ _ +118-54 16563-16566 aus _ aus _ _ +118-55 16567-16575 Vorsicht _ Vorsicht _ _ +118-56 16576-16579 für _ für _ _ +118-57 16580-16583 den _ den _ _ +118-58 16584-16602 unwahrscheinlichen _ unwahrscheinlichen _ _ +118-59 16603-16607 Fall _ Fall _ _ +118-60 16608-16609 , _ , _ _ +118-61 16610-16613 daß _ dass _ _ +118-62 16614-16616 es _ es _ _ +118-63 16617-16622 nötig _ nötig _ _ +118-64 16623-16627 sein _ sein _ _ +118-65 16628-16634 sollte _ sollte _ _ +118-66 16635-16636 . _ . _ _ + +#Text=_ Da erschreckte ihn ein Zuruf aus dem Nebenzimmer derartig , daß er mit den Zähnen ans Glas schlug . +119-1 16637-16638 _ _ _ _ _ +119-2 16639-16641 Da _ Da _ _ +119-3 16642-16653 erschreckte _ erschreckte _ _ +119-4 16654-16657 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-74|ANAPHORIC->89-156 +119-5 16658-16661 ein _ ein _ _ +119-6 16662-16667 Zuruf _ Zuruf _ _ +119-7 16668-16671 aus _ aus _ _ +119-8 16672-16675 dem _ dem _ _ +119-9 16676-16687 Nebenzimmer _ Nebenzimmer _ _ +119-10 16688-16696 derartig _ derartig _ _ +119-11 16697-16698 , _ , _ _ +119-12 16699-16702 daß _ dass _ _ +119-13 16703-16705 er _ er CM[84]|CM[89] ANAPHORIC->84-75|ANAPHORIC->89-157 +119-14 16706-16709 mit _ mit _ _ +119-15 16710-16713 den _ den _ _ +119-16 16714-16720 Zähnen _ Zähnen _ _ +119-17 16721-16724 ans _ ans _ _ +119-18 16725-16729 Glas _ Glas _ _ +119-19 16730-16736 schlug _ schlug _ _ +119-20 16737-16738 . _ . _ _ + +#Text=_ " Der Aufseher ruft Sie ! " hieß es . +120-1 16739-16740 _ _ _ _ _ +120-2 16741-16742 " _ " _ _ +120-3 16743-16746 Der _ Der _ _ +120-4 16747-16755 Aufseher _ Aufseher _ _ +120-5 16756-16760 ruft _ ruft _ _ +120-6 16761-16764 Sie _ Sie _ _ +120-7 16765-16766 ! _ ! _ _ +120-8 16767-16768 " _ " _ _ +120-9 16769-16773 hieß _ hieß _ _ +120-10 16774-16776 es _ es _ _ +120-11 16777-16778 . _ . _ _ + +#Text=_ Es war nur das Schreien , das ihn erschreckte , dieses kurze , abgehackte , militärische Schreien , das er dem Wächter Franz gar nicht zugetraut hätte . +121-1 16779-16780 _ _ _ _ _ +121-2 16781-16783 Es _ Es _ _ +121-3 16784-16787 war _ war _ _ +121-4 16788-16791 nur _ nur _ _ +121-5 16792-16795 das _ das CM[57] ANAPHORIC->57-1 +121-6 16796-16804 Schreien _ Schreien CM[57] ANAPHORIC->57-1 +121-7 16805-16806 , _ , _ _ +121-8 16807-16810 das _ das CM[57] ANAPHORIC->57-2 +121-9 16811-16814 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-76|ANAPHORIC->89-158 +121-10 16815-16826 erschreckte _ erschreckte _ _ +121-11 16827-16828 , _ , _ _ +121-12 16829-16835 dieses _ dieses CM[57] ANAPHORIC->57-3 +121-13 16836-16841 kurze _ kurze CM[57] ANAPHORIC->57-3 +121-14 16842-16843 , _ , CM[57] ANAPHORIC->57-3 +121-15 16844-16854 abgehackte _ abgehackte CM[57] ANAPHORIC->57-3 +121-16 16855-16856 , _ , CM[57] ANAPHORIC->57-3 +121-17 16857-16869 militärische _ militärische CM[57] ANAPHORIC->57-3 +121-18 16870-16878 Schreien _ Schreien CM[57] ANAPHORIC->57-3 +121-19 16879-16880 , _ , _ _ +121-20 16881-16884 das _ das CM[57] *->57-4 +121-21 16885-16887 er _ er CM[84]|CM[89] ANAPHORIC->84-77|ANAPHORIC->89-159 +121-22 16888-16891 dem _ dem CM[14] COREFERENTIAL->14-13 +121-23 16892-16899 Wächter _ Wächter CM[14] COREFERENTIAL->14-13 +121-24 16900-16905 Franz PER Franz CM[14] COREFERENTIAL->14-13 +121-25 16906-16909 gar _ gar _ _ +121-26 16910-16915 nicht _ nicht _ _ +121-27 16916-16925 zugetraut _ zugetraut _ _ +121-28 16926-16931 hätte _ hätte _ _ +121-29 16932-16933 . _ . _ _ + +#Text=_ Der Befehl selbst war ihm sehr willkommen . +122-1 16934-16935 _ _ _ _ _ +122-2 16936-16939 Der _ Der _ _ +122-3 16940-16946 Befehl _ Befehl _ _ +122-4 16947-16953 selbst _ selbst _ _ +122-5 16954-16957 war _ war _ _ +122-6 16958-16961 ihm _ ihm CM[84]|CM[89] ANAPHORIC->84-78|ANAPHORIC->89-160 +122-7 16962-16966 sehr _ sehr _ _ +122-8 16967-16977 willkommen _ willkommen _ _ +122-9 16978-16979 . _ . _ _ + +#Text=_ " Endlich ! " rief er zurück , versperrte den Wandschrank und _ eilte sofort ins Nebenzimmer . +123-1 16980-16981 _ _ _ _ _ +123-2 16982-16983 " _ " _ _ +123-3 16984-16991 Endlich _ Endlich _ _ +123-4 16992-16993 ! _ ! _ _ +123-5 16994-16995 " _ " _ _ +123-6 16996-17000 rief _ rief _ _ +123-7 17001-17003 er _ er CM[84]|CM[89] ANAPHORIC->84-79|ANAPHORIC->89-161 +123-8 17004-17010 zurück _ zurück _ _ +123-9 17011-17012 , _ , _ _ +123-10 17013-17023 versperrte _ versperrte _ _ +123-11 17024-17027 den _ den CM[56] *->56-3 +123-12 17028-17039 Wandschrank _ Wandschrank CM[56] *->56-3 +123-13 17040-17043 und _ und _ _ +123-14 17044-17045 _ _ er _ _ +123-15 17046-17051 eilte _ eilte _ _ +123-16 17052-17058 sofort _ sofort _ _ +123-17 17059-17062 ins _ ins CM[60] COREFERENTIAL->60-1 +123-18 17063-17074 Nebenzimmer _ Nebenzimmer CM[60] COREFERENTIAL->60-1 +123-19 17075-17076 . _ . _ _ + +#Text=_ Dort standen die zwei Wächter und _ jagten ihn , als wäre das selbstverständlich , wieder in sein Zimmer zurück . +124-1 17077-17078 _ _ _ _ _ +124-2 17079-17083 Dort _ Dort _ _ +124-3 17084-17091 standen _ standen _ _ +124-4 17092-17095 die _ die CM[24] ANAPHORIC->24-30 +124-5 17096-17100 zwei _ zwei CM[24] ANAPHORIC->24-30 +124-6 17101-17108 Wächter _ Wächter CM[24] ANAPHORIC->24-30 +124-7 17109-17112 und _ und _ _ +124-8 17113-17114 _ _ sie _ _ +124-9 17115-17121 jagten _ jagten _ _ +124-10 17122-17125 ihn _ ihn CM[84]|CM[89] ANAPHORIC->84-80|ANAPHORIC->89-162 +124-11 17126-17127 , _ , _ _ +124-12 17128-17131 als _ als _ _ +124-13 17132-17136 wäre _ wäre _ _ +124-14 17137-17140 das _ das _ _ +124-15 17141-17159 selbstverständlich _ selbstverständlich _ _ +124-16 17160-17161 , _ , _ _ +124-17 17162-17168 wieder _ wieder _ _ +124-18 17169-17171 in _ in _ _ +124-19 17172-17176 sein _ sein CM[71]|CM[84]|CM[89] COREFERENTIAL->71-5|*->84-81|*->89-163 +124-20 17177-17183 Zimmer _ Zimmer CM[71] COREFERENTIAL->71-5 +124-21 17184-17190 zurück _ zurück _ _ +124-22 17191-17192 . _ . _ _ + +#Text=_ " Was fällt Euch ein ? " riefen sie . +125-1 17193-17194 _ _ _ _ _ +125-2 17195-17196 " _ " _ _ +125-3 17197-17200 Was _ Was _ _ +125-4 17201-17206 fällt _ fällt _ _ +125-5 17207-17211 Euch _ Euch _ _ +125-6 17212-17215 ein _ ein _ _ +125-7 17216-17217 ? _ ? _ _ +125-8 17218-17219 " _ " _ _ +125-9 17220-17226 riefen _ riefen _ _ +125-10 17227-17230 sie _ sie CM[24] COREFERENTIAL->24-31 +125-11 17231-17232 . _ . _ _ + +#Text=_ " Im Hemd wollt Ihr vor den Aufseher ? +126-1 17233-17234 _ _ _ _ _ +126-2 17235-17236 " _ " _ _ +126-3 17237-17239 Im _ Im _ _ +126-4 17240-17244 Hemd _ Hemd _ _ +126-5 17245-17250 wollt _ wollt _ _ +126-6 17251-17254 Ihr _ Ihr _ _ +126-7 17255-17258 vor _ vor _ _ +126-8 17259-17262 den _ den CM[58] ANAPHORIC->58-1 +126-9 17263-17271 Aufseher _ Aufseher CM[58] ANAPHORIC->58-1 +126-10 17272-17273 ? _ ? _ _ + +#Text=_ Er läßt Euch durchprügeln und uns _ _ mit ! " +127-1 17274-17275 _ _ _ _ _ +127-2 17276-17278 Er _ Er CM[58] COREFERENTIAL->58-2 +127-3 17279-17283 läßt _ lässt _ _ +127-4 17284-17288 Euch _ Euch _ _ +127-5 17289-17301 durchprügeln _ durchprügeln _ _ +127-6 17302-17305 und _ und _ _ +127-7 17306-17309 uns _ uns _ _ +127-8 17310-17311 _ _ prügelt _ _ +127-9 17312-17313 _ _ er _ _ +127-10 17314-17317 mit _ mit _ _ +127-11 17318-17319 ! _ ! _ _ +127-12 17320-17321 " _ " _ _ + +#Text=_ " Laßt mich , zum Teufel ! " rief K. , der schon bis zu seinem Kleiderkasten zurückgedrängt war , " wenn man mich im Bett überfällt , kann man nicht erwarten , mich im Festanzug zu finden . " +128-1 17322-17323 _ _ _ _ _ +128-2 17324-17325 " _ " _ _ +128-3 17326-17330 Laßt _ Lasst _ _ +128-4 17331-17335 mich _ mich _ _ +128-5 17336-17337 , _ , _ _ +128-6 17338-17341 zum _ zum _ _ +128-7 17342-17348 Teufel _ Teufel _ _ +128-8 17349-17350 ! _ ! _ _ +128-9 17351-17352 " _ " _ _ +128-10 17353-17357 rief _ rief _ _ +128-11 17358-17360 K. PER K. CM[71] ANAPHORIC->71-6 +128-12 17361-17362 , _ , _ _ +128-13 17363-17366 der _ der CM[71] ANAPHORIC->71-7 +128-14 17367-17372 schon _ schon _ _ +128-15 17373-17376 bis _ bis _ _ +128-16 17377-17379 zu _ zu _ _ +128-17 17380-17386 seinem _ seinem CM[71] ANAPHORIC->71-8 +128-18 17387-17400 Kleiderkasten _ Kleiderkasten _ _ +128-19 17401-17415 zurückgedrängt _ zurückgedrängt _ _ +128-20 17416-17419 war _ war _ _ +128-21 17420-17421 , _ , _ _ +128-22 17422-17423 " _ " _ _ +128-23 17424-17428 wenn _ Wenn _ _ +128-24 17429-17432 man _ man _ _ +128-25 17433-17437 mich _ mich _ _ +128-26 17438-17440 im _ im _ _ +128-27 17441-17445 Bett _ Bett _ _ +128-28 17446-17455 überfällt _ überfällt _ _ +128-29 17456-17457 , _ , _ _ +128-30 17458-17462 kann _ kann _ _ +128-31 17463-17466 man _ man _ _ +128-32 17467-17472 nicht _ nicht _ _ +128-33 17473-17481 erwarten _ erwarten _ _ +128-34 17482-17483 , _ , _ _ +128-35 17484-17488 mich _ mich _ _ +128-36 17489-17491 im _ im _ _ +128-37 17492-17501 Festanzug _ Festanzug _ _ +128-38 17502-17504 zu _ zu _ _ +128-39 17505-17511 finden _ finden _ _ +128-40 17512-17513 . _ . _ _ +128-41 17514-17515 " _ " _ _ + +#Text=_ " Es hilft nichts " , sagten die Wächter , die immer , wenn K. schrie , ganz ruhig , ja fast traurig wurden und _ ihn dadurch verwirrten oder _ _ gewissermaßen zur Besinnung brachten . +129-1 17516-17517 _ _ _ _ _ +129-2 17518-17519 " _ " _ _ +129-3 17520-17522 Es _ Es _ _ +129-4 17523-17528 hilft _ hilft _ _ +129-5 17529-17535 nichts _ nichts _ _ +129-6 17536-17537 " _ " _ _ +129-7 17538-17539 , _ , _ _ +129-8 17540-17546 sagten _ sagten _ _ +129-9 17547-17550 die _ die CM[24] ANAPHORIC->24-32 +129-10 17551-17558 Wächter _ Wächter CM[24] ANAPHORIC->24-32 +129-11 17559-17560 , _ , _ _ +129-12 17561-17564 die _ die CM[24] ANAPHORIC->24-33 +129-13 17565-17570 immer _ immer _ _ +129-14 17571-17572 , _ , _ _ +129-15 17573-17577 wenn _ wenn _ _ +129-16 17578-17580 K. PER K. CM[71] ANAPHORIC->71-9 +129-17 17581-17587 schrie _ schrie _ _ +129-18 17588-17589 , _ , _ _ +129-19 17590-17594 ganz _ ganz _ _ +129-20 17595-17600 ruhig _ ruhig _ _ +129-21 17601-17602 , _ , _ _ +129-22 17603-17605 ja _ ja _ _ +129-23 17606-17610 fast _ fast _ _ +129-24 17611-17618 traurig _ traurig _ _ +129-25 17619-17625 wurden _ wurden _ _ +129-26 17626-17629 und _ und _ _ +129-27 17630-17631 _ _ die _ _ +129-28 17632-17635 ihn _ ihn CM[71] ANAPHORIC->71-10 +129-29 17636-17643 dadurch _ dadurch _ _ +129-30 17644-17654 verwirrten _ verwirrten _ _ +129-31 17655-17659 oder _ oder _ _ +129-32 17660-17661 _ _ die _ _ +129-33 17662-17663 _ _ ihn _ _ +129-34 17664-17677 gewissermaßen _ gewissermaßen _ _ +129-35 17678-17681 zur _ zur _ _ +129-36 17682-17691 Besinnung _ Besinnung _ _ +129-37 17692-17700 brachten _ brachten _ _ +129-38 17701-17702 . _ . _ _ + +#Text=_ " _ _ Lächerliche Zeremonien ! " brummte er noch , _ hob aber schon einen Rock vom Stuhl und _ hielt ihn ein Weilchen mit beiden Händen , als unterbreite er ihn dem Urteil der Wächter . +130-1 17703-17704 _ _ _ _ _ +130-2 17705-17706 " _ " _ _ +130-3 17707-17708 _ _ Das _ _ +130-4 17709-17710 _ _ sind _ _ +130-5 17711-17722 Lächerliche _ lächerliche _ _ +130-6 17723-17733 Zeremonien _ Zeremonien _ _ +130-7 17734-17735 ! _ ! _ _ +130-8 17736-17737 " _ " _ _ +130-9 17738-17745 brummte _ brummte _ _ +130-10 17746-17748 er _ er CM[71] ANAPHORIC->71-11 +130-11 17749-17753 noch _ noch _ _ +130-12 17754-17755 , _ , _ _ +130-13 17756-17757 _ _ er _ _ +130-14 17758-17761 hob _ hob _ _ +130-15 17762-17766 aber _ aber _ _ +130-16 17767-17772 schon _ schon _ _ +130-17 17773-17778 einen _ einen CM[59] ANAPHORIC->59-1 +130-18 17779-17783 Rock _ Rock CM[59] ANAPHORIC->59-1 +130-19 17784-17787 vom _ vom _ _ +130-20 17788-17793 Stuhl _ Stuhl _ _ +130-21 17794-17797 und _ und _ _ +130-22 17798-17799 _ _ er _ _ +130-23 17800-17805 hielt _ hielt _ _ +130-24 17806-17809 ihn _ ihn CM[59] ANAPHORIC->59-2 +130-25 17810-17813 ein _ ein _ _ +130-26 17814-17822 Weilchen _ Weilchen _ _ +130-27 17823-17826 mit _ mit _ _ +130-28 17827-17833 beiden _ beiden _ _ +130-29 17834-17840 Händen _ Händen _ _ +130-30 17841-17842 , _ , _ _ +130-31 17843-17846 als _ als _ _ +130-32 17847-17858 unterbreite _ unterbreite _ _ +130-33 17859-17861 er _ er CM[71] ANAPHORIC->71-12 +130-34 17862-17865 ihn _ ihn CM[59] ANAPHORIC->59-3 +130-35 17866-17869 dem _ dem _ _ +130-36 17870-17876 Urteil _ Urteil _ _ +130-37 17877-17880 der _ der CM[24] ANAPHORIC->24-34 +130-38 17881-17888 Wächter _ Wächter CM[24] ANAPHORIC->24-34 +130-39 17889-17890 . _ . _ _ + +#Text=_ Sie schüttelten die Köpfe . +131-1 17891-17892 _ _ _ _ _ +131-2 17893-17896 Sie _ Sie CM[24] ANAPHORIC->24-35 +131-3 17897-17908 schüttelten _ schüttelten _ _ +131-4 17909-17912 die _ die _ _ +131-5 17913-17918 Köpfe _ Köpfe _ _ +131-6 17919-17920 . _ . _ _ + +#Text=_ " Es muß ein schwarzer Rock sein " , sagten sie . +132-1 17921-17922 _ _ _ _ _ +132-2 17923-17924 " _ " _ _ +132-3 17925-17927 Es _ Es _ _ +132-4 17928-17931 muß _ muss _ _ +132-5 17932-17935 ein _ ein _ _ +132-6 17936-17945 schwarzer _ schwarzer _ _ +132-7 17946-17950 Rock _ Rock _ _ +132-8 17951-17955 sein _ sein _ _ +132-9 17956-17957 " _ " _ _ +132-10 17958-17959 , _ , _ _ +132-11 17960-17966 sagten _ sagten _ _ +132-12 17967-17970 sie _ sie CM[24] COREFERENTIAL->24-36 +132-13 17971-17972 . _ . _ _ + +#Text=_ K. warf daraufhin den Rock zu Boden und _ sagte – er wußte selbst nicht , in welchem Sinne er es sagte – : " Es ist doch noch nicht die Hauptverhandlung . " +133-1 17973-17974 _ _ _ _ _ +133-2 17975-17977 K. PER K. CM[71] ANAPHORIC->71-13 +133-3 17978-17982 warf _ warf _ _ +133-4 17983-17992 daraufhin _ daraufhin _ _ +133-5 17993-17996 den _ den CM[59] *->59-4 +133-6 17997-18001 Rock _ Rock CM[59] *->59-4 +133-7 18002-18004 zu _ zu _ _ +133-8 18005-18010 Boden _ Boden _ _ +133-9 18011-18014 und _ und _ _ +133-10 18015-18016 _ _ er _ _ +133-11 18017-18022 sagte _ sagte _ _ +133-12 18023-18024 – _ – _ _ +133-13 18025-18027 er _ er CM[71] ANAPHORIC->71-14 +133-14 18028-18033 wußte _ wusste _ _ +133-15 18034-18040 selbst _ selbst _ _ +133-16 18041-18046 nicht _ nicht _ _ +133-17 18047-18048 , _ , _ _ +133-18 18049-18051 in _ in _ _ +133-19 18052-18059 welchem _ welchem _ _ +133-20 18060-18065 Sinne _ Sinne _ _ +133-21 18066-18068 er _ er CM[71] COREFERENTIAL->71-15 +133-22 18069-18071 es _ es _ _ +133-23 18072-18077 sagte _ sagte _ _ +133-24 18078-18079 – _ – _ _ +133-25 18080-18081 : _ : _ _ +133-26 18082-18083 " _ " _ _ +133-27 18084-18086 Es _ Es _ _ +133-28 18087-18090 ist _ ist _ _ +133-29 18091-18095 doch _ doch _ _ +133-30 18096-18100 noch _ noch _ _ +133-31 18101-18106 nicht _ nicht _ _ +133-32 18107-18110 die _ die _ _ +133-33 18111-18127 Hauptverhandlung _ Hauptverhandlung _ _ +133-34 18128-18129 . _ . _ _ +133-35 18130-18131 " _ " _ _ + +#Text=_ Die Wächter lächelten , _ blieben aber bei ihrem : " Es muß ein schwarzer Rock sein . " +134-1 18132-18133 _ _ _ _ _ +134-2 18134-18137 Die _ Die CM[24] ANAPHORIC->24-37 +134-3 18138-18145 Wächter _ Wächter CM[24] ANAPHORIC->24-37 +134-4 18146-18155 lächelten _ lächelten _ _ +134-5 18156-18157 , _ , _ _ +134-6 18158-18159 _ _ sie _ _ +134-7 18160-18167 blieben _ blieben _ _ +134-8 18168-18172 aber _ aber _ _ +134-9 18173-18176 bei _ bei _ _ +134-10 18177-18182 ihrem _ ihrem CM[24] ANAPHORIC->24-38 +134-11 18183-18184 : _ : _ _ +134-12 18185-18186 " _ " _ _ +134-13 18187-18189 Es _ Es _ _ +134-14 18190-18193 muß _ muss _ _ +134-15 18194-18197 ein _ ein _ _ +134-16 18198-18207 schwarzer _ schwarzer _ _ +134-17 18208-18212 Rock _ Rock _ _ +134-18 18213-18217 sein _ sein _ _ +134-19 18218-18219 . _ . _ _ +134-20 18220-18221 " _ " _ _ + +#Text=_ " Wenn ich dadurch die Sache beschleunige , soll es mir recht sein " , sagte K. , _ öffnete selbst den Kleiderkasten , _ suchte lange unter den vielen Kleidern , _ wählte sein bestes schwarzes Kleid , ein Jackettkleid , das durch seine Taille unter den Bekannten fast Aufsehen gemacht hatte , _ zog nun auch ein anderes Hemd hervor und _ begann , sich sorgfältig anzuziehen . +135-1 18222-18223 _ _ _ _ _ +135-2 18224-18225 " _ " _ _ +135-3 18226-18230 Wenn _ Wenn _ _ +135-4 18231-18234 ich _ ich _ _ +135-5 18235-18242 dadurch _ dadurch _ _ +135-6 18243-18246 die _ die _ _ +135-7 18247-18252 Sache _ Sache _ _ +135-8 18253-18265 beschleunige _ beschleunige _ _ +135-9 18266-18267 , _ , _ _ +135-10 18268-18272 soll _ soll _ _ +135-11 18273-18275 es _ es _ _ +135-12 18276-18279 mir _ mir _ _ +135-13 18280-18285 recht _ recht _ _ +135-14 18286-18290 sein _ sein _ _ +135-15 18291-18292 " _ " _ _ +135-16 18293-18294 , _ , _ _ +135-17 18295-18300 sagte _ sagte _ _ +135-18 18301-18303 K. PER K. CM[71] ANAPHORIC->71-16 +135-19 18304-18305 , _ , _ _ +135-20 18306-18307 _ _ er _ _ +135-21 18308-18315 öffnete _ öffnete _ _ +135-22 18316-18322 selbst _ selbst _ _ +135-23 18323-18326 den _ den _ _ +135-24 18327-18340 Kleiderkasten _ Kleiderkasten _ _ +135-25 18341-18342 , _ , _ _ +135-26 18343-18344 _ _ er _ _ +135-27 18345-18351 suchte _ suchte _ _ +135-28 18352-18357 lange _ lange _ _ +135-29 18358-18363 unter _ unter _ _ +135-30 18364-18367 den _ den _ _ +135-31 18368-18374 vielen _ vielen _ _ +135-32 18375-18383 Kleidern _ Kleidern _ _ +135-33 18384-18385 , _ , _ _ +135-34 18386-18387 _ _ er _ _ +135-35 18388-18394 wählte _ wählte _ _ +135-36 18395-18399 sein _ sein _ _ +135-37 18400-18406 bestes _ bestes _ _ +135-38 18407-18416 schwarzes _ schwarzes _ _ +135-39 18417-18422 Kleid _ Kleid _ _ +135-40 18423-18424 , _ , _ _ +135-41 18425-18428 ein _ ein _ _ +135-42 18429-18441 Jackettkleid _ Jackettkleid _ _ +135-43 18442-18443 , _ , _ _ +135-44 18444-18447 das _ das _ _ +135-45 18448-18453 durch _ durch _ _ +135-46 18454-18459 seine _ seine _ _ +135-47 18460-18466 Taille _ Taille _ _ +135-48 18467-18472 unter _ unter _ _ +135-49 18473-18476 den _ den _ _ +135-50 18477-18486 Bekannten _ Bekannten _ _ +135-51 18487-18491 fast _ fast _ _ +135-52 18492-18500 Aufsehen _ Aufsehen _ _ +135-53 18501-18508 gemacht _ gemacht _ _ +135-54 18509-18514 hatte _ hatte _ _ +135-55 18515-18516 , _ , _ _ +135-56 18517-18518 _ _ er _ _ +135-57 18519-18522 zog _ zog _ _ +135-58 18523-18526 nun _ nun _ _ +135-59 18527-18531 auch _ auch _ _ +135-60 18532-18535 ein _ ein _ _ +135-61 18536-18543 anderes _ anderes _ _ +135-62 18544-18548 Hemd _ Hemd _ _ +135-63 18549-18555 hervor _ hervor _ _ +135-64 18556-18559 und _ und _ _ +135-65 18560-18561 _ _ er _ _ +135-66 18562-18568 begann _ begann _ _ +135-67 18569-18570 , _ , _ _ +135-68 18571-18575 sich _ sich CM[71] ANAPHORIC->71-17 +135-69 18576-18586 sorgfältig _ sorgfältig _ _ +135-70 18587-18597 anzuziehen _ anzuziehen _ _ +135-71 18598-18599 . _ . _ _ + +#Text=_ Im geheimen glaubte er , eine Beschleunigung des Ganzen damit erreicht zu haben , daß die Wächter vergessen hatten , ihn zum Bad zu zwingen . +136-1 18600-18601 _ _ _ _ _ +136-2 18602-18604 Im _ Im _ _ +136-3 18605-18613 geheimen _ Geheimen _ _ +136-4 18614-18621 glaubte _ glaubte _ _ +136-5 18622-18624 er _ er CM[71] ANAPHORIC->71-18 +136-6 18625-18626 , _ , _ _ +136-7 18627-18631 eine _ eine _ _ +136-8 18632-18646 Beschleunigung _ Beschleunigung _ _ +136-9 18647-18650 des _ des _ _ +136-10 18651-18657 Ganzen _ Ganzen _ _ +136-11 18658-18663 damit _ damit _ _ +136-12 18664-18672 erreicht _ erreicht _ _ +136-13 18673-18675 zu _ zu _ _ +136-14 18676-18681 haben _ haben _ _ +136-15 18682-18683 , _ , _ _ +136-16 18684-18687 daß _ dass _ _ +136-17 18688-18691 die _ die CM[24] ANAPHORIC->24-39 +136-18 18692-18699 Wächter _ Wächter CM[24] ANAPHORIC->24-39 +136-19 18700-18709 vergessen _ vergessen _ _ +136-20 18710-18716 hatten _ hatten _ _ +136-21 18717-18718 , _ , _ _ +136-22 18719-18722 ihn _ ihn CM[71] ANAPHORIC->71-19 +136-23 18723-18726 zum _ zum _ _ +136-24 18727-18730 Bad _ Bad _ _ +136-25 18731-18733 zu _ zu _ _ +136-26 18734-18741 zwingen _ zwingen _ _ +136-27 18742-18743 . _ . _ _ + +#Text=_ Er beobachtete sie , ob sie sich vielleicht daran doch erinnern würden , aber das fiel ihnen natürlich gar nicht ein , dagegen vergaß Willem nicht , Franz mit der Meldung , daß sich K. anziehe , zum Aufseher zu schicken . +137-1 18744-18745 _ _ _ _ _ +137-2 18746-18748 Er _ Er CM[71] COREFERENTIAL->71-20 +137-3 18749-18760 beobachtete _ beobachtete _ _ +137-4 18761-18764 sie _ sie CM[24] ANAPHORIC->24-40 +137-5 18765-18766 , _ , _ _ +137-6 18767-18769 ob _ ob _ _ +137-7 18770-18773 sie _ sie CM[24] ANAPHORIC->24-41 +137-8 18774-18778 sich _ sich CM[24] ANAPHORIC->24-42 +137-9 18779-18789 vielleicht _ vielleicht _ _ +137-10 18790-18795 daran _ daran _ _ +137-11 18796-18800 doch _ doch _ _ +137-12 18801-18809 erinnern _ erinnern _ _ +137-13 18810-18816 würden _ würden _ _ +137-14 18817-18818 , _ , _ _ +137-15 18819-18823 aber _ aber _ _ +137-16 18824-18827 das _ das _ _ +137-17 18828-18832 fiel _ fiel _ _ +137-18 18833-18838 ihnen _ ihnen CM[24] COREFERENTIAL->24-43 +137-19 18839-18848 natürlich _ natürlich _ _ +137-20 18849-18852 gar _ gar _ _ +137-21 18853-18858 nicht _ nicht _ _ +137-22 18859-18862 ein _ ein _ _ +137-23 18863-18864 , _ , _ _ +137-24 18865-18872 dagegen _ dagegen _ _ +137-25 18873-18879 vergaß _ vergaß _ _ +137-26 18880-18886 Willem PER Willem CM[34] ANAPHORIC->34-11 +137-27 18887-18892 nicht _ nicht _ _ +137-28 18893-18894 , _ , _ _ +137-29 18895-18900 Franz PER Franz CM[14] COREFERENTIAL->14-14 +137-30 18901-18904 mit _ mit _ _ +137-31 18905-18908 der _ der _ _ +137-32 18909-18916 Meldung _ Meldung _ _ +137-33 18917-18918 , _ , _ _ +137-34 18919-18922 daß _ dass _ _ +137-35 18923-18927 sich _ sich CM[71] cataphoric->71-21 +137-36 18928-18930 K. PER K. CM[71] ANAPHORIC->71-22 +137-37 18931-18938 anziehe _ anziehe _ _ +137-38 18939-18940 , _ , _ _ +137-39 18941-18944 zum _ zum CM[58] COREFERENTIAL->58-3 +137-40 18945-18953 Aufseher _ Aufseher CM[58] COREFERENTIAL->58-3 +137-41 18954-18956 zu _ zu _ _ +137-42 18957-18965 schicken _ schicken _ _ +137-43 18966-18967 . _ . _ _ + +#Text=_ Als er vollständig angezogen war , mußte er knapp vor Willem durch das leere Nebenzimmer in das folgende Zimmer gehen , dessen Tür mit beiden Flügeln bereits geöffnet war . +138-1 18968-18969 _ _ _ _ _ +138-2 18970-18973 Als _ Als _ _ +138-3 18974-18976 er _ er CM[71] ANAPHORIC->71-23 +138-4 18977-18988 vollständig _ vollständig _ _ +138-5 18989-18998 angezogen _ angezogen _ _ +138-6 18999-19002 war _ war _ _ +138-7 19003-19004 , _ , _ _ +138-8 19005-19010 mußte _ musste _ _ +138-9 19011-19013 er _ er CM[71] ANAPHORIC->71-24 +138-10 19014-19019 knapp _ knapp _ _ +138-11 19020-19023 vor _ vor _ _ +138-12 19024-19030 Willem PER Willem CM[34] *->34-12 +138-13 19031-19036 durch _ durch _ _ +138-14 19037-19040 das _ das CM[60] *->60-2 +138-15 19041-19046 leere _ leere CM[60] *->60-2 +138-16 19047-19058 Nebenzimmer _ Nebenzimmer CM[60] *->60-2 +138-17 19059-19061 in _ in _ _ +138-18 19062-19065 das _ das CM[61] ANAPHORIC->61-1 +138-19 19066-19074 folgende _ folgende CM[61] ANAPHORIC->61-1 +138-20 19075-19081 Zimmer _ Zimmer CM[61] ANAPHORIC->61-1 +138-21 19082-19087 gehen _ gehen _ _ +138-22 19088-19089 , _ , _ _ +138-23 19090-19096 dessen _ dessen CM[61] ANAPHORIC->61-2 +138-24 19097-19100 Tür _ Tür _ _ +138-25 19101-19104 mit _ mit _ _ +138-26 19105-19111 beiden _ beiden _ _ +138-27 19112-19119 Flügeln _ Flügeln _ _ +138-28 19120-19127 bereits _ bereits _ _ +138-29 19128-19136 geöffnet _ geöffnet _ _ +138-30 19137-19140 war _ war _ _ +138-31 19141-19142 . _ . _ _ + +#Text=_ Dieses Zimmer wurde , wie K. genau wußte , seit kurzer Zeit von einem Fräulein Bürstner , einer Schreibmaschinistin , bewohnt , die sehr früh in die Arbeit zu gehen pflegte , _ spät nach Hause kam und mit der K. nicht viel mehr als die Grußworte gewechselt hatte . +139-1 19143-19144 _ _ _ _ _ +139-2 19145-19151 Dieses _ Dieses CM[61] ANAPHORIC->61-3 +139-3 19152-19158 Zimmer _ Zimmer CM[61] ANAPHORIC->61-3 +139-4 19159-19164 wurde _ wurde _ _ +139-5 19165-19166 , _ , _ _ +139-6 19167-19170 wie _ wie _ _ +139-7 19171-19173 K. PER K. CM[71] ANAPHORIC->71-25 +139-8 19174-19179 genau _ genau _ _ +139-9 19180-19185 wußte _ wusste _ _ +139-10 19186-19187 , _ , _ _ +139-11 19188-19192 seit _ seit _ _ +139-12 19193-19199 kurzer _ kurzer _ _ +139-13 19200-19204 Zeit _ Zeit _ _ +139-14 19205-19208 von _ von _ _ +139-15 19209-19214 einem _ einem CM[62] ANAPHORIC->62-2 +139-16 19215-19223 Fräulein _ Fräulein CM[62] ANAPHORIC->62-2 +139-17 19224-19232 Bürstner PER Bürstner CM[62] ANAPHORIC->62-2 +139-18 19233-19234 , _ , _ _ +139-19 19235-19240 einer _ einer _ _ +139-20 19241-19260 Schreibmaschinistin _ Schreibmaschinistin _ _ +139-21 19261-19262 , _ , _ _ +139-22 19263-19270 bewohnt _ bewohnt _ _ +139-23 19271-19272 , _ , _ _ +139-24 19273-19276 die _ die CM[62] ANAPHORIC->62-3 +139-25 19277-19281 sehr _ sehr _ _ +139-26 19282-19286 früh _ früh _ _ +139-27 19287-19289 in _ in _ _ +139-28 19290-19293 die _ die _ _ +139-29 19294-19300 Arbeit _ Arbeit _ _ +139-30 19301-19303 zu _ zu _ _ +139-31 19304-19309 gehen _ gehen _ _ +139-32 19310-19317 pflegte _ pflegte _ _ +139-33 19318-19319 , _ , _ _ +139-34 19320-19321 _ _ die _ _ +139-35 19322-19326 spät _ spät _ _ +139-36 19327-19331 nach _ nach _ _ +139-37 19332-19337 Hause _ Hause _ _ +139-38 19338-19341 kam _ kam _ _ +139-39 19342-19345 und _ und _ _ +139-40 19346-19349 mit _ mit _ _ +139-41 19350-19353 der _ der CM[62] COREFERENTIAL->62-4 +139-42 19354-19356 K. PER K. CM[71] COREFERENTIAL->71-26 +139-43 19357-19362 nicht _ nicht _ _ +139-44 19363-19367 viel _ viel _ _ +139-45 19368-19372 mehr _ mehr _ _ +139-46 19373-19376 als _ als _ _ +139-47 19377-19380 die _ die _ _ +139-48 19381-19390 Grußworte _ Grußworte _ _ +139-49 19391-19401 gewechselt _ gewechselt _ _ +139-50 19402-19407 hatte _ hatte _ _ +139-51 19408-19409 . _ . _ _ + +#Text=_ Jetzt war das Nachttischchen von ihrem Bett als Verhandlungstisch in die Mitte des Zimmers gerückt , und der Aufseher saß hinter ihm . +140-1 19410-19411 _ _ _ _ _ +140-2 19412-19417 Jetzt _ Jetzt _ _ +140-3 19418-19421 war _ war _ _ +140-4 19422-19425 das _ das CM[66] ANAPHORIC->66-1 +140-5 19426-19440 Nachttischchen _ Nachttischchen CM[66] ANAPHORIC->66-1 +140-6 19441-19444 von _ von _ _ +140-7 19445-19450 ihrem _ ihrem _ _ +140-8 19451-19455 Bett _ Bett _ _ +140-9 19456-19459 als _ als _ _ +140-10 19460-19477 Verhandlungstisch _ Verhandlungstisch _ _ +140-11 19478-19480 in _ in _ _ +140-12 19481-19484 die _ die _ _ +140-13 19485-19490 Mitte _ Mitte _ _ +140-14 19491-19494 des _ des CM[61] ANAPHORIC->61-4 +140-15 19495-19502 Zimmers _ Zimmers CM[61] ANAPHORIC->61-4 +140-16 19503-19510 gerückt _ gerückt _ _ +140-17 19511-19512 , _ , _ _ +140-18 19513-19516 und _ und _ _ +140-19 19517-19520 der _ der CM[58] ANAPHORIC->58-4 +140-20 19521-19529 Aufseher _ Aufseher CM[58] ANAPHORIC->58-4 +140-21 19530-19533 saß _ saß _ _ +140-22 19534-19540 hinter _ hinter _ _ +140-23 19541-19544 ihm _ ihm CM[66] COREFERENTIAL->66-2 +140-24 19545-19546 . _ . _ _ + +#Text=_ Er hatte die Beine übereinandergeschlagen und einen Arm auf die Rückenlehne des Stuhles gelegt . +141-1 19547-19548 _ _ _ _ _ +141-2 19549-19551 Er _ Er CM[58] COREFERENTIAL->58-5 +141-3 19552-19557 hatte _ hatte _ _ +141-4 19558-19561 die _ die _ _ +141-5 19562-19567 Beine _ Beine _ _ +141-6 19568-19590 übereinandergeschlagen _ übereinandergeschlagen _ _ +141-7 19591-19594 und _ und _ _ +141-8 19595-19600 einen _ einen _ _ +141-9 19601-19604 Arm _ Arm _ _ +141-10 19605-19608 auf _ auf _ _ +141-11 19609-19612 die _ die _ _ +141-12 19613-19624 Rückenlehne _ Rückenlehne _ _ +141-13 19625-19628 des _ des _ _ +141-14 19629-19636 Stuhles _ Stuhles _ _ +141-15 19637-19643 gelegt _ gelegt _ _ +141-16 19644-19645 . _ . _ _ + +#Text=_ In einer Ecke des Zimmers standen drei junge Leute und _ sahen die Photographien des Fräulein Bürstner an , die in einer an der Wand aufgehängten Matte steckten . +142-1 19646-19647 _ _ _ _ _ +142-2 19648-19650 In _ In _ _ +142-3 19651-19656 einer _ einer _ _ +142-4 19657-19661 Ecke _ Ecke _ _ +142-5 19662-19665 des _ des CM[61] *->61-5 +142-6 19666-19673 Zimmers _ Zimmers CM[61] *->61-5 +142-7 19674-19681 standen _ standen _ _ +142-8 19682-19686 drei _ drei CM[78] ANAPHORIC->78-1 +142-9 19687-19692 junge _ junge CM[78] ANAPHORIC->78-1 +142-10 19693-19698 Leute _ Leute CM[78] ANAPHORIC->78-1 +142-11 19699-19702 und _ und _ _ +142-12 19703-19704 _ _ sie _ _ +142-13 19705-19710 sahen _ sahen _ _ +142-14 19711-19714 die _ die CM[63] ANAPHORIC->63-1 +142-15 19715-19728 Photographien _ Photographien CM[63] ANAPHORIC->63-1 +142-16 19729-19732 des _ des CM[63] ANAPHORIC->63-1 +142-17 19733-19741 Fräulein _ Fräulein CM[62]|CM[63] *->62-5|ANAPHORIC->63-1 +142-18 19742-19750 Bürstner PER Bürstner CM[62]|CM[63] *->62-5|ANAPHORIC->63-1 +142-19 19751-19753 an _ an _ _ +142-20 19754-19755 , _ , _ _ +142-21 19756-19759 die _ die CM[63] *->63-2 +142-22 19760-19762 in _ in _ _ +142-23 19763-19768 einer _ einer _ _ +142-24 19769-19771 an _ an _ _ +142-25 19772-19775 der _ der _ _ +142-26 19776-19780 Wand _ Wand _ _ +142-27 19781-19793 aufgehängten _ aufgehängten _ _ +142-28 19794-19799 Matte _ Matte _ _ +142-29 19800-19808 steckten _ steckten _ _ +142-30 19809-19810 . _ . _ _ + +#Text=_ An der Klinke des offenen Fensters hing eine weiße Bluse . +143-1 19811-19812 _ _ _ _ _ +143-2 19813-19815 An _ An _ _ +143-3 19816-19819 der _ der _ _ +143-4 19820-19826 Klinke _ Klinke _ _ +143-5 19827-19830 des _ des _ _ +143-6 19831-19838 offenen _ offenen _ _ +143-7 19839-19847 Fensters _ Fensters _ _ +143-8 19848-19852 hing _ hing _ _ +143-9 19853-19857 eine _ eine _ _ +143-10 19858-19863 weiße _ weiße _ _ +143-11 19864-19869 Bluse _ Bluse _ _ +143-12 19870-19871 . _ . _ _ + +#Text=_ Im gegenüberliegenden Fenster lagen wieder die zwei Alten , doch hatte sich ihre Gesellschaft vergrößert , denn hinter ihnen , sie weit überragend , stand ein Mann mit einem auf der Brust offenen Hemd , der seinen rötlichen Spitzbart mit den Fingern drückte und _ _ drehte . +144-1 19872-19873 _ _ _ _ _ +144-2 19874-19876 Im _ Im _ _ +144-3 19877-19895 gegenüberliegenden _ gegenüberliegenden _ _ +144-4 19896-19903 Fenster _ Fenster _ _ +144-5 19904-19909 lagen _ lagen _ _ +144-6 19910-19916 wieder _ wieder _ _ +144-7 19917-19920 die _ die CM[64] ANAPHORIC->64-1 +144-8 19921-19925 zwei _ zwei CM[64] ANAPHORIC->64-1 +144-9 19926-19931 Alten _ Alten CM[64] ANAPHORIC->64-1 +144-10 19932-19933 , _ , _ _ +144-11 19934-19938 doch _ doch _ _ +144-12 19939-19944 hatte _ hatte _ _ +144-13 19945-19949 sich _ sich _ _ +144-14 19950-19954 ihre _ ihre _ _ +144-15 19955-19967 Gesellschaft _ Gesellschaft _ _ +144-16 19968-19978 vergrößert _ vergrößert _ _ +144-17 19979-19980 , _ , _ _ +144-18 19981-19985 denn _ denn _ _ +144-19 19986-19992 hinter _ hinter _ _ +144-20 19993-19998 ihnen _ ihnen CM[64] ANAPHORIC->64-2 +144-21 19999-20000 , _ , _ _ +144-22 20001-20004 sie _ sie CM[64] COREFERENTIAL->64-3 +144-23 20005-20009 weit _ weit _ _ +144-24 20010-20020 überragend _ überragend _ _ +144-25 20021-20022 , _ , _ _ +144-26 20023-20028 stand _ stand _ _ +144-27 20029-20032 ein _ ein CM[65] ANAPHORIC->65-1 +144-28 20033-20037 Mann _ Mann CM[65] ANAPHORIC->65-1 +144-29 20038-20041 mit _ mit CM[65] ANAPHORIC->65-1 +144-30 20042-20047 einem _ einem CM[65] ANAPHORIC->65-1 +144-31 20048-20051 auf _ auf CM[65] ANAPHORIC->65-1 +144-32 20052-20055 der _ der CM[65] ANAPHORIC->65-1 +144-33 20056-20061 Brust _ Brust CM[65] ANAPHORIC->65-1 +144-34 20062-20069 offenen _ offenen CM[65] ANAPHORIC->65-1 +144-35 20070-20074 Hemd _ Hemd CM[65] ANAPHORIC->65-1 +144-36 20075-20076 , _ , _ _ +144-37 20077-20080 der _ der CM[65] ANAPHORIC->65-2 +144-38 20081-20087 seinen _ seinen CM[65] COREFERENTIAL->65-3 +144-39 20088-20097 rötlichen _ rötlichen _ _ +144-40 20098-20107 Spitzbart _ Spitzbart _ _ +144-41 20108-20111 mit _ mit _ _ +144-42 20112-20115 den _ den _ _ +144-43 20116-20123 Fingern _ Fingern _ _ +144-44 20124-20131 drückte _ drückte _ _ +144-45 20132-20135 und _ und _ _ +144-46 20136-20137 _ _ der _ _ +144-47 20138-20139 _ _ ihn _ _ +144-48 20140-20146 drehte _ drehte _ _ +144-49 20147-20148 . _ . _ _ + +#Text=_ " _ _ Josef K. ? " fragte der Aufseher , vielleicht nur um K.s zerstreute Blicke auf sich zu lenken . +145-1 20149-20150 _ _ _ _ _ +145-2 20151-20152 " _ " _ _ +145-3 20153-20154 _ _ Sind _ _ +145-4 20155-20156 _ _ Sie _ _ +145-5 20157-20162 Josef PER[60] Josef _ _ +145-6 20163-20165 K. PER[60] K. _ _ +145-7 20166-20167 ? _ ? _ _ +145-8 20168-20169 " _ " _ _ +145-9 20170-20176 fragte _ fragte _ _ +145-10 20177-20180 der _ der CM[58] ANAPHORIC->58-6 +145-11 20181-20189 Aufseher _ Aufseher CM[58] ANAPHORIC->58-6 +145-12 20190-20191 , _ , _ _ +145-13 20192-20202 vielleicht _ vielleicht _ _ +145-14 20203-20206 nur _ nur _ _ +145-15 20207-20209 um _ um _ _ +145-16 20210-20213 K.s PER K.s CM[71] COREFERENTIAL->71-27 +145-17 20214-20224 zerstreute _ zerstreute _ _ +145-18 20225-20231 Blicke _ Blicke _ _ +145-19 20232-20235 auf _ auf _ _ +145-20 20236-20240 sich _ sich CM[58] COREFERENTIAL->58-7 +145-21 20241-20243 zu _ zu _ _ +145-22 20244-20250 lenken _ lenken _ _ +145-23 20251-20252 . _ . _ _ + +#Text=_ K. nickte . +146-1 20253-20254 _ _ _ _ _ +146-2 20255-20257 K. PER K. CM[71] COREFERENTIAL->71-28 +146-3 20258-20264 nickte _ nickte _ _ +146-4 20265-20266 . _ . _ _ + +#Text=_ " Sie sind durch die Vorgänge des heutigen Morgens wohl sehr überrascht ? " _ fragte der Aufseher und _ verschob dabei mit beiden Händen die wenigen Gegenstände , die auf dem Nachttischchen lagen , die Kerze mit Zündhölzchen , ein Buch und ein Nadelkissen , als seien es Gegenstände , die er zur Verhandlung benötige . +147-1 20267-20268 _ _ _ _ _ +147-2 20269-20270 " _ " _ _ +147-3 20271-20274 Sie _ Sie _ _ +147-4 20275-20279 sind _ sind _ _ +147-5 20280-20285 durch _ durch _ _ +147-6 20286-20289 die _ die _ _ +147-7 20290-20298 Vorgänge _ Vorgänge _ _ +147-8 20299-20302 des _ des _ _ +147-9 20303-20311 heutigen _ heutigen _ _ +147-10 20312-20319 Morgens _ Morgens _ _ +147-11 20320-20324 wohl _ wohl _ _ +147-12 20325-20329 sehr _ sehr _ _ +147-13 20330-20340 überrascht _ überrascht _ _ +147-14 20341-20342 ? _ ? _ _ +147-15 20343-20344 " _ " _ _ +147-16 20345-20346 _ _ , _ _ +147-17 20347-20353 fragte _ fragte _ _ +147-18 20354-20357 der _ der CM[58] ANAPHORIC->58-8 +147-19 20358-20366 Aufseher _ Aufseher CM[58] ANAPHORIC->58-8 +147-20 20367-20370 und _ und _ _ +147-21 20371-20372 _ _ er _ _ +147-22 20373-20381 verschob _ verschob _ _ +147-23 20382-20387 dabei _ dabei _ _ +147-24 20388-20391 mit _ mit _ _ +147-25 20392-20398 beiden _ beiden _ _ +147-26 20399-20405 Händen _ Händen _ _ +147-27 20406-20409 die _ die CM[68] ANAPHORIC->68-1 +147-28 20410-20417 wenigen _ wenigen CM[68] ANAPHORIC->68-1 +147-29 20418-20429 Gegenstände _ Gegenstände CM[68] ANAPHORIC->68-1 +147-30 20430-20431 , _ , _ _ +147-31 20432-20435 die _ die CM[68] COREFERENTIAL->68-2 +147-32 20436-20439 auf _ auf _ _ +147-33 20440-20443 dem _ dem CM[66] COREFERENTIAL->66-3 +147-34 20444-20458 Nachttischchen _ Nachttischchen CM[66] COREFERENTIAL->66-3 +147-35 20459-20464 lagen _ lagen _ _ +147-36 20465-20466 , _ , _ _ +147-37 20467-20470 die _ die CM[68]|CM[69] ANAPHORIC->68-3|COREFERENTIAL->69-1 +147-38 20471-20476 Kerze _ Kerze CM[68]|CM[69] ANAPHORIC->68-3|COREFERENTIAL->69-1 +147-39 20477-20480 mit _ mit CM[68] ANAPHORIC->68-3 +147-40 20481-20493 Zündhölzchen _ Zündhölzchen CM[68]|CM[70]|CM[72] ANAPHORIC->68-3|ANAPHORIC->70-1|COREFERENTIAL->72-1 +147-41 20494-20495 , _ , CM[68]|CM[70] ANAPHORIC->68-3|ANAPHORIC->70-1 +147-42 20496-20499 ein _ ein CM[68]|CM[70] ANAPHORIC->68-3|ANAPHORIC->70-1 +147-43 20500-20504 Buch _ Buch CM[68]|CM[70] ANAPHORIC->68-3|ANAPHORIC->70-1 +147-44 20505-20508 und _ und CM[68]|CM[70] ANAPHORIC->68-3|ANAPHORIC->70-1 +147-45 20509-20512 ein _ ein CM[68]|CM[70] ANAPHORIC->68-3|ANAPHORIC->70-1 +147-46 20513-20524 Nadelkissen _ Nadelkissen CM[68]|CM[70] ANAPHORIC->68-3|ANAPHORIC->70-1 +147-47 20525-20526 , _ , _ _ +147-48 20527-20530 als _ als _ _ +147-49 20531-20536 seien _ seien _ _ +147-50 20537-20539 es _ es CM[68] *->68-4 +147-51 20540-20551 Gegenstände _ Gegenstände CM[67] ANAPHORIC->67-1 +147-52 20552-20553 , _ , _ _ +147-53 20554-20557 die _ die CM[67] *->67-2 +147-54 20558-20560 er _ er CM[58] COREFERENTIAL->58-9 +147-55 20561-20564 zur _ zur _ _ +147-56 20565-20576 Verhandlung _ Verhandlung _ _ +147-57 20577-20585 benötige _ benötige _ _ +147-58 20586-20587 . _ . _ _ + +#Text=_ " Gewiß " , sagte K. , und das Wohlgefühl , endlich einem vernünftigen Menschen gegenüberzustehen und über seine Angelegenheit mit ihm sprechen zu können , ergriff ihn . +148-1 20588-20589 _ _ _ _ _ +148-2 20590-20591 " _ " _ _ +148-3 20592-20597 Gewiß _ Gewiss _ _ +148-4 20598-20599 " _ " _ _ +148-5 20600-20601 , _ , _ _ +148-6 20602-20607 sagte _ sagte _ _ +148-7 20608-20610 K. PER K. CM[71] ANAPHORIC->71-29 +148-8 20611-20612 , _ , _ _ +148-9 20613-20616 und _ und _ _ +148-10 20617-20620 das _ das _ _ +148-11 20621-20631 Wohlgefühl _ Wohlgefühl _ _ +148-12 20632-20633 , _ , _ _ +148-13 20634-20641 endlich _ endlich _ _ +148-14 20642-20647 einem _ einem _ _ +148-15 20648-20660 vernünftigen _ vernünftigen _ _ +148-16 20661-20669 Menschen _ Menschen _ _ +148-17 20670-20687 gegenüberzustehen _ gegenüberzustehen _ _ +148-18 20688-20691 und _ und _ _ +148-19 20692-20696 über _ über _ _ +148-20 20697-20702 seine _ seine CM[71] ANAPHORIC->71-30 +148-21 20703-20716 Angelegenheit _ Angelegenheit _ _ +148-22 20717-20720 mit _ mit _ _ +148-23 20721-20724 ihm _ ihm CM[71] ANAPHORIC->71-31 +148-24 20725-20733 sprechen _ sprechen _ _ +148-25 20734-20736 zu _ zu _ _ +148-26 20737-20743 können _ können _ _ +148-27 20744-20745 , _ , _ _ +148-28 20746-20753 ergriff _ ergriff _ _ +148-29 20754-20757 ihn _ ihn CM[71] COREFERENTIAL->71-32 +148-30 20758-20759 . _ . _ _ + +#Text=_ " Gewiß , ich bin überrascht , aber ich bin keineswegs sehr überrascht . " +149-1 20760-20761 _ _ _ _ _ +149-2 20762-20763 " _ " _ _ +149-3 20764-20769 Gewiß _ Gewiss _ _ +149-4 20770-20771 , _ , _ _ +149-5 20772-20775 ich _ ich _ _ +149-6 20776-20779 bin _ bin _ _ +149-7 20780-20790 überrascht _ überrascht _ _ +149-8 20791-20792 , _ , _ _ +149-9 20793-20797 aber _ aber _ _ +149-10 20798-20801 ich _ ich _ _ +149-11 20802-20805 bin _ bin _ _ +149-12 20806-20816 keineswegs _ keineswegs _ _ +149-13 20817-20821 sehr _ sehr _ _ +149-14 20822-20832 überrascht _ überrascht _ _ +149-15 20833-20834 . _ . _ _ +149-16 20835-20836 " _ " _ _ + +#Text=_ " _ _ Nicht sehr überrascht ? " _ fragte der Aufseher und _ stellte nun die Kerze in die Mitte des Tischchens , während er die anderen Sachen um sie gruppierte . +150-1 20837-20838 _ _ _ _ _ +150-2 20839-20840 " _ " _ _ +150-3 20841-20842 _ _ Sie _ _ +150-4 20843-20844 _ _ sind _ _ +150-5 20845-20850 Nicht _ nicht _ _ +150-6 20851-20855 sehr _ sehr _ _ +150-7 20856-20866 überrascht _ überrascht _ _ +150-8 20867-20868 ? _ ? _ _ +150-9 20869-20870 " _ " _ _ +150-10 20871-20872 _ _ , _ _ +150-11 20873-20879 fragte _ fragte _ _ +150-12 20880-20883 der _ der CM[58] COREFERENTIAL->58-10 +150-13 20884-20892 Aufseher _ Aufseher CM[58] COREFERENTIAL->58-10 +150-14 20893-20896 und _ und _ _ +150-15 20897-20898 _ _ er _ _ +150-16 20899-20906 stellte _ stellte _ _ +150-17 20907-20910 nun _ nun _ _ +150-18 20911-20914 die _ die CM[69] ANAPHORIC->69-2 +150-19 20915-20920 Kerze _ Kerze CM[69] ANAPHORIC->69-2 +150-20 20921-20923 in _ in _ _ +150-21 20924-20927 die _ die _ _ +150-22 20928-20933 Mitte _ Mitte _ _ +150-23 20934-20937 des _ des GROUP[66] COREFERENTIAL->66-4 +150-24 20938-20948 Tischchens _ Tischchens GROUP[66] COREFERENTIAL->66-4 +150-25 20949-20950 , _ , _ _ +150-26 20951-20958 während _ während _ _ +150-27 20959-20961 er _ er _ _ +150-28 20962-20965 die _ die GROUP[70] *->70-2 +150-29 20966-20973 anderen _ anderen GROUP[70] *->70-2 +150-30 20974-20980 Sachen _ Sachen GROUP[70] *->70-2 +150-31 20981-20983 um _ um _ _ +150-32 20984-20987 sie _ sie CM[69] *->69-3 +150-33 20988-20998 gruppierte _ gruppierte _ _ +150-34 20999-21000 . _ . _ _ + +#Text=_ " Sie mißverstehen mich vielleicht " , beeilte sich K. zu bemerken . +151-1 21001-21002 _ _ _ _ _ +151-2 21003-21004 " _ " _ _ +151-3 21005-21008 Sie _ Sie _ _ +151-4 21009-21021 mißverstehen _ missverstehen _ _ +151-5 21022-21026 mich _ mich _ _ +151-6 21027-21037 vielleicht _ vielleicht _ _ +151-7 21038-21039 " _ " _ _ +151-8 21040-21041 , _ , _ _ +151-9 21042-21049 beeilte _ beeilte _ _ +151-10 21050-21054 sich _ sich _ _ +151-11 21055-21057 K. PER K. CM[71] COREFERENTIAL->71-33 +151-12 21058-21060 zu _ zu _ _ +151-13 21061-21069 bemerken _ bemerken _ _ +151-14 21070-21071 . _ . _ _ + +#Text=_ " Ich meine " – hier unterbrach sich K. und _ sah sich nach einem Sessel um . +152-1 21072-21073 _ _ _ _ _ +152-2 21074-21075 " _ " _ _ +152-3 21076-21079 Ich _ Ich _ _ +152-4 21080-21085 meine _ meine _ _ +152-5 21086-21087 " _ " _ _ +152-6 21088-21089 – _ – _ _ +152-7 21090-21094 hier _ hier _ _ +152-8 21095-21105 unterbrach _ unterbrach _ _ +152-9 21106-21110 sich _ sich CM[71] cataphoric->71-34 +152-10 21111-21113 K. PER K. CM[71] ANAPHORIC->71-35 +152-11 21114-21117 und _ und _ _ +152-12 21118-21119 _ _ er _ _ +152-13 21120-21123 sah _ sah _ _ +152-14 21124-21128 sich _ sich CM[71] ANAPHORIC->71-36 +152-15 21129-21133 nach _ nach _ _ +152-16 21134-21139 einem _ einem _ _ +152-17 21140-21146 Sessel _ Sessel _ _ +152-18 21147-21149 um _ um _ _ +152-19 21150-21151 . _ . _ _ + +#Text=_ " Ich kann mich doch setzen ? " fragte er . +153-1 21152-21153 _ _ _ _ _ +153-2 21154-21155 " _ " _ _ +153-3 21156-21159 Ich _ Ich _ _ +153-4 21160-21164 kann _ kann _ _ +153-5 21165-21169 mich _ mich _ _ +153-6 21170-21174 doch _ doch _ _ +153-7 21175-21181 setzen _ setzen _ _ +153-8 21182-21183 ? _ ? _ _ +153-9 21184-21185 " _ " _ _ +153-10 21186-21192 fragte _ fragte _ _ +153-11 21193-21195 er _ er CM[71] COREFERENTIAL->71-37 +153-12 21196-21197 . _ . _ _ + +#Text=_ " Es ist nicht üblich " , antwortete der Aufseher . +154-1 21198-21199 _ _ _ _ _ +154-2 21200-21201 " _ " _ _ +154-3 21202-21204 Es _ Es _ _ +154-4 21205-21208 ist _ ist _ _ +154-5 21209-21214 nicht _ nicht _ _ +154-6 21215-21221 üblich _ üblich _ _ +154-7 21222-21223 " _ " _ _ +154-8 21224-21225 , _ , _ _ +154-9 21226-21236 antwortete _ antwortete _ _ +154-10 21237-21240 der _ der CM[58] COREFERENTIAL->58-11 +154-11 21241-21249 Aufseher _ Aufseher CM[58] COREFERENTIAL->58-11 +154-12 21250-21251 . _ . _ _ + +#Text=_ " Ich meine " , sagte nun K. ohne weitere Pause , " ich bin allerdings sehr überrascht , aber man ist , wenn man dreißig Jahre auf der Welt ist und _ _ _ sich allein hat durchschlagen müssen , wie es mir beschieden war , gegen Überraschungen abgehärtet und _ nimmt sie nicht zu schwer . +155-1 21252-21253 _ _ _ _ _ +155-2 21254-21255 " _ " _ _ +155-3 21256-21259 Ich _ Ich _ _ +155-4 21260-21265 meine _ meine _ _ +155-5 21266-21267 " _ " _ _ +155-6 21268-21269 , _ , _ _ +155-7 21270-21275 sagte _ sagte _ _ +155-8 21276-21279 nun _ nun _ _ +155-9 21280-21282 K. PER K. CM[71] COREFERENTIAL->71-38 +155-10 21283-21287 ohne _ ohne _ _ +155-11 21288-21295 weitere _ weitere _ _ +155-12 21296-21301 Pause _ Pause _ _ +155-13 21302-21303 , _ , _ _ +155-14 21304-21305 " _ " _ _ +155-15 21306-21309 ich _ ich _ _ +155-16 21310-21313 bin _ bin _ _ +155-17 21314-21324 allerdings _ allerdings _ _ +155-18 21325-21329 sehr _ sehr _ _ +155-19 21330-21340 überrascht _ überrascht _ _ +155-20 21341-21342 , _ , _ _ +155-21 21343-21347 aber _ aber _ _ +155-22 21348-21351 man _ man _ _ +155-23 21352-21355 ist _ ist _ _ +155-24 21356-21357 , _ , _ _ +155-25 21358-21362 wenn _ wenn _ _ +155-26 21363-21366 man _ man _ _ +155-27 21367-21374 dreißig _ dreißig _ _ +155-28 21375-21380 Jahre _ Jahre _ _ +155-29 21381-21384 auf _ auf _ _ +155-30 21385-21388 der _ der _ _ +155-31 21389-21393 Welt _ Welt _ _ +155-32 21394-21397 ist _ ist _ _ +155-33 21398-21401 und _ und _ _ +155-34 21402-21403 _ _ wenn _ _ +155-35 21404-21405 _ _ man _ _ +155-36 21406-21407 _ _ man _ _ +155-37 21408-21412 sich _ sich _ _ +155-38 21413-21419 allein _ allein _ _ +155-39 21420-21423 hat _ hat _ _ +155-40 21424-21437 durchschlagen _ durchschlagen _ _ +155-41 21438-21444 müssen _ müssen _ _ +155-42 21445-21446 , _ , _ _ +155-43 21447-21450 wie _ wie _ _ +155-44 21451-21453 es _ es _ _ +155-45 21454-21457 mir _ mir _ _ +155-46 21458-21468 beschieden _ beschieden _ _ +155-47 21469-21472 war _ war _ _ +155-48 21473-21474 , _ , _ _ +155-49 21475-21480 gegen _ gegen _ _ +155-50 21481-21495 Überraschungen _ Überraschungen _ _ +155-51 21496-21506 abgehärtet _ abgehärtet _ _ +155-52 21507-21510 und _ und _ _ +155-53 21511-21512 _ _ man _ _ +155-54 21513-21518 nimmt _ nimmt _ _ +155-55 21519-21522 sie _ sie _ _ +155-56 21523-21528 nicht _ nicht _ _ +155-57 21529-21531 zu _ zu _ _ +155-58 21532-21538 schwer _ schwer _ _ +155-59 21539-21540 . _ . _ _ + +#Text=_ Besonders die heutige _ : nicht . " +156-1 21541-21542 _ _ _ _ _ +156-2 21543-21552 Besonders _ Besonders _ _ +156-3 21553-21556 die _ die _ _ +156-4 21557-21564 heutige _ heutige _ _ +156-5 21565-21566 _ _ VERBt _ _ +156-6 21567-21568 : _ man _ _ +156-7 21569-21574 nicht _ nicht _ _ +156-8 21575-21576 . _ . _ _ +156-9 21577-21578 " _ " _ _ + +#Text=_ " Warum _ _ besonders die heutige nicht ? " +157-1 21579-21580 _ _ _ _ _ +157-2 21581-21582 " _ " _ _ +157-3 21583-21588 Warum _ Warum _ _ +157-4 21589-21590 _ _ VERBt _ _ +157-5 21591-21592 _ _ man _ _ +157-6 21593-21602 besonders _ besonders _ _ +157-7 21603-21606 die _ die _ _ +157-8 21607-21614 heutige _ heutige _ _ +157-9 21615-21620 nicht _ nicht _ _ +157-10 21621-21622 ? _ ? _ _ +157-11 21623-21624 " _ " _ _ + +#Text=_ " Ich will nicht sagen , daß ich das Ganze für einen Spaß ansehe , dafür scheinen mir die Veranstaltungen , die gemacht wurden , doch zu umfangreich . +158-1 21625-21626 _ _ _ _ _ +158-2 21627-21628 " _ " _ _ +158-3 21629-21632 Ich _ Ich _ _ +158-4 21633-21637 will _ will _ _ +158-5 21638-21643 nicht _ nicht _ _ +158-6 21644-21649 sagen _ sagen _ _ +158-7 21650-21651 , _ , _ _ +158-8 21652-21655 daß _ dass _ _ +158-9 21656-21659 ich _ ich _ _ +158-10 21660-21663 das _ das _ _ +158-11 21664-21669 Ganze _ Ganze _ _ +158-12 21670-21673 für _ für _ _ +158-13 21674-21679 einen _ einen _ _ +158-14 21680-21684 Spaß _ Spaß _ _ +158-15 21685-21691 ansehe _ ansehe _ _ +158-16 21692-21693 , _ , _ _ +158-17 21694-21699 dafür _ dafür _ _ +158-18 21700-21708 scheinen _ scheinen _ _ +158-19 21709-21712 mir _ mir _ _ +158-20 21713-21716 die _ die _ _ +158-21 21717-21732 Veranstaltungen _ Veranstaltungen _ _ +158-22 21733-21734 , _ , _ _ +158-23 21735-21738 die _ die _ _ +158-24 21739-21746 gemacht _ gemacht _ _ +158-25 21747-21753 wurden _ wurden _ _ +158-26 21754-21755 , _ , _ _ +158-27 21756-21760 doch _ doch _ _ +158-28 21761-21763 zu _ zu _ _ +158-29 21764-21775 umfangreich _ umfangreich _ _ +158-30 21776-21777 . _ . _ _ + +#Text=_ Es müßten alle Mitglieder der Pension daran beteiligt sein und auch Sie alle , das ginge über die Grenzen eines Spaßes . +159-1 21778-21779 _ _ _ _ _ +159-2 21780-21782 Es _ Es _ _ +159-3 21783-21789 müßten _ müssten _ _ +159-4 21790-21794 alle _ alle _ _ +159-5 21795-21805 Mitglieder _ Mitglieder _ _ +159-6 21806-21809 der _ der _ _ +159-7 21810-21817 Pension _ Pension _ _ +159-8 21818-21823 daran _ daran _ _ +159-9 21824-21833 beteiligt _ beteiligt _ _ +159-10 21834-21838 sein _ sein _ _ +159-11 21839-21842 und _ und _ _ +159-12 21843-21847 auch _ auch _ _ +159-13 21848-21851 Sie _ Sie _ _ +159-14 21852-21856 alle _ alle _ _ +159-15 21857-21858 , _ , _ _ +159-16 21859-21862 das _ das _ _ +159-17 21863-21868 ginge _ ginge _ _ +159-18 21869-21873 über _ über _ _ +159-19 21874-21877 die _ die _ _ +159-20 21878-21885 Grenzen _ Grenzen _ _ +159-21 21886-21891 eines _ eines _ _ +159-22 21892-21898 Spaßes _ Spaßes _ _ +159-23 21899-21900 . _ . _ _ + +#Text=_ Ich will also nicht sagen , daß es ein Spaß ist . " +160-1 21901-21902 _ _ _ _ _ +160-2 21903-21906 Ich _ Ich _ _ +160-3 21907-21911 will _ will _ _ +160-4 21912-21916 also _ also _ _ +160-5 21917-21922 nicht _ nicht _ _ +160-6 21923-21928 sagen _ sagen _ _ +160-7 21929-21930 , _ , _ _ +160-8 21931-21934 daß _ dass _ _ +160-9 21935-21937 es _ es _ _ +160-10 21938-21941 ein _ ein _ _ +160-11 21942-21946 Spaß _ Spaß _ _ +160-12 21947-21950 ist _ ist _ _ +160-13 21951-21952 . _ . _ _ +160-14 21953-21954 " _ " _ _ + +#Text=_ " _ _ Ganz richtig " , sagte der Aufseher und _ sah nach , wie| viel Zündhölzchen in der Zündhölzchenschachtel waren . +161-1 21955-21956 _ _ _ _ _ +161-2 21957-21958 " _ " _ _ +161-3 21959-21960 _ _ Das _ _ +161-4 21961-21962 _ _ ist _ _ +161-5 21963-21967 Ganz _ ganz _ _ +161-6 21968-21975 richtig _ richtig _ _ +161-7 21976-21977 " _ " _ _ +161-8 21978-21979 , _ , _ _ +161-9 21980-21985 sagte _ sagte _ _ +161-10 21986-21989 der _ der CM[58] COREFERENTIAL->58-12 +161-11 21990-21998 Aufseher _ Aufseher CM[58] COREFERENTIAL->58-12 +161-12 21999-22002 und _ und _ _ +161-13 22003-22004 _ _ er _ _ +161-14 22005-22008 sah _ sah _ _ +161-15 22009-22013 nach _ nach _ _ +161-16 22014-22015 , _ , _ _ +161-17 22016-22020 wie| _ wie _ _ +161-18 22021-22025 viel _ viel _ _ +161-19 22026-22038 Zündhölzchen _ Zündhölzchen _ _ +161-20 22039-22041 in _ in _ _ +161-21 22042-22045 der _ der CM[72] COREFERENTIAL->72-2 +161-22 22046-22067 Zündhölzchenschachtel _ Zündhölzchenschachtel CM[72] COREFERENTIAL->72-2 +161-23 22068-22073 waren _ waren _ _ +161-24 22074-22075 . _ . _ _ + +#Text=_ " Andererseits aber " , fuhr K. fort und _ wandte sich hierbei an alle und _ hätte gern sogar die Drei bei den Photographien sich zugewendet , " andererseits aber kann die Sache auch nicht viel Wichtigkeit haben . +162-1 22076-22077 _ _ _ _ _ +162-2 22078-22079 " _ " _ _ +162-3 22080-22092 Andererseits _ Andererseits _ _ +162-4 22093-22097 aber _ aber _ _ +162-5 22098-22099 " _ " _ _ +162-6 22100-22101 , _ , _ _ +162-7 22102-22106 fuhr _ fuhr _ _ +162-8 22107-22109 K. PER K. CM[71] ANAPHORIC->71-39 +162-9 22110-22114 fort _ fort _ _ +162-10 22115-22118 und _ und _ _ +162-11 22119-22120 _ _ er _ _ +162-12 22121-22127 wandte _ wandte _ _ +162-13 22128-22132 sich _ sich _ _ +162-14 22133-22140 hierbei _ hierbei _ _ +162-15 22141-22143 an _ an _ _ +162-16 22144-22148 alle _ alle _ _ +162-17 22149-22152 und _ und _ _ +162-18 22153-22154 _ _ er _ _ +162-19 22155-22160 hätte _ hätte _ _ +162-20 22161-22165 gern _ gern _ _ +162-21 22166-22171 sogar _ sogar _ _ +162-22 22172-22175 die _ die CM[78] COREFERENTIAL->78-2 +162-23 22176-22180 Drei _ drei CM[78] COREFERENTIAL->78-2 +162-24 22181-22184 bei _ bei CM[78] COREFERENTIAL->78-2 +162-25 22185-22188 den _ den CM[78] COREFERENTIAL->78-2 +162-26 22189-22202 Photographien _ Photographien CM[78] COREFERENTIAL->78-2 +162-27 22203-22207 sich _ sich _ _ +162-28 22208-22218 zugewendet _ zugewendet _ _ +162-29 22219-22220 , _ , _ _ +162-30 22221-22222 " _ " _ _ +162-31 22223-22235 andererseits _ andererseits _ _ +162-32 22236-22240 aber _ aber _ _ +162-33 22241-22245 kann _ kann _ _ +162-34 22246-22249 die _ die _ _ +162-35 22250-22255 Sache _ Sache _ _ +162-36 22256-22260 auch _ auch _ _ +162-37 22261-22266 nicht _ nicht _ _ +162-38 22267-22271 viel _ viel _ _ +162-39 22272-22283 Wichtigkeit _ Wichtigkeit _ _ +162-40 22284-22289 haben _ haben _ _ +162-41 22290-22291 . _ . _ _ + +#Text=_ Ich folgere das daraus , daß ich angeklagt bin , aber _ _ nicht die geringste Schuld auffinden kann , wegen deren man mich anklagen könnte . +163-1 22292-22293 _ _ _ _ _ +163-2 22294-22297 Ich _ Ich _ _ +163-3 22298-22305 folgere _ folgere _ _ +163-4 22306-22309 das _ das _ _ +163-5 22310-22316 daraus _ daraus _ _ +163-6 22317-22318 , _ , _ _ +163-7 22319-22322 daß _ dass _ _ +163-8 22323-22326 ich _ ich _ _ +163-9 22327-22336 angeklagt _ angeklagt _ _ +163-10 22337-22340 bin _ bin _ _ +163-11 22341-22342 , _ , _ _ +163-12 22343-22347 aber _ aber _ _ +163-13 22348-22349 _ _ dass _ _ +163-14 22350-22351 _ _ ich _ _ +163-15 22352-22357 nicht _ nicht _ _ +163-16 22358-22361 die _ die CM[73] ANAPHORIC->73-1 +163-17 22362-22371 geringste _ geringste CM[73] ANAPHORIC->73-1 +163-18 22372-22378 Schuld _ Schuld CM[73] ANAPHORIC->73-1 +163-19 22379-22388 auffinden _ auffinden _ _ +163-20 22389-22393 kann _ kann _ _ +163-21 22394-22395 , _ , _ _ +163-22 22396-22401 wegen _ wegen _ _ +163-23 22402-22407 deren _ derer CM[73] *->73-2 +163-24 22408-22411 man _ man _ _ +163-25 22412-22416 mich _ mich _ _ +163-26 22417-22425 anklagen _ anklagen _ _ +163-27 22426-22432 könnte _ könnte _ _ +163-28 22433-22434 . _ . _ _ + +#Text=_ Aber auch das ist nebensächlich , die Hauptfrage ist , von wem bin ich angeklagt ? +164-1 22435-22436 _ _ _ _ _ +164-2 22437-22441 Aber _ Aber _ _ +164-3 22442-22446 auch _ auch _ _ +164-4 22447-22450 das _ das _ _ +164-5 22451-22454 ist _ ist _ _ +164-6 22455-22468 nebensächlich _ nebensächlich _ _ +164-7 22469-22470 , _ , _ _ +164-8 22471-22474 die _ die _ _ +164-9 22475-22485 Hauptfrage _ Hauptfrage _ _ +164-10 22486-22489 ist _ ist _ _ +164-11 22490-22491 , _ , _ _ +164-12 22492-22495 von _ von _ _ +164-13 22496-22499 wem _ wem _ _ +164-14 22500-22503 bin _ bin _ _ +164-15 22504-22507 ich _ ich _ _ +164-16 22508-22517 angeklagt _ angeklagt _ _ +164-17 22518-22519 ? _ ? _ _ + +#Text=_ Welche Behörde führt das Verfahren ? +165-1 22520-22521 _ _ _ _ _ +165-2 22522-22528 Welche _ Welche _ _ +165-3 22529-22536 Behörde _ Behörde _ _ +165-4 22537-22542 führt _ führt _ _ +165-5 22543-22546 das _ das _ _ +165-6 22547-22556 Verfahren _ Verfahren _ _ +165-7 22557-22558 ? _ ? _ _ + +#Text=_ Sind Sie Beamte ? +166-1 22559-22560 _ _ _ _ _ +166-2 22561-22565 Sind _ Sind _ _ +166-3 22566-22569 Sie _ Sie _ _ +166-4 22570-22576 Beamte _ Beamte _ _ +166-5 22577-22578 ? _ ? _ _ + +#Text=_ Keiner hat eine Uniform , wenn man nicht Ihr Kleid " – hier wandte er sich an Franz – " eine Uniform nennen will , aber es ist doch eher ein Reiseanzug . +167-1 22579-22580 _ _ _ _ _ +167-2 22581-22587 Keiner _ Keiner _ _ +167-3 22588-22591 hat _ hat _ _ +167-4 22592-22596 eine _ eine _ _ +167-5 22597-22604 Uniform _ Uniform _ _ +167-6 22605-22606 , _ , _ _ +167-7 22607-22611 wenn _ wenn _ _ +167-8 22612-22615 man _ man _ _ +167-9 22616-22621 nicht _ nicht _ _ +167-10 22622-22625 Ihr _ Ihr CM[74] ANAPHORIC->74-1 +167-11 22626-22631 Kleid _ Kleid CM[74] ANAPHORIC->74-1 +167-12 22632-22633 " _ " _ _ +167-13 22634-22635 – _ – _ _ +167-14 22636-22640 hier _ hier _ _ +167-15 22641-22647 wandte _ wandte _ _ +167-16 22648-22650 er _ er CM[71] COREFERENTIAL->71-40 +167-17 22651-22655 sich _ sich _ _ +167-18 22656-22658 an _ an _ _ +167-19 22659-22664 Franz PER Franz CM[14] *->14-15 +167-20 22665-22666 – _ – _ _ +167-21 22667-22668 " _ " _ _ +167-22 22669-22673 eine _ eine _ _ +167-23 22674-22681 Uniform _ Uniform _ _ +167-24 22682-22688 nennen _ nennen _ _ +167-25 22689-22693 will _ will _ _ +167-26 22694-22695 , _ , _ _ +167-27 22696-22700 aber _ aber _ _ +167-28 22701-22703 es _ es CM[74] *->74-2 +167-29 22704-22707 ist _ ist _ _ +167-30 22708-22712 doch _ doch _ _ +167-31 22713-22717 eher _ eher _ _ +167-32 22718-22721 ein _ ein _ _ +167-33 22722-22732 Reiseanzug _ Reiseanzug _ _ +167-34 22733-22734 . _ . _ _ + +#Text=_ In diesen Fragen verlange ich Klarheit , und ich bin überzeugt , daß wir nach dieser Klarstellung voneinander den herzlichsten Abschied werden nehmen können . " +168-1 22735-22736 _ _ _ _ _ +168-2 22737-22739 In _ In _ _ +168-3 22740-22746 diesen _ diesen _ _ +168-4 22747-22753 Fragen _ Fragen _ _ +168-5 22754-22762 verlange _ verlange _ _ +168-6 22763-22766 ich _ ich _ _ +168-7 22767-22775 Klarheit _ Klarheit _ _ +168-8 22776-22777 , _ , _ _ +168-9 22778-22781 und _ und _ _ +168-10 22782-22785 ich _ ich _ _ +168-11 22786-22789 bin _ bin _ _ +168-12 22790-22799 überzeugt _ überzeugt _ _ +168-13 22800-22801 , _ , _ _ +168-14 22802-22805 daß _ dass _ _ +168-15 22806-22809 wir _ wir _ _ +168-16 22810-22814 nach _ nach _ _ +168-17 22815-22821 dieser _ dieser _ _ +168-18 22822-22834 Klarstellung _ Klarstellung _ _ +168-19 22835-22846 voneinander _ voneinander _ _ +168-20 22847-22850 den _ den _ _ +168-21 22851-22863 herzlichsten _ herzlichsten _ _ +168-22 22864-22872 Abschied _ Abschied _ _ +168-23 22873-22879 werden _ werden _ _ +168-24 22880-22886 nehmen _ nehmen _ _ +168-25 22887-22893 können _ können _ _ +168-26 22894-22895 . _ . _ _ +168-27 22896-22897 " _ " _ _ + +#Text=_ Der Aufseher schlug die Zündhölzchenschachtel auf den Tisch nieder . +169-1 22898-22899 _ _ _ _ _ +169-2 22900-22903 Der _ Der CM[58] ANAPHORIC->58-13 +169-3 22904-22912 Aufseher _ Aufseher CM[58] ANAPHORIC->58-13 +169-4 22913-22919 schlug _ schlug _ _ +169-5 22920-22923 die _ die CM[72] *->72-3 +169-6 22924-22945 Zündhölzchenschachtel _ Zündhölzchenschachtel CM[72] *->72-3 +169-7 22946-22949 auf _ auf _ _ +169-8 22950-22953 den _ den CM[66] COREFERENTIAL->66-5 +169-9 22954-22959 Tisch _ Tisch CM[66] COREFERENTIAL->66-5 +169-10 22960-22966 nieder _ nieder _ _ +169-11 22967-22968 . _ . _ _ + +#Text=_ " Sie befinden sich in einem großen Irrtum " , sagte er . +170-1 22969-22970 _ _ _ _ _ +170-2 22971-22972 " _ " _ _ +170-3 22973-22976 Sie _ Sie _ _ +170-4 22977-22985 befinden _ befinden _ _ +170-5 22986-22990 sich _ sich _ _ +170-6 22991-22993 in _ in _ _ +170-7 22994-22999 einem _ einem _ _ +170-8 23000-23006 großen _ großen _ _ +170-9 23007-23013 Irrtum _ Irrtum _ _ +170-10 23014-23015 " _ " _ _ +170-11 23016-23017 , _ , _ _ +170-12 23018-23023 sagte _ sagte _ _ +170-13 23024-23026 er _ er CM[58] COREFERENTIAL->58-14 +170-14 23027-23028 . _ . _ _ + +#Text=_ " Diese Herren hier und ich sind für Ihre Angelegenheit vollständig nebensächlich , ja wir wissen sogar von ihr fast nichts . +171-1 23029-23030 _ _ _ _ _ +171-2 23031-23032 " _ " _ _ +171-3 23033-23038 Diese _ Diese _ _ +171-4 23039-23045 Herren _ Herren _ _ +171-5 23046-23050 hier _ hier _ _ +171-6 23051-23054 und _ und _ _ +171-7 23055-23058 ich _ ich _ _ +171-8 23059-23063 sind _ sind _ _ +171-9 23064-23067 für _ für _ _ +171-10 23068-23072 Ihre _ Ihre _ _ +171-11 23073-23086 Angelegenheit _ Angelegenheit _ _ +171-12 23087-23098 vollständig _ vollständig _ _ +171-13 23099-23112 nebensächlich _ nebensächlich _ _ +171-14 23113-23114 , _ , _ _ +171-15 23115-23117 ja _ ja _ _ +171-16 23118-23121 wir _ wir _ _ +171-17 23122-23128 wissen _ wissen _ _ +171-18 23129-23134 sogar _ sogar _ _ +171-19 23135-23138 von _ von _ _ +171-20 23139-23142 ihr _ ihr _ _ +171-21 23143-23147 fast _ fast _ _ +171-22 23148-23154 nichts _ nichts _ _ +171-23 23155-23156 . _ . _ _ + +#Text=_ Wir könnten die regelrechtesten Uniformen tragen , und Ihre Sache würde um nichts schlechter stehen . +172-1 23157-23158 _ _ _ _ _ +172-2 23159-23162 Wir _ Wir _ _ +172-3 23163-23170 könnten _ könnten _ _ +172-4 23171-23174 die _ die _ _ +172-5 23175-23190 regelrechtesten _ regelrechtesten _ _ +172-6 23191-23200 Uniformen _ Uniformen _ _ +172-7 23201-23207 tragen _ tragen _ _ +172-8 23208-23209 , _ , _ _ +172-9 23210-23213 und _ und _ _ +172-10 23214-23218 Ihre _ Ihre _ _ +172-11 23219-23224 Sache _ Sache _ _ +172-12 23225-23230 würde _ würde _ _ +172-13 23231-23233 um _ um _ _ +172-14 23234-23240 nichts _ nichts _ _ +172-15 23241-23251 schlechter _ schlechter _ _ +172-16 23252-23258 stehen _ stehen _ _ +172-17 23259-23260 . _ . _ _ + +#Text=_ Ich kann Ihnen auch durchaus nicht sagen , daß Sie angeklagt sind oder vielmehr , ich weiß nicht , ob Sie es sind . +173-1 23261-23262 _ _ _ _ _ +173-2 23263-23266 Ich _ Ich _ _ +173-3 23267-23271 kann _ kann _ _ +173-4 23272-23277 Ihnen _ Ihnen _ _ +173-5 23278-23282 auch _ auch _ _ +173-6 23283-23291 durchaus _ durchaus _ _ +173-7 23292-23297 nicht _ nicht _ _ +173-8 23298-23303 sagen _ sagen _ _ +173-9 23304-23305 , _ , _ _ +173-10 23306-23309 daß _ dass _ _ +173-11 23310-23313 Sie _ Sie _ _ +173-12 23314-23323 angeklagt _ angeklagt _ _ +173-13 23324-23328 sind _ sind _ _ +173-14 23329-23333 oder _ oder _ _ +173-15 23334-23342 vielmehr _ vielmehr _ _ +173-16 23343-23344 , _ , _ _ +173-17 23345-23348 ich _ ich _ _ +173-18 23349-23353 weiß _ weiß _ _ +173-19 23354-23359 nicht _ nicht _ _ +173-20 23360-23361 , _ , _ _ +173-21 23362-23364 ob _ ob _ _ +173-22 23365-23368 Sie _ Sie _ _ +173-23 23369-23371 es _ es _ _ +173-24 23372-23376 sind _ sind _ _ +173-25 23377-23378 . _ . _ _ + +#Text=_ Sie sind verhaftet , das ist richtig , mehr weiß ich nicht . +174-1 23379-23380 _ _ _ _ _ +174-2 23381-23384 Sie _ Sie _ _ +174-3 23385-23389 sind _ sind _ _ +174-4 23390-23399 verhaftet _ verhaftet _ _ +174-5 23400-23401 , _ , _ _ +174-6 23402-23405 das _ das _ _ +174-7 23406-23409 ist _ ist _ _ +174-8 23410-23417 richtig _ richtig _ _ +174-9 23418-23419 , _ , _ _ +174-10 23420-23424 mehr _ mehr _ _ +174-11 23425-23429 weiß _ weiß _ _ +174-12 23430-23433 ich _ ich _ _ +174-13 23434-23439 nicht _ nicht _ _ +174-14 23440-23441 . _ . _ _ + +#Text=_ Vielleicht haben die Wächter etwas anderes geschwätzt , dann ist es eben nur Geschwätz gewesen . +175-1 23442-23443 _ _ _ _ _ +175-2 23444-23454 Vielleicht _ Vielleicht _ _ +175-3 23455-23460 haben _ haben _ _ +175-4 23461-23464 die _ die CM[24] *->24-44 +175-5 23465-23472 Wächter _ Wächter CM[24] *->24-44 +175-6 23473-23478 etwas _ etwas _ _ +175-7 23479-23486 anderes _ anderes _ _ +175-8 23487-23497 geschwätzt _ geschwätzt _ _ +175-9 23498-23499 , _ , _ _ +175-10 23500-23504 dann _ dann _ _ +175-11 23505-23508 ist _ ist _ _ +175-12 23509-23511 es _ es _ _ +175-13 23512-23516 eben _ eben _ _ +175-14 23517-23520 nur _ nur _ _ +175-15 23521-23530 Geschwätz _ Geschwätz _ _ +175-16 23531-23538 gewesen _ gewesen _ _ +175-17 23539-23540 . _ . _ _ + +#Text=_ Wenn ich nun aber auch Ihre Fragen nicht beantworte , so kann ich Ihnen doch raten , denken Sie weniger an uns und an das , was mit Ihnen geschehen wird , denken Sie lieber mehr an sich . +176-1 23541-23542 _ _ _ _ _ +176-2 23543-23547 Wenn _ Wenn _ _ +176-3 23548-23551 ich _ ich _ _ +176-4 23552-23555 nun _ nun _ _ +176-5 23556-23560 aber _ aber _ _ +176-6 23561-23565 auch _ auch _ _ +176-7 23566-23570 Ihre _ Ihre _ _ +176-8 23571-23577 Fragen _ Fragen _ _ +176-9 23578-23583 nicht _ nicht _ _ +176-10 23584-23594 beantworte _ beantworte _ _ +176-11 23595-23596 , _ , _ _ +176-12 23597-23599 so _ so _ _ +176-13 23600-23604 kann _ kann _ _ +176-14 23605-23608 ich _ ich _ _ +176-15 23609-23614 Ihnen _ Ihnen _ _ +176-16 23615-23619 doch _ doch _ _ +176-17 23620-23625 raten _ raten _ _ +176-18 23626-23627 , _ , _ _ +176-19 23628-23634 denken _ denken _ _ +176-20 23635-23638 Sie _ Sie _ _ +176-21 23639-23646 weniger _ weniger _ _ +176-22 23647-23649 an _ an _ _ +176-23 23650-23653 uns _ uns _ _ +176-24 23654-23657 und _ und _ _ +176-25 23658-23660 an _ an _ _ +176-26 23661-23664 das _ das _ _ +176-27 23665-23666 , _ , _ _ +176-28 23667-23670 was _ was _ _ +176-29 23671-23674 mit _ mit _ _ +176-30 23675-23680 Ihnen _ Ihnen _ _ +176-31 23681-23690 geschehen _ geschehen _ _ +176-32 23691-23695 wird _ wird _ _ +176-33 23696-23697 , _ , _ _ +176-34 23698-23704 denken _ denken _ _ +176-35 23705-23708 Sie _ Sie _ _ +176-36 23709-23715 lieber _ lieber _ _ +176-37 23716-23720 mehr _ mehr _ _ +176-38 23721-23723 an _ an _ _ +176-39 23724-23728 sich _ sich _ _ +176-40 23729-23730 . _ . _ _ + +#Text=_ Und machen Sie keinen solchen Lärm mit dem Gefühl Ihrer Unschuld , es stört den nicht gerade schlechten Eindruck , den Sie im übrigen machen . +177-1 23731-23732 _ _ _ _ _ +177-2 23733-23736 Und _ Und _ _ +177-3 23737-23743 machen _ machen _ _ +177-4 23744-23747 Sie _ Sie _ _ +177-5 23748-23754 keinen _ keinen _ _ +177-6 23755-23762 solchen _ solchen _ _ +177-7 23763-23767 Lärm _ Lärm _ _ +177-8 23768-23771 mit _ mit _ _ +177-9 23772-23775 dem _ dem _ _ +177-10 23776-23782 Gefühl _ Gefühl _ _ +177-11 23783-23788 Ihrer _ Ihrer _ _ +177-12 23789-23797 Unschuld _ Unschuld _ _ +177-13 23798-23799 , _ , _ _ +177-14 23800-23802 es _ es _ _ +177-15 23803-23808 stört _ stört _ _ +177-16 23809-23812 den _ den CM[75] ANAPHORIC->75-1 +177-17 23813-23818 nicht _ nicht CM[75] ANAPHORIC->75-1 +177-18 23819-23825 gerade _ gerade CM[75] ANAPHORIC->75-1 +177-19 23826-23836 schlechten _ schlechten CM[75] ANAPHORIC->75-1 +177-20 23837-23845 Eindruck _ Eindruck CM[75] ANAPHORIC->75-1 +177-21 23846-23847 , _ , _ _ +177-22 23848-23851 den _ den CM[75] *->75-2 +177-23 23852-23855 Sie _ Sie _ _ +177-24 23856-23858 im _ im _ _ +177-25 23859-23866 übrigen _ übrigen _ _ +177-26 23867-23873 machen _ machen _ _ +177-27 23874-23875 . _ . _ _ + +#Text=_ Auch sollten Sie überhaupt im Reden zurückhaltender sein , fast alles , was Sie vorhin gesagt haben , hätte man auch , wenn Sie nur ein paar Worte gesagt hätten , Ihrem Verhalten entnehmen können , außerdem war es nichts für Sie übermäßig Günstiges . " +178-1 23876-23877 _ _ _ _ _ +178-2 23878-23882 Auch _ Auch _ _ +178-3 23883-23890 sollten _ sollten _ _ +178-4 23891-23894 Sie _ Sie _ _ +178-5 23895-23904 überhaupt _ überhaupt _ _ +178-6 23905-23907 im _ im _ _ +178-7 23908-23913 Reden _ Reden _ _ +178-8 23914-23929 zurückhaltender _ zurückhaltender _ _ +178-9 23930-23934 sein _ sein _ _ +178-10 23935-23936 , _ , _ _ +178-11 23937-23941 fast _ fast _ _ +178-12 23942-23947 alles _ alles CM[76] BOUND->76-1 +178-13 23948-23949 , _ , _ _ +178-14 23950-23953 was _ was CM[76] *->76-2 +178-15 23954-23957 Sie _ Sie _ _ +178-16 23958-23964 vorhin _ vorhin _ _ +178-17 23965-23971 gesagt _ gesagt _ _ +178-18 23972-23977 haben _ haben _ _ +178-19 23978-23979 , _ , _ _ +178-20 23980-23985 hätte _ hätte _ _ +178-21 23986-23989 man _ man _ _ +178-22 23990-23994 auch _ auch _ _ +178-23 23995-23996 , _ , _ _ +178-24 23997-24001 wenn _ wenn _ _ +178-25 24002-24005 Sie _ Sie _ _ +178-26 24006-24009 nur _ nur _ _ +178-27 24010-24013 ein _ ein _ _ +178-28 24014-24018 paar _ paar _ _ +178-29 24019-24024 Worte _ Worte _ _ +178-30 24025-24031 gesagt _ gesagt _ _ +178-31 24032-24038 hätten _ hätten _ _ +178-32 24039-24040 , _ , _ _ +178-33 24041-24046 Ihrem _ Ihrem _ _ +178-34 24047-24056 Verhalten _ Verhalten _ _ +178-35 24057-24066 entnehmen _ entnehmen _ _ +178-36 24067-24073 können _ können _ _ +178-37 24074-24075 , _ , _ _ +178-38 24076-24084 außerdem _ außerdem _ _ +178-39 24085-24088 war _ war _ _ +178-40 24089-24091 es _ es _ _ +178-41 24092-24098 nichts _ nichts _ _ +178-42 24099-24102 für _ für _ _ +178-43 24103-24106 Sie _ Sie _ _ +178-44 24107-24116 übermäßig _ übermäßig _ _ +178-45 24117-24126 Günstiges _ Günstiges _ _ +178-46 24127-24128 . _ . _ _ +178-47 24129-24130 " _ " _ _ + +#Text=_ K. starrte den Aufseher an . +179-1 24131-24132 _ _ _ _ _ +179-2 24133-24135 K. PER K. CM[71] ANAPHORIC->71-41 +179-3 24136-24143 starrte _ starrte _ _ +179-4 24144-24147 den _ den CM[58] COREFERENTIAL->58-15 +179-5 24148-24156 Aufseher _ Aufseher CM[58] COREFERENTIAL->58-15 +179-6 24157-24159 an _ an _ _ +179-7 24160-24161 . _ . _ _ + +#Text=_ Schulmäßige Lehren bekam er hier von einem vielleicht jüngeren Menschen ? +180-1 24162-24163 _ _ _ _ _ +180-2 24164-24175 Schulmäßige _ Schulmäßige _ _ +180-3 24176-24182 Lehren _ Lehren _ _ +180-4 24183-24188 bekam _ bekam _ _ +180-5 24189-24191 er _ er _ _ +180-6 24192-24196 hier _ hier _ _ +180-7 24197-24200 von _ von _ _ +180-8 24201-24206 einem _ einem _ _ +180-9 24207-24217 vielleicht _ vielleicht _ _ +180-10 24218-24226 jüngeren _ jüngeren _ _ +180-11 24227-24235 Menschen _ Menschen _ _ +180-12 24236-24237 ? _ ? _ _ + +#Text=_ Für seine Offenheit wurde er mit einer Rüge bestraft ? +181-1 24238-24239 _ _ _ _ _ +181-2 24240-24243 Für _ Für _ _ +181-3 24244-24249 seine _ seine CM[71] ANAPHORIC->71-42 +181-4 24250-24259 Offenheit _ Offenheit _ _ +181-5 24260-24265 wurde _ wurde _ _ +181-6 24266-24268 er _ er CM[71] ANAPHORIC->71-43 +181-7 24269-24272 mit _ mit _ _ +181-8 24273-24278 einer _ einer _ _ +181-9 24279-24283 Rüge _ Rüge _ _ +181-10 24284-24292 bestraft _ bestraft _ _ +181-11 24293-24294 ? _ ? _ _ + +#Text=_ Und über den Grund seiner Verhaftung und über deren Auftraggeber erfuhr er nichts ? +182-1 24295-24296 _ _ _ _ _ +182-2 24297-24300 Und _ Und _ _ +182-3 24301-24305 über _ über _ _ +182-4 24306-24309 den _ den _ _ +182-5 24310-24315 Grund _ Grund _ _ +182-6 24316-24322 seiner _ seiner CM[71]|CM[77] ANAPHORIC->71-44|ANAPHORIC->77-1 +182-7 24323-24333 Verhaftung _ Verhaftung CM[77] ANAPHORIC->77-1 +182-8 24334-24337 und _ und _ _ +182-9 24338-24342 über _ über _ _ +182-10 24343-24348 deren _ deren CM[77] *->77-2 +182-11 24349-24361 Auftraggeber _ Auftraggeber _ _ +182-12 24362-24368 erfuhr _ erfuhr _ _ +182-13 24369-24371 er _ er CM[71] ANAPHORIC->71-45 +182-14 24372-24378 nichts _ nichts _ _ +182-15 24379-24380 ? _ ? _ _ + +#Text=_ Er geriet in eine gewisse Aufregung , _ ging auf und ab , woran ihn niemand hinderte , _ schob seine Manschetten zurück , _ befühlte die Brust , _ strich sein Haar zurecht , _ kam an den drei Herren vorüber , _ sagte : " Es ist ja sinnlos " , worauf sich diese zu ihm umdrehten und _ ihn entgegenkommend , aber ernst ansahen und _ machte endlich wieder vor dem Tisch des Aufsehers halt . +183-1 24381-24382 _ _ _ _ _ +183-2 24383-24385 Er _ Er CM[71] ANAPHORIC->71-46 +183-3 24386-24392 geriet _ geriet _ _ +183-4 24393-24395 in _ in _ _ +183-5 24396-24400 eine _ eine _ _ +183-6 24401-24408 gewisse _ gewisse _ _ +183-7 24409-24418 Aufregung _ Aufregung _ _ +183-8 24419-24420 , _ , _ _ +183-9 24421-24422 _ _ er _ _ +183-10 24423-24427 ging _ ging _ _ +183-11 24428-24431 auf _ auf _ _ +183-12 24432-24435 und _ und _ _ +183-13 24436-24438 ab _ ab _ _ +183-14 24439-24440 , _ , _ _ +183-15 24441-24446 woran _ woran _ _ +183-16 24447-24450 ihn _ ihn CM[71] ANAPHORIC->71-47 +183-17 24451-24458 niemand _ niemand _ _ +183-18 24459-24467 hinderte _ hinderte _ _ +183-19 24468-24469 , _ , _ _ +183-20 24470-24471 _ _ er _ _ +183-21 24472-24477 schob _ schob _ _ +183-22 24478-24483 seine _ seine CM[71] ANAPHORIC->71-48 +183-23 24484-24495 Manschetten _ Manschetten _ _ +183-24 24496-24502 zurück _ zurück _ _ +183-25 24503-24504 , _ , _ _ +183-26 24505-24506 _ _ er _ _ +183-27 24507-24515 befühlte _ befühlte _ _ +183-28 24516-24519 die _ die _ _ +183-29 24520-24525 Brust _ Brust _ _ +183-30 24526-24527 , _ , _ _ +183-31 24528-24529 _ _ er _ _ +183-32 24530-24536 strich _ strich _ _ +183-33 24537-24541 sein _ sein CM[71] ANAPHORIC->71-49 +183-34 24542-24546 Haar _ Haar _ _ +183-35 24547-24554 zurecht _ zurecht _ _ +183-36 24555-24556 , _ , _ _ +183-37 24557-24558 _ _ er _ _ +183-38 24559-24562 kam _ kam _ _ +183-39 24563-24565 an _ an _ _ +183-40 24566-24569 den _ den CM[78] ANAPHORIC->78-3 +183-41 24570-24574 drei _ drei CM[78] ANAPHORIC->78-3 +183-42 24575-24581 Herren _ Herren CM[78] ANAPHORIC->78-3 +183-43 24582-24589 vorüber _ vorüber _ _ +183-44 24590-24591 , _ , _ _ +183-45 24592-24593 _ _ er _ _ +183-46 24594-24599 sagte _ sagte _ _ +183-47 24600-24601 : _ : _ _ +183-48 24602-24603 " _ " _ _ +183-49 24604-24606 Es _ Es _ _ +183-50 24607-24610 ist _ ist _ _ +183-51 24611-24613 ja _ ja _ _ +183-52 24614-24621 sinnlos _ sinnlos _ _ +183-53 24622-24623 " _ " _ _ +183-54 24624-24625 , _ , _ _ +183-55 24626-24632 worauf _ worauf _ _ +183-56 24633-24637 sich _ sich CM[78] CATAPHORIC->78-4 +183-57 24638-24643 diese _ diese CM[78] *->78-5 +183-58 24644-24646 zu _ zu _ _ +183-59 24647-24650 ihm _ ihm CM[71] ANAPHORIC->71-50 +183-60 24651-24660 umdrehten _ umdrehten _ _ +183-61 24661-24664 und _ und _ _ +183-62 24665-24666 _ _ sie _ _ +183-63 24667-24670 ihn _ ihn CM[71] ANAPHORIC->71-51 +183-64 24671-24686 entgegenkommend _ entgegenkommend _ _ +183-65 24687-24688 , _ , _ _ +183-66 24689-24693 aber _ aber _ _ +183-67 24694-24699 ernst _ ernst _ _ +183-68 24700-24707 ansahen _ ansahen _ _ +183-69 24708-24711 und _ und _ _ +183-70 24712-24713 _ _ er _ _ +183-71 24714-24720 machte _ machte _ _ +183-72 24721-24728 endlich _ endlich _ _ +183-73 24729-24735 wieder _ wieder _ _ +183-74 24736-24739 vor _ vor _ _ +183-75 24740-24743 dem _ dem CM[66] *->66-6 +183-76 24744-24749 Tisch _ Tisch CM[66] *->66-6 +183-77 24750-24753 des _ des CM[58]|CM[66] COREFERENTIAL->58-16|*->66-6 +183-78 24754-24763 Aufsehers _ Aufsehers CM[58]|CM[66] COREFERENTIAL->58-16|*->66-6 +183-79 24764-24768 halt _ halt _ _ +183-80 24769-24770 . _ . _ _ + +#Text=_ " Der Staatsanwalt Hasterer ist mein guter Freund " , sagte er , " kann ich _ ihm telephonieren ? " +184-1 24771-24772 _ _ _ _ _ +184-2 24773-24774 " _ " _ _ +184-3 24775-24778 Der _ Der CM[79]|CM[80] ANAPHORIC->79-1|ANAPHORIC->80-1 +184-4 24779-24791 Staatsanwalt _ Staatsanwalt CM[79]|CM[80] ANAPHORIC->79-1|ANAPHORIC->80-1 +184-5 24792-24800 Hasterer PER Hasterer CM[79]|CM[80] ANAPHORIC->79-1|ANAPHORIC->80-1 +184-6 24801-24804 ist _ ist _ _ +184-7 24805-24809 mein _ mein _ _ +184-8 24810-24815 guter _ guter _ _ +184-9 24816-24822 Freund _ Freund _ _ +184-10 24823-24824 " _ " _ _ +184-11 24825-24826 , _ , _ _ +184-12 24827-24832 sagte _ sagte _ _ +184-13 24833-24835 er _ er CM[71] ANAPHORIC->71-52 +184-14 24836-24837 , _ , _ _ +184-15 24838-24839 " _ " _ _ +184-16 24840-24844 kann _ kann _ _ +184-17 24845-24848 ich _ ich _ _ +184-18 24849-24850 _ _ mit _ _ +184-19 24851-24854 ihm _ ihm CM[79]|CM[80] ANAPHORIC->79-2|ANAPHORIC->80-2 +184-20 24855-24868 telephonieren _ telefonieren _ _ +184-21 24869-24870 ? _ ? _ _ +184-22 24871-24872 " _ " _ _ + +#Text=_ " Gewiß " , sagte der Aufseher , " aber ich weiß nicht , welchen Sinn das haben sollte , es müßte denn sein , daß Sie irgendeine private Angelegenheit mit ihm zu besprechen haben . " +185-1 24873-24874 _ _ _ _ _ +185-2 24875-24876 " _ " _ _ +185-3 24877-24882 Gewiß _ Gewiss _ _ +185-4 24883-24884 " _ " _ _ +185-5 24885-24886 , _ , _ _ +185-6 24887-24892 sagte _ sagte _ _ +185-7 24893-24896 der _ der CM[58] COREFERENTIAL->58-17 +185-8 24897-24905 Aufseher _ Aufseher CM[58] COREFERENTIAL->58-17 +185-9 24906-24907 , _ , _ _ +185-10 24908-24909 " _ " _ _ +185-11 24910-24914 aber _ aber _ _ +185-12 24915-24918 ich _ ich _ _ +185-13 24919-24923 weiß _ weiß _ _ +185-14 24924-24929 nicht _ nicht _ _ +185-15 24930-24931 , _ , _ _ +185-16 24932-24939 welchen _ welchen _ _ +185-17 24940-24944 Sinn _ Sinn _ _ +185-18 24945-24948 das _ das _ _ +185-19 24949-24954 haben _ haben _ _ +185-20 24955-24961 sollte _ sollte _ _ +185-21 24962-24963 , _ , _ _ +185-22 24964-24966 es _ es _ _ +185-23 24967-24972 müßte _ müsste _ _ +185-24 24973-24977 denn _ denn _ _ +185-25 24978-24982 sein _ sein _ _ +185-26 24983-24984 , _ , _ _ +185-27 24985-24988 daß _ dass _ _ +185-28 24989-24992 Sie _ Sie _ _ +185-29 24993-25003 irgendeine _ irgendeine _ _ +185-30 25004-25011 private _ private _ _ +185-31 25012-25025 Angelegenheit _ Angelegenheit _ _ +185-32 25026-25029 mit _ mit _ _ +185-33 25030-25033 ihm _ ihm CM[79]|CM[80] *->79-3|*->80-3 +185-34 25034-25036 zu _ zu _ _ +185-35 25037-25047 besprechen _ besprechen _ _ +185-36 25048-25053 haben _ haben _ _ +185-37 25054-25055 . _ . _ _ +185-38 25056-25057 " _ " _ _ + +#Text=_ " Welchen Sinn _ _ ? " rief K. , mehr bestürzt als geärgert . +186-1 25058-25059 _ _ _ _ _ +186-2 25060-25061 " _ " _ _ +186-3 25062-25069 Welchen _ Welchen _ _ +186-4 25070-25074 Sinn _ Sinn _ _ +186-5 25075-25076 _ _ das _ _ +186-6 25077-25078 _ _ machte _ _ +186-7 25079-25080 ? _ ? _ _ +186-8 25081-25082 " _ " _ _ +186-9 25083-25087 rief _ rief _ _ +186-10 25088-25090 K. PER K. CM[71] COREFERENTIAL->71-53 +186-11 25091-25092 , _ , _ _ +186-12 25093-25097 mehr _ mehr _ _ +186-13 25098-25106 bestürzt _ bestürzt _ _ +186-14 25107-25110 als _ als _ _ +186-15 25111-25119 geärgert _ geärgert _ _ +186-16 25120-25121 . _ . _ _ + +#Text=_ " Wer sind Sie denn ? +187-1 25122-25123 _ _ _ _ _ +187-2 25124-25125 " _ " _ _ +187-3 25126-25129 Wer _ Wer _ _ +187-4 25130-25134 sind _ sind _ _ +187-5 25135-25138 Sie _ Sie _ _ +187-6 25139-25143 denn _ denn _ _ +187-7 25144-25145 ? _ ? _ _ + +#Text=_ Sie wollen einen Sinn und _ führen dieses Sinnloseste auf , das es gibt ? +188-1 25146-25147 _ _ _ _ _ +188-2 25148-25151 Sie _ Sie _ _ +188-3 25152-25158 wollen _ wollen _ _ +188-4 25159-25164 einen _ einen _ _ +188-5 25165-25169 Sinn _ Sinn _ _ +188-6 25170-25173 und _ und _ _ +188-7 25174-25175 _ _ sie _ _ +188-8 25176-25182 führen _ führen _ _ +188-9 25183-25189 dieses _ dieses CM[81] ANAPHORIC->81-1 +188-10 25190-25201 Sinnloseste _ Sinnloseste CM[81] ANAPHORIC->81-1 +188-11 25202-25205 auf _ auf _ _ +188-12 25206-25207 , _ , _ _ +188-13 25208-25211 das _ das CM[81] *->81-2 +188-14 25212-25214 es _ es _ _ +188-15 25215-25219 gibt _ gibt _ _ +188-16 25220-25221 ? _ ? _ _ + +#Text=_ Ist es nicht zum Steinerweichen ? +189-1 25222-25223 _ _ _ _ _ +189-2 25224-25227 Ist _ Ist _ _ +189-3 25228-25230 es _ es _ _ +189-4 25231-25236 nicht _ nicht _ _ +189-5 25237-25240 zum _ zum _ _ +189-6 25241-25255 Steinerweichen _ Steinerweichen _ _ +189-7 25256-25257 ? _ ? _ _ + +#Text=_ Die Herren haben mich zuerst überfallen , und jetzt sitzen _ oder stehen sie hier herum und _ lassen mich vor Ihnen die Hohe Schule reiten . +190-1 25258-25259 _ _ _ _ _ +190-2 25260-25263 Die _ Die _ _ +190-3 25264-25270 Herren _ Herren _ _ +190-4 25271-25276 haben _ haben _ _ +190-5 25277-25281 mich _ mich _ _ +190-6 25282-25288 zuerst _ zuerst _ _ +190-7 25289-25299 überfallen _ überfallen _ _ +190-8 25300-25301 , _ , _ _ +190-9 25302-25305 und _ und _ _ +190-10 25306-25311 jetzt _ jetzt _ _ +190-11 25312-25318 sitzen _ sitzen _ _ +190-12 25319-25320 _ _ sie _ _ +190-13 25321-25325 oder _ oder _ _ +190-14 25326-25332 stehen _ stehen _ _ +190-15 25333-25336 sie _ sie _ _ +190-16 25337-25341 hier _ hier _ _ +190-17 25342-25347 herum _ herum _ _ +190-18 25348-25351 und _ und _ _ +190-19 25352-25353 _ _ sie _ _ +190-20 25354-25360 lassen _ lassen _ _ +190-21 25361-25365 mich _ mich _ _ +190-22 25366-25369 vor _ vor _ _ +190-23 25370-25375 Ihnen _ Ihnen _ _ +190-24 25376-25379 die _ die _ _ +190-25 25380-25384 Hohe _ hohe _ _ +190-26 25385-25391 Schule _ Schule _ _ +190-27 25392-25398 reiten _ reiten _ _ +190-28 25399-25400 . _ . _ _ + +#Text=_ Welchen Sinn es hätte , an einen Staatsanwalt zu telephonieren , wenn ich angeblich verhaftet bin ? +191-1 25401-25402 _ _ _ _ _ +191-2 25403-25410 Welchen _ Welchen _ _ +191-3 25411-25415 Sinn _ Sinn _ _ +191-4 25416-25418 es _ es _ _ +191-5 25419-25424 hätte _ hätte _ _ +191-6 25425-25426 , _ , _ _ +191-7 25427-25429 an _ mit _ _ +191-8 25430-25435 einen _ einem _ _ +191-9 25436-25448 Staatsanwalt _ Staatsanwalt _ _ +191-10 25449-25451 zu _ zu _ _ +191-11 25452-25465 telephonieren _ telefonieren _ _ +191-12 25466-25467 , _ , _ _ +191-13 25468-25472 wenn _ wenn _ _ +191-14 25473-25476 ich _ ich _ _ +191-15 25477-25486 angeblich _ angeblich _ _ +191-16 25487-25496 verhaftet _ verhaftet _ _ +191-17 25497-25500 bin _ bin _ _ +191-18 25501-25502 ? _ ? _ _ + +#Text=_ Gut , ich werde nicht telephonieren . " +192-1 25503-25504 _ _ _ _ _ +192-2 25505-25508 Gut _ Gut _ _ +192-3 25509-25510 , _ , _ _ +192-4 25511-25514 ich _ ich _ _ +192-5 25515-25520 werde _ werde _ _ +192-6 25521-25526 nicht _ nicht _ _ +192-7 25527-25540 telephonieren _ telefonieren _ _ +192-8 25541-25542 . _ . _ _ +192-9 25543-25544 " _ " _ _ + +#Text=_ " Aber doch " , sagte der Aufseher und _ streckte die Hand zum Vorzimmer aus , wo das Telephon war , " bitte , telephonieren Sie doch . " +193-1 25545-25546 _ _ _ _ _ +193-2 25547-25548 " _ " _ _ +193-3 25549-25553 Aber _ Aber _ _ +193-4 25554-25558 doch _ doch _ _ +193-5 25559-25560 " _ " _ _ +193-6 25561-25562 , _ , _ _ +193-7 25563-25568 sagte _ sagte _ _ +193-8 25569-25572 der _ der CM[58] COREFERENTIAL->58-18 +193-9 25573-25581 Aufseher _ Aufseher CM[58] COREFERENTIAL->58-18 +193-10 25582-25585 und _ und _ _ +193-11 25586-25587 _ _ er _ _ +193-12 25588-25596 streckte _ streckte _ _ +193-13 25597-25600 die _ die _ _ +193-14 25601-25605 Hand _ Hand _ _ +193-15 25606-25609 zum _ zum _ _ +193-16 25610-25619 Vorzimmer _ Vorzimmer _ _ +193-17 25620-25623 aus _ aus _ _ +193-18 25624-25625 , _ , _ _ +193-19 25626-25628 wo _ wo _ _ +193-20 25629-25632 das _ das _ _ +193-21 25633-25641 Telephon _ Telefon _ _ +193-22 25642-25645 war _ war _ _ +193-23 25646-25647 , _ , _ _ +193-24 25648-25649 " _ " _ _ +193-25 25650-25655 bitte _ bitte _ _ +193-26 25656-25657 , _ , _ _ +193-27 25658-25671 telephonieren _ telefonieren _ _ +193-28 25672-25675 Sie _ Sie _ _ +193-29 25676-25680 doch _ doch _ _ +193-30 25681-25682 . _ . _ _ +193-31 25683-25684 " _ " _ _ + +#Text=_ " Nein , ich will nicht mehr " , sagte K. und _ ging zum Fenster . +194-1 25685-25686 _ _ _ _ _ +194-2 25687-25688 " _ " _ _ +194-3 25689-25693 Nein _ Nein _ _ +194-4 25694-25695 , _ , _ _ +194-5 25696-25699 ich _ ich _ _ +194-6 25700-25704 will _ will _ _ +194-7 25705-25710 nicht _ nicht _ _ +194-8 25711-25715 mehr _ mehr _ _ +194-9 25716-25717 " _ " _ _ +194-10 25718-25719 , _ , _ _ +194-11 25720-25725 sagte _ sagte _ _ +194-12 25726-25728 K. PER K. CM[71] COREFERENTIAL->71-54 +194-13 25729-25732 und _ und _ _ +194-14 25733-25734 _ _ er _ _ +194-15 25735-25739 ging _ ging _ _ +194-16 25740-25743 zum _ zum CM[82] COREFERENTIAL->82-1 +194-17 25744-25751 Fenster _ Fenster CM[82] COREFERENTIAL->82-1 +194-18 25752-25753 . _ . _ _ + +#Text=_ Drüben war noch die Gesellschaft beim Fenster und _ schien nur jetzt dadurch , daß K. ans Fenster herangetreten war , in der Ruhe des Zuschauens ein wenig gestört . +195-1 25754-25755 _ _ _ _ _ +195-2 25756-25762 Drüben _ Drüben _ _ +195-3 25763-25766 war _ war _ _ +195-4 25767-25771 noch _ noch _ _ +195-5 25772-25775 die _ die _ _ +195-6 25776-25788 Gesellschaft _ Gesellschaft _ _ +195-7 25789-25793 beim _ beim CM[82] COREFERENTIAL->82-2 +195-8 25794-25801 Fenster _ Fenster CM[82] COREFERENTIAL->82-2 +195-9 25802-25805 und _ und _ _ +195-10 25806-25807 _ _ sie _ _ +195-11 25808-25814 schien _ schien _ _ +195-12 25815-25818 nur _ nur _ _ +195-13 25819-25824 jetzt _ jetzt _ _ +195-14 25825-25832 dadurch _ dadurch _ _ +195-15 25833-25834 , _ , _ _ +195-16 25835-25838 daß _ dass _ _ +195-17 25839-25841 K. PER K. CM[71] COREFERENTIAL->71-55 +195-18 25842-25845 ans _ ans CM[82] COREFERENTIAL->82-3 +195-19 25846-25853 Fenster _ Fenster CM[82] COREFERENTIAL->82-3 +195-20 25854-25867 herangetreten _ herangetreten _ _ +195-21 25868-25871 war _ war _ _ +195-22 25872-25873 , _ , _ _ +195-23 25874-25876 in _ in _ _ +195-24 25877-25880 der _ der _ _ +195-25 25881-25885 Ruhe _ Ruhe _ _ +195-26 25886-25889 des _ des _ _ +195-27 25890-25900 Zuschauens _ Zuschauens _ _ +195-28 25901-25904 ein _ ein _ _ +195-29 25905-25910 wenig _ wenig _ _ +195-30 25911-25918 gestört _ gestört _ _ +195-31 25919-25920 . _ . _ _ + +#Text=_ Die Alten wollten sich erheben , aber der Mann hinter ihnen beruhigte sie . +196-1 25921-25922 _ _ _ _ _ +196-2 25923-25926 Die _ Die CM[64] ANAPHORIC->64-4 +196-3 25927-25932 Alten _ Alten CM[64] ANAPHORIC->64-4 +196-4 25933-25940 wollten _ wollten _ _ +196-5 25941-25945 sich _ sich CM[64] ANAPHORIC->64-5 +196-6 25946-25953 erheben _ erheben _ _ +196-7 25954-25955 , _ , _ _ +196-8 25956-25960 aber _ aber _ _ +196-9 25961-25964 der _ der CM[65] COREFERENTIAL->65-4 +196-10 25965-25969 Mann _ Mann CM[65] COREFERENTIAL->65-4 +196-11 25970-25976 hinter _ hinter CM[65] COREFERENTIAL->65-4 +196-12 25977-25982 ihnen _ ihnen CM[64]|CM[65] ANAPHORIC->64-6|COREFERENTIAL->65-4 +196-13 25983-25992 beruhigte _ beruhigte _ _ +196-14 25993-25996 sie _ sie CM[64] COREFERENTIAL->64-7 +196-15 25997-25998 . _ . _ _ + +#Text=_ " Dort sind auch solche Zuschauer " , rief K. ganz laut dem Aufseher zu und _ zeigte mit dem Zeigefinger hinaus . +197-1 25999-26000 _ _ _ _ _ +197-2 26001-26002 " _ " _ _ +197-3 26003-26007 Dort _ Dort _ _ +197-4 26008-26012 sind _ sind _ _ +197-5 26013-26017 auch _ auch _ _ +197-6 26018-26024 solche _ solche _ _ +197-7 26025-26034 Zuschauer _ Zuschauer _ _ +197-8 26035-26036 " _ " _ _ +197-9 26037-26038 , _ , _ _ +197-10 26039-26043 rief _ rief _ _ +197-11 26044-26046 K. PER K. CM[71] ANAPHORIC->71-56 +197-12 26047-26051 ganz _ ganz _ _ +197-13 26052-26056 laut _ laut _ _ +197-14 26057-26060 dem _ dem CM[58] *->58-19 +197-15 26061-26069 Aufseher _ Aufseher CM[58] *->58-19 +197-16 26070-26072 zu _ zu _ _ +197-17 26073-26076 und _ und _ _ +197-18 26077-26078 _ _ er _ _ +197-19 26079-26085 zeigte _ zeigte _ _ +197-20 26086-26089 mit _ mit _ _ +197-21 26090-26093 dem _ dem _ _ +197-22 26094-26105 Zeigefinger _ Zeigefinger _ _ +197-23 26106-26112 hinaus _ hinaus _ _ +197-24 26113-26114 . _ . _ _ + +#Text=_ " _ _ Weg von dort " , rief er dann hinüber . +198-1 26115-26116 _ _ _ _ _ +198-2 26117-26118 " _ " _ _ +198-3 26119-26120 _ _ Gehen _ _ +198-4 26121-26122 _ _ Sie _ _ +198-5 26123-26126 Weg _ weg _ _ +198-6 26127-26130 von _ von _ _ +198-7 26131-26135 dort _ dort _ _ +198-8 26136-26137 " _ " _ _ +198-9 26138-26139 , _ , _ _ +198-10 26140-26144 rief _ rief _ _ +198-11 26145-26147 er _ er CM[71] *->71-57 +198-12 26148-26152 dann _ dann _ _ +198-13 26153-26160 hinüber _ hinüber _ _ +198-14 26161-26162 . _ . _ _ + +#Text=_ Die drei wichen auch sofort ein paar Schritte zurück , die beiden Alten _ sogar noch hinter den Mann , der sie mit seinem breiten Körper deckte und , _ nach seinen Mundbewegungen zu schließen , irgend etwas auf die Entfernung hin Unverständliches sagte . +199-1 26163-26164 _ _ _ _ _ +199-2 26165-26168 Die _ Die _ _ +199-3 26169-26173 drei _ Drei _ _ +199-4 26174-26180 wichen _ wichen _ _ +199-5 26181-26185 auch _ auch _ _ +199-6 26186-26192 sofort _ sofort _ _ +199-7 26193-26196 ein _ ein _ _ +199-8 26197-26201 paar _ paar _ _ +199-9 26202-26210 Schritte _ Schritte _ _ +199-10 26211-26217 zurück _ zurück _ _ +199-11 26218-26219 , _ , _ _ +199-12 26220-26223 die _ die CM[64] ANAPHORIC->64-8 +199-13 26224-26230 beiden _ beiden CM[64] ANAPHORIC->64-8 +199-14 26231-26236 Alten _ Alten CM[64] ANAPHORIC->64-8 +199-15 26237-26238 _ _ wichen _ _ +199-16 26239-26244 sogar _ sogar _ _ +199-17 26245-26249 noch _ noch _ _ +199-18 26250-26256 hinter _ hinter _ _ +199-19 26257-26260 den _ den CM[65] ANAPHORIC->65-5 +199-20 26261-26265 Mann _ Mann CM[65] ANAPHORIC->65-5 +199-21 26266-26267 , _ , _ _ +199-22 26268-26271 der _ der CM[65] ANAPHORIC->65-6 +199-23 26272-26275 sie _ sie _ _ +199-24 26276-26279 mit _ mit _ _ +199-25 26280-26286 seinem _ seinem CM[65] ANAPHORIC->65-7 +199-26 26287-26294 breiten _ breiten _ _ +199-27 26295-26301 Körper _ Körper _ _ +199-28 26302-26308 deckte _ deckte _ _ +199-29 26309-26312 und _ und _ _ +199-30 26313-26314 , _ , _ _ +199-31 26315-26316 _ _ der _ _ +199-32 26317-26321 nach _ nach _ _ +199-33 26322-26328 seinen _ seinen CM[65] *->65-8 +199-34 26329-26343 Mundbewegungen _ Mundbewegungen _ _ +199-35 26344-26346 zu _ zu _ _ +199-36 26347-26356 schließen _ schließen _ _ +199-37 26357-26358 , _ , _ _ +199-38 26359-26365 irgend _ irgend\| _ _ +199-39 26366-26371 etwas _ etwas _ _ +199-40 26372-26375 auf _ auf _ _ +199-41 26376-26379 die _ die _ _ +199-42 26380-26390 Entfernung _ Entfernung _ _ +199-43 26391-26394 hin _ hin _ _ +199-44 26395-26411 Unverständliches _ Unverständliches _ _ +199-45 26412-26417 sagte _ sagte _ _ +199-46 26418-26419 . _ . _ _ + +#Text=_ Ganz aber verschwanden sie nicht , sondern _ schienen auf den Augenblick zu warten , in dem sie sich unbemerkt wieder dem Fenster nähern könnten . +200-1 26420-26421 _ _ _ _ _ +200-2 26422-26426 Ganz _ Ganz _ _ +200-3 26427-26431 aber _ aber _ _ +200-4 26432-26444 verschwanden _ verschwanden _ _ +200-5 26445-26448 sie _ sie CM[64] ANAPHORIC->64-9 +200-6 26449-26454 nicht _ nicht _ _ +200-7 26455-26456 , _ , _ _ +200-8 26457-26464 sondern _ sondern _ _ +200-9 26465-26466 _ _ sie _ _ +200-10 26467-26475 schienen _ schienen _ _ +200-11 26476-26479 auf _ auf _ _ +200-12 26480-26483 den _ den CM[83] ANAPHORIC->83-1 +200-13 26484-26494 Augenblick _ Augenblick CM[83] ANAPHORIC->83-1 +200-14 26495-26497 zu _ zu _ _ +200-15 26498-26504 warten _ warten _ _ +200-16 26505-26506 , _ , _ _ +200-17 26507-26509 in _ in _ _ +200-18 26510-26513 dem _ dem CM[83] *->83-2 +200-19 26514-26517 sie _ sie CM[64] *->64-10 +200-20 26518-26522 sich _ sich _ _ +200-21 26523-26532 unbemerkt _ unbemerkt _ _ +200-22 26533-26539 wieder _ wieder _ _ +200-23 26540-26543 dem _ dem CM[82] *->82-4 +200-24 26544-26551 Fenster _ Fenster CM[82] *->82-4 +200-25 26552-26558 nähern _ nähern _ _ +200-26 26559-26566 könnten _ könnten _ _ +200-27 26567-26568 . _ . _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3/example2.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3/example2.tsv new file mode 100644 index 0000000000..dd1f43aaaf --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3/example2.tsv @@ -0,0 +1,105 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS|PosValue +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|value +#T_RL=de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency|DependencyType|BT_de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + +#Text=Ms. Haag plays Elianti . +1-1 0-3 Ms. NNP PER[2] SUBJ 1-3 +1-2 4-8 Haag NNP PER[2] SBJ 1-3 +1-3 9-14 plays VBD _ P|ROOT 1-5|1-3 +1-4 15-22 Elianti NNP OTH OBJ 1-3 +1-5 23-24 . . _ _ _ + +#Text=Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990 . +2-1 25-36 Rolls-Royce NNP ORG[4] NAME 2-3 +2-2 37-42 Motor NNP ORG[4] NAME 2-3 +2-3 43-47 Cars NNPS _ SBJ 2-5 +2-4 48-52 Inc. NNP _ POSTHON 2-3 +2-5 53-57 said VBD _ ROOT 2-5 +2-6 58-60 it PRP _ SBJ 2-7 +2-7 61-68 expects VBZ _ OBJ|OPRD 2-5|2-11 +2-8 69-72 its PRP$ _ NMOD 2-10 +2-9 73-77 U.S. NNP LOC NMOD 2-10 +2-10 78-83 sales NNS _ OBJ 2-7 +2-11 84-86 to TO _ _ _ +2-12 87-93 remain VB _ IM 2-11 +2-13 94-100 steady JJ _ PRD 2-12 +2-14 101-103 at IN _ LOC 2-13 +2-15 104-109 about IN _ DEP 2-16 +2-16 110-115 1,200 CD _ NMOD 2-17 +2-17 116-120 cars NNS _ PMOD|TMP 2-14|2-18 +2-18 121-123 in IN _ _ _ +2-19 124-128 1990 CD _ PMOD 2-18 +2-20 129-130 . . _ P 2-5 + +#Text=The luxury auto maker last year sold 1,214 cars in the U.S. +3-1 131-134 The DT _ NMOD 3-4 +3-2 135-141 luxury NN _ NMOD 3-3 +3-3 142-146 auto NN _ NMOD 3-4 +3-4 147-152 maker NN _ SBJ 3-7 +3-5 153-157 last JJ _ NMOD 3-6 +3-6 158-162 year NN _ TMP 3-4 +3-7 163-167 sold VBD _ ROOT 3-7 +3-8 168-173 1,214 CD _ NMOD 3-9 +3-9 174-178 cars NNS _ OBJ 3-7 +3-10 179-181 in IN _ LOC 3-9 +3-11 182-185 the DT _ NMOD 3-12 +3-12 186-190 U.S. NNP LOC PMOD 3-10 + +#Text=BELL INDUSTRIES Inc. increased its quarterly to 10 cents from seven cents a share . +4-1 191-195 BELL NNP ORG[7]|PER[8] NAME 4-2 +4-2 196-206 INDUSTRIES NNP ORG[7] SBJ 4-4 +4-3 207-211 Inc. NNP ORG[7] POSTHON 4-2 +4-4 212-221 increased VBD _ ROOT 4-4 +4-5 222-225 its PRP$ _ NMOD 4-6 +4-6 226-235 quarterly NN _ OBJ 4-4 +4-7 236-238 to TO _ DIR 4-4 +4-8 239-241 10 CD _ NMOD 4-9 +4-9 242-247 cents NNS _ PMOD 4-7 +4-10 248-252 from IN _ DIR 4-9 +4-11 253-258 seven CD _ NMOD 4-12 +4-12 259-264 cents NNS _ PMOD 4-10 +4-13 265-266 a DT _ NMOD 4-14 +4-14 267-272 share NN _ ADV 4-12 +4-15 273-274 . . _ P 4-4 + +#Text=The new rate will be payable Feb. 15 . +5-1 275-278 The DT _ NMOD 5-3 +5-2 279-282 new JJ _ NMOD 5-3 +5-3 283-287 rate NN _ SBJ 5-4 +5-4 288-292 will MD _ ROOT 5-4 +5-5 293-295 be VB _ VC 5-4 +5-6 296-303 payable JJ _ PRD 5-5 +5-7 304-308 Feb. NNP _ TMP 5-5 +5-8 309-311 15 CD _ NMOD 5-7 +5-9 312-313 . . _ P 5-4 + +#Text=A record date has n't been set . +6-1 314-315 A DT _ NMOD 6-3 +6-2 316-322 record NN _ NMOD 6-3 +6-3 323-327 date NN _ SBJ 6-4 +6-4 328-331 has VBZ _ ROOT 6-4 +6-5 332-335 n't RB _ ADV 6-4 +6-6 336-340 been VBN _ VC 6-4 +6-7 341-344 set VBN _ VC 6-6 +6-8 345-346 . . _ P 6-4 + +#Text=Bell , based in Los Angeles , makes and distributes electronic , computer and building products . +7-1 347-351 Bell NNP ORG SBJ 7-8 +7-2 352-353 , , _ P 7-1 +7-3 354-359 based VBN _ APPO 7-1 +7-4 360-362 in IN _ LOC 7-3 +7-5 363-366 Los NNP LOC[10] NAME 7-6 +7-6 367-374 Angeles NNP LOC[10] PMOD 7-4 +7-7 375-376 , , _ P 7-1 +7-8 377-382 makes VBZ _ ROOT 7-8 +7-9 383-386 and CC _ COORD 7-8 +7-10 387-398 distributes VBZ _ CONJ 7-9 +7-11 399-409 electronic JJ _ NMOD 7-16 +7-12 410-411 , , _ P 7-11 +7-13 412-420 computer NN _ COORD 7-11 +7-14 421-424 and CC _ COORD 7-13 +7-15 425-433 building NN _ CONJ 7-14 +7-16 434-442 products NNS _ OBJ 7-8 +7-17 443-444 . . _ P 7-8 diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3/webannotest_2100105761.xml b/dkpro-core-io-webanno-asl/src/test/resources/tsv3/webannotest_2100105761.xml new file mode 100644 index 0000000000..5dca2cb423 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3/webannotest_2100105761.xml @@ -0,0 +1,1909 @@ + + + + + CTB version: Mere Fjæsbog + DK-CLARIN + + Data capture + dsl-dsn.dk + file + + + + + + 239 + 5 + + + dsl-dsn.dk + 2100105761 + + + full + 0 + + + partial + 0 + + + partial + 0 + + + + + This text belongs to the DK-CLARIN Reference Corpus of General Danish, DK-CLARIN Repository Version, April 2012 + + + + + Mere Fjæsbog + + Hoffmann, Bente + + + Translated by + empty + + + + Bentes blog. Strøtanker fra en skribents skrivebord + + empty + + + Bente Hoffmann + + empty + empty + empty + empty + empty + + + http://www.bentehoffmann.dk/blog/mere-fj%c3%a6sbog + nil + + + nil + 99999999 + + + + + + + + CTB version + + + DK-CLARIN-WP2.1 + + + + Annotation with lemma forms of basic tokens + + + + Annotation with POS and inflection of basic tokens + + + + + + + + + + Danish + + + 22 + + + nil + + 99999999 + + + basic-basic + adult-adult + + + + + + + nil + + + + + + + + + + + created + errors fixed with dk.dsl.ja.textjuggler.TextCleaner + lemmatized and POS-tagged with ePOS-DSL + + + + +

+ " + + + Facebook + + handler + + det + + egentlig + + bare + + om + + at + + blive + + set + . + + I + + informationssamfundet + + er + + den + + største + + knaphedsressource + + andres + + opmærksomhed + . + + Vi + + ser + + det + + også + + + + den + + måde + , + + vi + + bruger + + sms + + og + + e + - + mail + . + + Vældig + + mange + + af + + beskederne + + handler + + sådan + + set + + bare + + om + + at + + sige + : + + Hallo + + hallo + , + + glem + + ikke + , + + at + + jeg + + findes + . + + Det + + samme + + med + + Facebook + . + + Og + + det + + at + + have + + 2 + . + 000 + + såkaldte + + Facebookvenner + + giver + + indtryk + + af + , + + at + + man + + er + + en + + person + , + + der + + sidder + + som + + en + + edderkop + + i + + et + + stort + + netværk + . + + Og + + i + + et + + netværkssamfund + + er + + det + + en + + stor + + ressource + + for + + vældig + + mange + + mennesker + " + , + + siger + + Thomas + + Hylland + + Eriksen + . +

+

+ Ovenstående + + tankevækkende + + ord + + kunne + + man + + bl + . + a + . + + læse + + i + + går + + i + + Politikens + + artikel + + Danskerne + + flokkes + + + + Facebook + , + + som + + fx + + kunne + + fortælle + , + + at + + 12 + + procent + + af + + os + + har + + en + + profil + . +

+

+ Min + + holdning + + til + + Fjæsbogen + + har + + jeg + + jo + + tidligere + + ytret + + ( + og + + min + + profil + + har + + længe + + været + + deaktiveret + ; + + til + + min + + store + + irritation + + kan + + den + + jo + + ikke + + slettes + ) + . + + Men + + i + + dag + + fandt + + jeg + + et + + meget + + interessant + + blogindlæg + + hos + + per + - + olof + . + dk + , + + som + + har + + været + + i + + dialog + + med + + en + + af + + eksperterne + , + + Gisle + + Hannemyr + , + + bag + + Politikens + + artikel + . + + Læs + + endelig + + med + , + + hvis + + du + + også + + er + + skeptisk + + over + + for + + Facebook + . +

+

+ Tags + : + + citat + , + + Facebook + , + + Fjæsbog + , + + Netværk + , + + opmærksomhed + , + + Politiken +

+

+ Dette + + indlæg + + blev + + udgivet + + den + + tirsdag + , + + 27 + . + + maj + + 2008 + + kl + . + + 13 + : + 04 + + og + + er + + gemt + + i + + Citater + , + + Internettet + , + + Netværk + . + Du + + kan + + følge + + alle + + svar + + til + + dette + + indlæg + + via + + RSS + + 2 + . + 0 + - + feedet + . + + Du + + kan + + skrive + + en + + kommentar + , + + eller + + sende + + et + + trackback + + fra + + dit + + eget + + site + . +

+ + + " + + Facebook + handle + det + egentlig + bare + om + at + blive + se + . + i + informationssamfund + være + den + stor + knaphedsressource + andres + opmærksomhed + . + vi + se + det + også + + den + måde + , + vi + bruge + sms + og + E + - + mail + . + vældig + mangen + af + besked + handle + sådan + se + bare + om + at + sige + : + hallo + hallo + , + glem + ikke + , + at + jeg + finde + . + den + samme + med + Facebook + . + og + det + at + have + 2 + . + 000 + såkaldt + Facebookvenner + give + indtryk + af + , + at + man + være + en + person + , + der + sidde + som + en + edderkop + i + en + stor + netværk + . + og + i + en + netværkssamfund + være + det + en + stor + ressource + for + vældig + mangen + menneske + " + , + sige + Thomas + Hylland + Eriksen + . + ovenstående + tankevækkende + ord + kunne + man + blandt + . + anden + . + læse + i + går + i + Politiken + artikel + dansker + flokkes + + Facebook + , + som + fx + kunne + fortælle + , + at + 12 + procent + af + vi + have + en + profil + . + min + holdning + til + Fjæsbogen + have + jeg + jo + tidlig + ytre + ( + og + min + profil + have + længe + være + deaktivere + ; + til + min + stor + irritation + kunne + den + jo + ikke + slette + ) + . + men + i + dag + finde + jeg + en + megen + interessant + blogindlæg + hos + per + - + olof + . + dk + , + som + have + være + i + dialog + med + en + af + ekspert + , + Gisle + Hannemyr + , + bag + Politiken + artikel + . + læse + endelig + med + , + hvis + du + også + være + skeptisk + over + for + Facebook + . + tag + : + citat + , + Facebook + , + Fjæsbog + , + netværk + , + opmærksomhed + , + Politiken + denne + indlæg + blive + udgive + den + tirsdag + , + 27 + . + maj + 2008 + kl + . + 13 + : + 04 + og + være + gemme + i + citat + , + internet + , + netværk + . + du + kunne + følge + al + svar + til + denne + indlæg + via + rss + 2 + . + 0 + - + feed + . + du + kunne + skrive + en + kommentar + , + eller + sende + en + trackback + fra + din + egen + site + . + + + NP:siu#:--:---- + T-:----:--:---- + NP:siu#:--:---- + VF:----:sa:---- + PP:s-un:--:-3n- + D-:----:--:u--- + D-:----:--:u--- + T-:----:--:---- + UI:----:--:---- + VI:----:-a:---- + VT:siu#:t-:---- + ZP + T-:----:--:---- + NC:sdun:--:---- + VF:----:sa:---- + PM:s-uc:--:---- + AC:§du§:--:s--- + NC:siuc:--:---- + XY:----:--:---- + NC:siuc:--:---- + ZP + PP:p-nc:--:-1n- + VF:----:sa:---- + PP:s-un:--:-3n- + D-:----:--:u--- + T-:----:--:---- + PM:s-uc:--:---- + NC:siuc:--:---- + ZD + PP:p-nc:--:-1n- + VF:----:sa:---- + NC:siuc:--:---- + CC:----:--:---- + XS:----:--:---- + ZH + NC:siuc:--:---- + ZP + AD:----:--:p--- + AC:p#u#:--:p--- + T-:----:--:---- + NC:pdu#:--:---- + VF:----:sa:---- + D-:----:--:u--- + VT:siu#:t-:---- + D-:----:--:u--- + T-:----:--:---- + UI:----:--:---- + VI:----:-a:---- + ZP + I-:----:--:---- + I-:----:--:---- + ZD + VM:----:--:---- + D-:----:--:u--- + ZD + CS:----:--:---- + PP:s-nc:--:-1n- + VF:----:sp:---- + ZP + PM:s-un:--:---- + AC:§§u§:--:p--- + T-:----:--:---- + NP:siu#:--:---- + ZP + CC:----:--:---- + PP:s-un:--:-3n- + UI:----:--:---- + VI:----:-a:---- + LC:--u-:--:---- + ZP + LC:--u-:--:---- + AC:p#u#:--:p--- + NP:siu#:--:---- + VF:----:sa:---- + NC:siun:--:---- + T-:----:--:---- + ZD + CS:----:--:---- + PI:#-uc:--:---- + VF:----:sa:---- + PI:s-uc:--:---- + NC:siuc:--:---- + ZD + US:----:--:---- + VF:----:sa:---- + US:----:--:---- + PI:s-uc:--:---- + NC:siuc:--:---- + T-:----:--:---- + PI:s-un:--:---- + AC:siun:--:p--- + NC:siun:--:---- + ZP + CC:----:--:---- + T-:----:--:---- + PI:s-un:--:---- + NC:siun:--:---- + VF:----:sa:---- + PP:s-un:--:-3n- + PI:s-uc:--:---- + AC:siuc:--:p--- + NC:siuc:--:---- + T-:----:--:---- + AD:----:--:p--- + AC:p#u#:--:p--- + NC:piu#:--:---- + NP:siu#:--:---- + ZD + VF:----:sa:---- + NP:siu#:--:---- + NP:siu#:--:---- + NP:siu#:--:---- + ZP + AC:§§u§:--:p--- + AC:§§u§:--:p--- + NC:siun:--:---- + VF:----:ta:---- + PI:#-uc:--:---- + T-:----:--:---- + ZP + PI:s-un:--:---- + ZP + VI:----:-a:---- + T-:----:--:---- + NC:§§u§:--:---- + T-:----:--:---- + NP:sig#:--:---- + XY:----:--:---- + NC:pdu#:--:---- + VF:----:sp:---- + T-:----:--:---- + NP:siu#:--:---- + ZD + US:----:--:---- + D-:----:--:u--- + VF:----:ta:---- + VI:----:-a:---- + ZD + CS:----:--:---- + LC:--u-:--:---- + NC:piu#:--:---- + T-:----:--:---- + PP:p-uc:--:-1#- + VF:----:sa:---- + PI:s-uc:--:---- + NC:siuc:--:---- + ZP + PO:s--c:--:-1ns + NC:siuc:--:---- + T-:----:--:---- + NP:siu#:--:---- + VF:----:sa:---- + PP:s-nc:--:-1n- + D-:----:--:u--- + AD:----:--:c--- + VT:siu#:t-:---- + ZD + CC:----:--:---- + PO:s--c:--:-1ns + NC:siuc:--:---- + VF:----:sa:---- + D-:----:--:u--- + VT:siu#:t-:---- + VT:siu#:t-:---- + ZP + T-:----:--:---- + PO:s--c:--:-1ns + AC:sdu#:--:p--- + NC:siuc:--:---- + VF:----:sa:---- + PP:s-uc:--:-3n- + D-:----:--:u--- + D-:----:--:u--- + VF:----:sp:---- + ZD + ZP + CC:----:--:---- + T-:----:--:---- + NC:siuc:--:---- + VF:----:ta:---- + PP:s-nc:--:-1n- + PI:s-un:--:---- + AD:----:--:p--- + AC:siu§:--:p--- + NC:siun:--:---- + T-:----:--:---- + XY:----:--:---- + ZH + XF:----:--:---- + ZP + XS:----:--:---- + ZD + US:----:--:---- + VF:----:sa:---- + VT:siu#:t-:---- + T-:----:--:---- + NC:siuc:--:---- + T-:----:--:---- + PI:s-uc:--:---- + T-:----:--:---- + NC:pdu#:--:---- + ZD + NP:siu#:--:---- + NP:siu#:--:---- + ZD + T-:----:--:---- + NP:sig#:--:---- + NC:siuc:--:---- + ZP + VM:----:--:---- + D-:----:--:u--- + T-:----:--:---- + ZD + CS:----:--:---- + PP:s-nc:--:-2n- + D-:----:--:u--- + VF:----:sa:---- + AD:----:--:p--- + D-:----:--:u--- + T-:----:--:---- + NP:siu#:--:---- + ZP + NC:piu#:--:---- + ZP + NC:siun:--:---- + ZD + NP:siu#:--:---- + ZD + NP:siu#:--:---- + ZD + NC:siun:--:---- + ZD + NC:siuc:--:---- + ZD + NP:siu#:--:---- + PM:s-un:--:---- + NC:siun:--:---- + VF:----:ta:---- + VT:siu#:t-:---- + PM:s-uc:--:---- + NC:siuc:--:---- + ZD + LC:--u-:--:---- + ZP + NC:siuc:--:---- + LC:--u-:--:---- + XS:----:--:---- + ZP + LC:--u-:--:---- + ZP + LC:--u-:--:---- + CC:----:--:---- + VF:----:sa:---- + VT:siu#:t-:---- + T-:----:--:---- + NC:piu#:--:---- + ZD + NC:sdun:--:---- + ZD + NC:siun:--:---- + ZP + PP:s-nc:--:-2n- + VF:----:sa:---- + VI:----:-a:---- + PI:p-u#:--:---- + NC:piu#:--:---- + T-:----:--:---- + PM:s-un:--:---- + NC:siun:--:---- + T-:----:--:---- + XS:----:--:---- + LC:--u-:--:---- + ZP + LC:--u-:--:---- + ZH + NC:sdun:--:---- + ZP + PP:s-nc:--:-2n- + VF:----:sa:---- + VI:----:-a:---- + PI:s-uc:--:---- + NC:siuc:--:---- + ZD + CC:----:--:---- + VI:----:-a:---- + PI:s-un:--:---- + NC:siun:--:---- + T-:----:--:---- + PO:s--n:--:-2ns + AC:siun:--:p--- + NC:siu#:--:---- + ZP + + +verb.social +noun.act +noun.cognition +noun.person +adj.phys +verb.static +verb.cognition +verb.act +verb.emotion +noun.artifact +noun.institution +verb.social +verb.static +adj.phys +noun.artifact +noun.motive +verb.cognition +verb.act +verb.cognition +verb.act +verb.emotion +noun.motive +noun.artifact +noun.TOP +verb.cognition +verb.act +verb.act +noun.person +verb.communication +noun.artifact +noun.artifact +noun.phenomenon +verb.act +adj.phys +noun.communication +verb.social +noun.act +noun.cognition +noun.person +adj.all +verb.cognition +verb.act +verb.emotion +adj.phys +verb.cognition +verb.communication +verb.cognition +verb.motion +verb.cognition +verb.static +verb.act +verb.cognition +noun.quantity +verb.act +noun.location +verb.act +noun.person +verb.communication +noun.cognition +noun.feeling +verb.act +verb.social +verb.static +noun.person +verb.motion +verb.act +verb.static +noun.animal +noun.artifact +adj.phys +noun.group +noun.artifact +noun.artifact +noun.person +verb.social +verb.static +adj.phys +noun.attribute +noun.artifact +noun.substance +verb.motion +noun.artifact +verb.emotion +verb.communication +adj.phys +noun.person +verb.cognition +verb.communication +noun.act +noun.artifact +verb.social +verb.act +noun.artifact +noun.phenomenon +verb.cognition +noun.artifact +verb.motion +verb.act +verb.static +noun.artifact +noun.artifact +noun.TOP +noun.person +noun.animal +noun.person +verb.social +noun.quantity +verb.weather +noun.attribute +verb.cognition +verb.act +noun.artifact +noun.person +noun.cognition +noun.phenomenon +verb.cognition +verb.act +adj.time +verb.communication +noun.artifact +noun.person +verb.cognition +verb.act +noun.artifact +verb.social +verb.static +adj.phys +noun.feeling +verb.social +verb.motion +noun.location +adj.all +noun.act +verb.cognition +verb.communication +noun.artifact +noun.time +verb.motion +verb.cognition +adj.mental +adj.all +verb.cognition +verb.act +verb.social +verb.static +noun.artifact +noun.act +noun.communication +verb.act +noun.person +verb.act +verb.social +noun.body +noun.artifact +noun.food +noun.artifact +noun.TOP +verb.motion +noun.TOP +verb.cognition +noun.group +noun.quantity +noun.artifact +adj.phys +adj.all +verb.act +verb.social +verb.static +verb.motion +noun.artifact +verb.emotion +verb.communication +noun.act +noun.artifact +noun.feeling +noun.artifact +noun.group +noun.artifact +noun.artifact +noun.motive +noun.act +verb.motion +noun.artifact +verb.static +noun.time +noun.time +verb.social +verb.static +verb.motion +noun.artifact +noun.artifact +noun.artifact +noun.phenomenon +noun.group +noun.artifact +verb.social +verb.motion +noun.person +noun.phenomenon +noun.location +noun.communication +noun.act +adj.phys +noun.cognition +noun.person +noun.act +verb.motion +noun.artifact +verb.social +verb.act +verb.communication +noun.act +noun.communication +noun.artifact +verb.weather +verb.act +noun.artifact +verb.social +noun.act +noun.cognition +noun.person +adj.phys +verb.static +verb.cognition +verb.act +verb.emotion +noun.artifact +noun.institution +verb.social +verb.static +adj.phys +noun.artifact +noun.motive +verb.cognition +verb.act +verb.cognition +verb.act +verb.emotion +noun.motive +noun.artifact +noun.TOP +verb.cognition +verb.act +verb.act +noun.person +verb.communication +noun.artifact +noun.artifact +noun.phenomenon +verb.act +adj.phys +noun.communication +verb.social +noun.act +noun.cognition +noun.person +adj.all +verb.cognition +verb.act +verb.emotion +adj.phys +verb.cognition +verb.communication +verb.cognition +verb.motion +verb.cognition +verb.static +verb.act +verb.cognition +noun.quantity +verb.act +noun.location +verb.act +noun.person +verb.communication +noun.cognition +noun.feeling +verb.act +verb.social +verb.static +noun.person +verb.motion +verb.act +verb.static +noun.animal +noun.artifact +adj.phys +noun.group +noun.artifact +noun.artifact +noun.person +verb.social +verb.static +adj.phys +noun.attribute +noun.artifact +noun.substance +verb.motion +noun.artifact +verb.emotion +verb.communication +adj.phys +noun.person +verb.cognition +verb.communication +verb.social +noun.act +noun.cognition +noun.person +adj.phys +verb.static +verb.cognition +verb.act +verb.emotion +noun.artifact +noun.institution +verb.social +verb.static +adj.phys +noun.artifact +noun.motive +verb.cognition +verb.act +verb.cognition +verb.act +verb.emotion +noun.motive +noun.artifact +noun.TOP +verb.cognition +verb.act +verb.act +noun.person +verb.communication +noun.artifact +noun.artifact +noun.phenomenon +verb.act +adj.phys +noun.communication +verb.social +noun.act +noun.cognition +noun.person +adj.all +verb.cognition +verb.act +verb.emotion +adj.phys +verb.cognition +verb.communication +verb.cognition +verb.motion +verb.cognition +verb.static +verb.act +verb.cognition +noun.quantity +verb.act +noun.location +verb.act +noun.person +verb.communication +noun.cognition +noun.feeling +verb.act +verb.social +verb.static +noun.person +verb.motion +verb.act +verb.static +noun.animal +noun.artifact +adj.phys +noun.group +noun.artifact +noun.artifact +noun.person +verb.social +verb.static +adj.phys +noun.attribute +noun.artifact +noun.substance +verb.motion +noun.artifact +verb.emotion +verb.communication +adj.phys +noun.person +verb.cognition +verb.communication +noun.act +noun.artifact +verb.social +verb.act +noun.artifact +noun.phenomenon +verb.cognition +noun.artifact +verb.motion +verb.act +verb.static +noun.artifact +noun.artifact +noun.TOP +noun.person +noun.animal +noun.person +verb.social +noun.quantity +verb.weather +noun.attribute +verb.cognition +verb.act +noun.artifact +noun.person +noun.act +noun.artifact +verb.social +verb.act +noun.artifact +noun.phenomenon +verb.cognition +noun.artifact +verb.motion +verb.act +verb.static +noun.artifact +noun.artifact +noun.TOP +noun.person +noun.animal +noun.person +verb.social +noun.quantity +verb.weather +noun.attribute +verb.cognition +verb.act +noun.artifact +noun.person +noun.cognition +noun.phenomenon +verb.cognition +verb.act +adj.time +verb.communication +noun.artifact +noun.person +verb.cognition +verb.act +noun.artifact +verb.social +verb.static +adj.phys +noun.feeling +verb.social +verb.motion +noun.location +adj.all +noun.act +verb.cognition +verb.communication +noun.artifact +noun.time +verb.motion +verb.cognition +adj.mental +adj.all +verb.cognition +verb.act +verb.social +verb.static +noun.artifact +noun.act +noun.communication +verb.act +noun.person +verb.act +verb.social +noun.body +noun.artifact +noun.food +noun.artifact +noun.TOP +verb.motion +noun.TOP +verb.cognition +noun.group +noun.quantity +noun.artifact +adj.phys +adj.all +verb.act +verb.social +verb.static +verb.motion +noun.artifact +verb.emotion +verb.communication +noun.cognition +noun.phenomenon +verb.cognition +verb.act +adj.time +verb.communication +noun.artifact +noun.person +verb.cognition +verb.act +noun.artifact +verb.social +verb.static +adj.phys +noun.feeling +verb.social +verb.motion +noun.location +adj.all +noun.act +verb.cognition +verb.communication +noun.artifact +noun.time +verb.motion +verb.cognition +adj.mental +adj.all +verb.cognition +verb.act +verb.social +verb.static +noun.artifact +noun.act +noun.communication +verb.act +noun.person +verb.act +verb.social +noun.body +noun.artifact +noun.food +noun.artifact +noun.TOP +verb.motion +noun.TOP +verb.cognition +noun.group +noun.quantity +noun.artifact +adj.phys +adj.all +verb.act +verb.social +verb.static +verb.motion +noun.artifact +verb.emotion +verb.communication +noun.act +noun.artifact +noun.feeling +noun.artifact +noun.group +noun.artifact +noun.artifact +noun.motive +noun.act +noun.artifact +noun.feeling +noun.artifact +noun.group +noun.artifact +noun.artifact +noun.motive +noun.act +verb.motion +noun.artifact +verb.static +noun.time +noun.time +verb.social +verb.static +verb.motion +noun.artifact +noun.artifact +noun.artifact +noun.phenomenon +noun.group +noun.artifact +verb.social +verb.motion +noun.person +noun.phenomenon +noun.location +noun.communication +noun.act +adj.phys +noun.cognition +noun.person +noun.act +verb.motion +noun.artifact +verb.social +verb.act +verb.communication +noun.act +noun.communication +noun.artifact +verb.weather +verb.act +noun.artifact +noun.act +verb.motion +noun.artifact +verb.static +noun.time +noun.time +verb.social +verb.static +verb.motion +noun.artifact +noun.artifact +noun.artifact +noun.phenomenon +noun.group +noun.artifact +verb.social +verb.motion +noun.person +noun.phenomenon +noun.location +noun.communication +noun.act +adj.phys +noun.cognition +noun.person +noun.act +verb.motion +noun.artifact +verb.social +verb.act +verb.communication +noun.act +noun.communication +noun.artifact +verb.weather +verb.act +noun.artifact + +
+
+ diff --git a/dkpro-core-io-webanno-asl/suppressions.xml b/dkpro-core-io-webanno-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-io-webanno-asl/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + diff --git a/dkpro-core-io-xces-asl/pom.xml b/dkpro-core-io-xces-asl/pom.xml index cfe3c7a1c2..308ffc9dfb 100644 --- a/dkpro-core-io-xces-asl/pom.xml +++ b/dkpro-core-io-xces-asl/pom.xml @@ -18,15 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core-asl - 1.10.0-SNAPSHOT + org.dkpro.core + dkpro-core-asl + 2.3.0-SNAPSHOT ../dkpro-core-asl - org.dkpro.core dkpro-core-io-xces-asl jar DKPro Core ASL - IO - XCES-XML + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -53,33 +53,53 @@ - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl + org.dkpro.core + dkpro-core-api-lexmorph-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl + org.dkpro.core + dkpro-core-api-segmentation-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl - + org.dkpro.core + dkpro-core-api-parameter-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api + + + javax.xml.bind + jaxb-api + + + com.sun.xml.bind + jaxb-core + + + com.sun.xml.bind + jaxb-impl + + + javax.activation + javax.activation-api + junit junit test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test @@ -101,15 +121,15 @@ org.apache.maven.plugins maven-dependency-plugin - - - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-parser-en-chunking + + com.sun.xml.bind:jaxb-core + com.sun.xml.bind:jaxb-impl + javax.activation:javax.activation-api diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesBasicXmlReader.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesBasicXmlReader.java index 2b914e8223..7ca986b7bb 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesBasicXmlReader.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesBasicXmlReader.java @@ -34,21 +34,30 @@ import javax.xml.stream.events.XMLEvent; import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; import org.dkpro.core.io.xces.models.XcesBodyBasic; import org.dkpro.core.io.xces.models.XcesParaBasic; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import eu.openminted.share.annotations.api.DocumentationResource; +/** + * Reader for the basic XCES XML format. + */ +@ResourceMetaData(name = "XCES Basic XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph"}) +@MimeTypeCapability({MimeTypes.APPLICATION_X_XCES_BASIC}) public class XcesBasicXmlReader extends JCasResourceCollectionReader_ImplBase { - @Override public void getNext(JCas aJCas) throws IOException, CollectionException @@ -70,6 +79,7 @@ public void getNext(JCas aJCas) unmarshallerBasic.setEventHandler(new ValidationEventHandler() { + @Override public boolean handleEvent(ValidationEvent event) { throw new RuntimeException(event.getMessage(), event.getLinkedException()); diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesBasicXmlWriter.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesBasicXmlWriter.java index f69d0bf0b2..920b37854e 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesBasicXmlWriter.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesBasicXmlWriter.java @@ -35,34 +35,47 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; import org.dkpro.core.io.xces.models.XcesBodyBasic; import org.dkpro.core.io.xces.models.XcesParaBasic; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import eu.openminted.share.annotations.api.DocumentationResource; import javanet.staxutils.IndentingXMLEventWriter; -@ResourceMetaData(name="XCES Basic XML Writer") +/** + * Writer for the basic XCES XML format. + */ +@ResourceMetaData(name = "XCES Basic XML Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_X_XCES_BASIC}) @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph" }) -public class XcesBasicXmlWriter extends JCasFileWriter_ImplBase +public class XcesBasicXmlWriter + extends JCasFileWriter_ImplBase { - - public static final String PARAM_FILENAME_SUFFIX = "filenameSuffix"; - @ConfigurationParameter(name = PARAM_FILENAME_SUFFIX, mandatory = true, defaultValue = ".xml") - private String filenameSuffix; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml") + private String filenameExtension; /** * Character encoding of the output data. */ - public static final String PARAM_TARGET_ENCODING = "targetEncoding"; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; @Override @@ -72,7 +85,7 @@ public void process(JCas aJCas) OutputStream docOS = null; XMLEventWriter xmlEventWriter = null; try { - docOS = getOutputStream(aJCas, filenameSuffix); + docOS = getOutputStream(aJCas, filenameExtension); XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance(); xmlEventWriter = new IndentingXMLEventWriter( xmlOutputFactory.createXMLEventWriter(docOS, targetEncoding)); @@ -94,7 +107,8 @@ public void process(JCas aJCas) // Begin body of all the paragraphs Collection parasInCas = JCasUtil.select(aJCas, Paragraph.class); XcesBodyBasic xb = convertToXcesBasicPara(parasInCas); - marshaller.marshal(new JAXBElement(new QName("body"), XcesBodyBasic.class, xb), + marshaller.marshal( + new JAXBElement(new QName("body"), XcesBodyBasic.class, xb), xmlEventWriter); // End body of all the paragraphs // xmlEventWriter.add(xmlef.createEndElement("", "", "body")); @@ -127,7 +141,7 @@ private XcesBodyBasic convertToXcesBasicPara(Collection parasInCas) for (Paragraph p : parasInCas) { XcesParaBasic para = new XcesParaBasic(); para.s = p.getCoveredText(); - para.id= "p"+Integer.toString(paraNo); + para.id = "p" + Integer.toString(paraNo); paraList.add(para); paraNo++; } diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesXmlReader.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesXmlReader.java index eb750b286b..5dbfa858da 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesXmlReader.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesXmlReader.java @@ -34,26 +34,38 @@ import javax.xml.stream.events.XMLEvent; import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; import org.dkpro.core.io.xces.models.XcesBody; import org.dkpro.core.io.xces.models.XcesPara; import org.dkpro.core.io.xces.models.XcesSentence; import org.dkpro.core.io.xces.models.XcesToken; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; -@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) +/** + * Reader for the XCES XML format. + */ +@ResourceMetaData(name = "XCES XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) +@MimeTypeCapability({MimeTypes.APPLICATION_X_XCES}) public class XcesXmlReader extends JCasResourceCollectionReader_ImplBase { @@ -62,12 +74,10 @@ public class XcesXmlReader public void getNext(JCas aJCas) throws IOException, CollectionException { - Resource res = nextFile(); initCas(aJCas, res); InputStream is = null; - try { is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()); @@ -79,6 +89,7 @@ public void getNext(JCas aJCas) unmarshaller.setEventHandler(new ValidationEventHandler() { + @Override public boolean handleEvent(ValidationEvent event) { throw new RuntimeException(event.getMessage(), event.getLinkedException()); @@ -103,10 +114,8 @@ public boolean handleEvent(ValidationEvent event) else { xmlEventReader.next(); } - } jb.close(); - } catch (XMLStreamException ex1) { throw new IOException(ex1); @@ -117,7 +126,6 @@ public boolean handleEvent(ValidationEvent event) finally { closeQuietly(is); } - } private void readPara(JCasBuilder jb, Object bodyObj) @@ -160,8 +168,9 @@ private void readPara(JCasBuilder jb, Object bodyObj) token.setPos(pos); } sentEnd = jb.getPosition(); - if (tnext == null) + if (tnext == null) { jb.add("\n"); + } if (tnext != null) { jb.add(" "); } @@ -174,15 +183,12 @@ private void readPara(JCasBuilder jb, Object bodyObj) para.addToIndexes(); jb.add("\n"); } - } } public static boolean isStartElement(XMLEvent aEvent, String aElement) { - return aEvent.isStartElement() && ((StartElement) aEvent).getName().getLocalPart().equals(aElement); } - -} \ No newline at end of file +} diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesXmlWriter.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesXmlWriter.java index aaae8042ae..7bc8c59b63 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesXmlWriter.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/XcesXmlWriter.java @@ -35,23 +35,30 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; import org.dkpro.core.io.xces.models.XcesBody; import org.dkpro.core.io.xces.models.XcesPara; import org.dkpro.core.io.xces.models.XcesSentence; import org.dkpro.core.io.xces.models.XcesToken; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; import javanet.staxutils.IndentingXMLEventWriter; -@ResourceMetaData(name="XCES XML Writer") +/** + * Writer for the XCES XML format. + */ +@ResourceMetaData(name = "XCES XML Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -59,18 +66,24 @@ "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph" }) +@MimeTypeCapability({MimeTypes.APPLICATION_X_XCES}) public class XcesXmlWriter extends JCasFileWriter_ImplBase { - public static final String PARAM_FILENAME_SUFFIX = "filenameSuffix"; - @ConfigurationParameter(name = PARAM_FILENAME_SUFFIX, mandatory = true, defaultValue = ".xml") - private String filenameSuffix; + /** + * Use this filename extension. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml") + private String filenameExtension; /** * Character encoding of the output data. */ public static final String PARAM_TARGET_ENCODING = "targetEncoding"; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) + @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) private String targetEncoding; @Override @@ -80,7 +93,7 @@ public void process(JCas aJCas) OutputStream docOS = null; XMLEventWriter xmlEventWriter = null; try { - docOS = getOutputStream(aJCas, filenameSuffix); + docOS = getOutputStream(aJCas, filenameExtension); XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance(); xmlEventWriter = new IndentingXMLEventWriter( xmlOutputFactory.createXMLEventWriter(docOS, targetEncoding)); diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesBody.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesBody.java index 7f80d4b5ff..c5e05e5fb4 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesBody.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesBody.java @@ -25,6 +25,6 @@ @XmlRootElement public class XcesBody { - @XmlElement(name="p",type=XcesPara.class) + @XmlElement(name = "p", type = XcesPara.class) public List p; } diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesBodyBasic.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesBodyBasic.java index 5473f79ce8..b5c6f558ab 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesBodyBasic.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesBodyBasic.java @@ -24,7 +24,5 @@ @XmlRootElement public class XcesBodyBasic { - public List p; - } diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesPara.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesPara.java index 6bfdae1858..6535acca94 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesPara.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesPara.java @@ -23,9 +23,10 @@ import javax.xml.bind.annotation.XmlID; public class XcesPara -{ +{ @XmlID @XmlAttribute - public String id; - public List s; + public String id; + + public List s; } diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesParaBasic.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesParaBasic.java index 1794239721..278f21bbd5 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesParaBasic.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesParaBasic.java @@ -21,13 +21,12 @@ import javax.xml.bind.annotation.XmlID; import javax.xml.bind.annotation.XmlValue; - public class XcesParaBasic -{ - +{ @XmlID @XmlAttribute - public String id; + public String id; + @XmlValue - public String s; + public String s; } diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesSentence.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesSentence.java index 9984bc8caa..b3dea557f7 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesSentence.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesSentence.java @@ -28,6 +28,7 @@ public class XcesSentence @XmlID @XmlAttribute public String id; - @XmlElement(name="t") + + @XmlElement(name = "t") public List xcesTokens; } diff --git a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesToken.java b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesToken.java index a2fd554a8a..e48adefa19 100644 --- a/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesToken.java +++ b/dkpro-core-io-xces-asl/src/main/java/org/dkpro/core/io/xces/models/XcesToken.java @@ -22,14 +22,16 @@ public class XcesToken { - @XmlID + @XmlID @XmlAttribute public String id; - @XmlAttribute(name="word") + + @XmlAttribute(name = "word") public String word; - @XmlAttribute(name="tag") + + @XmlAttribute(name = "tag") public String tag; - @XmlAttribute(name="lemma") + + @XmlAttribute(name = "lemma") public String lemma; - } diff --git a/dkpro-core-io-xces-asl/src/test/java/org/dkpro/core/io/xces/XcesXmlReaderTest.java b/dkpro-core-io-xces-asl/src/test/java/org/dkpro/core/io/xces/XcesXmlReaderTest.java index e5be8add62..b7b7094360 100644 --- a/dkpro-core-io-xces-asl/src/test/java/org/dkpro/core/io/xces/XcesXmlReaderTest.java +++ b/dkpro-core-io-xces-asl/src/test/java/org/dkpro/core/io/xces/XcesXmlReaderTest.java @@ -17,21 +17,20 @@ */ package org.dkpro.core.io.xces; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testOneWay; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.testing.IOTestRunner.testOneWay; import org.junit.Test; public class XcesXmlReaderTest { - @Test public void xcesOneWayBasicTest() throws Exception { testOneWay( createReaderDescription(XcesBasicXmlReader.class, - XcesBasicXmlReader.PARAM_SOURCE_LOCATION, "src/test/resources/", + XcesBasicXmlReader.PARAM_SOURCE_LOCATION, "src/test/resources/", XcesBasicXmlReader.PARAM_LANGUAGE, "el"), "xces-basic.xml.dump", "xces-basic.xml"); } @@ -42,9 +41,8 @@ public void xcesOneWayTest() { testOneWay( createReaderDescription(XcesXmlReader.class, - XcesXmlReader.PARAM_SOURCE_LOCATION, "src/test/resources/", + XcesXmlReader.PARAM_SOURCE_LOCATION, "src/test/resources/", XcesXmlReader.PARAM_LANGUAGE, "el"), "xces-complex.xml.dump", "xces-complex.xml"); } - } diff --git a/dkpro-core-io-xces-asl/src/test/java/org/dkpro/core/io/xces/XcesXmlReaderWriterTest.java b/dkpro-core-io-xces-asl/src/test/java/org/dkpro/core/io/xces/XcesXmlReaderWriterTest.java index 982814af91..4f42db0933 100644 --- a/dkpro-core-io-xces-asl/src/test/java/org/dkpro/core/io/xces/XcesXmlReaderWriterTest.java +++ b/dkpro-core-io-xces-asl/src/test/java/org/dkpro/core/io/xces/XcesXmlReaderWriterTest.java @@ -29,9 +29,9 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; import org.custommonkey.xmlunit.XMLAssert; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class XcesXmlReaderWriterTest { diff --git a/dkpro-core-io-xces-asl/src/test/resources/log4j.properties b/dkpro-core-io-xces-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-xces-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-xces-asl/src/test/resources/log4j2.xml b/dkpro-core-io-xces-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-xces-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-xces-asl/src/test/resources/xces-complex.xml.dump b/dkpro-core-io-xces-asl/src/test/resources/xces-complex.xml.dump index ca62262497..ace17c7de7 100644 --- a/dkpro-core-io-xces-asl/src/test/resources/xces-complex.xml.dump +++ b/dkpro-core-io-xces-asl/src/test/resources/xces-complex.xml.dump @@ -57,6 +57,7 @@ Token begin: 0 end: 4 PosValue: "PnDmFe03SgNmXx" + order: 0 [είναι] POS sofa: _InitialView @@ -84,6 +85,7 @@ Token begin: 5 end: 10 PosValue: "VbMnIdPr03SgXxIpPvXx" + order: 0 [η] POS sofa: _InitialView @@ -111,6 +113,7 @@ Token begin: 11 end: 12 PosValue: "AtDfFeSgNm" + order: 0 [πρώτη] POS sofa: _InitialView @@ -138,6 +141,7 @@ Token begin: 13 end: 18 PosValue: "NmOdFeSgNmAj" + order: 0 [γραμμή] POS sofa: _InitialView @@ -165,6 +169,7 @@ Token begin: 19 end: 25 PosValue: "NoCmFeSgNm" + order: 0 [.] POS sofa: _InitialView @@ -192,6 +197,7 @@ Token begin: 26 end: 27 PosValue: "PTERM_P" + order: 0 [Αυτή είναι η πρώτη γραμμή . Αυτό είναι μια άλλη γραμμή .] Paragraph @@ -230,6 +236,7 @@ Token begin: 29 end: 33 PosValue: "PnDmFe03SgNmXx" + order: 0 [είναι] POS sofa: _InitialView @@ -257,6 +264,7 @@ Token begin: 34 end: 39 PosValue: "VbMnIdPr03SgXxIpPvXx" + order: 0 [η] POS sofa: _InitialView @@ -284,6 +292,7 @@ Token begin: 40 end: 41 PosValue: "AtDfFeSgNm" + order: 0 [πρώτη] POS sofa: _InitialView @@ -311,6 +320,7 @@ Token begin: 42 end: 47 PosValue: "NmOdFeSgNmAj" + order: 0 [γραμμή] POS sofa: _InitialView @@ -338,6 +348,7 @@ Token begin: 48 end: 54 PosValue: "NoCmFeSgNm" + order: 0 [.] POS sofa: _InitialView @@ -365,6 +376,7 @@ Token begin: 55 end: 56 PosValue: "PTERM_P" + order: 0 [Αυτό είναι μια άλλη γραμμή .] Sentence sofa: _InitialView @@ -397,6 +409,7 @@ Token begin: 57 end: 61 PosValue: "PnDmNe03SgNmXx" + order: 0 [είναι] POS sofa: _InitialView @@ -424,6 +437,7 @@ Token begin: 62 end: 67 PosValue: "VbMnIdPr03SgXxIpPvXx" + order: 0 [μια] POS sofa: _InitialView @@ -451,6 +465,7 @@ Token begin: 68 end: 71 PosValue: "AtIdFeSgNm" + order: 0 [άλλη] POS sofa: _InitialView @@ -478,6 +493,7 @@ Token begin: 72 end: 76 PosValue: "PnIdFe03SgNmXx" + order: 0 [γραμμή] POS sofa: _InitialView @@ -505,6 +521,7 @@ Token begin: 77 end: 83 PosValue: "NoCmFeSgNm" + order: 0 [.] POS sofa: _InitialView @@ -532,6 +549,7 @@ Token begin: 84 end: 85 PosValue: "PTERM_P" + order: 0 [Γίνε η αλλαγή που θέλεις να δεις στον κόσμο .] Paragraph sofa: _InitialView @@ -569,6 +587,7 @@ Token begin: 87 end: 91 PosValue: "VbMnMpXx02SgXxPePvXx" + order: 0 [η] POS sofa: _InitialView @@ -596,6 +615,7 @@ Token begin: 92 end: 93 PosValue: "AtDfFeSgNm" + order: 0 [αλλαγή] POS sofa: _InitialView @@ -623,6 +643,7 @@ Token begin: 94 end: 100 PosValue: "NoCmFeSgNm" + order: 0 [που] POS sofa: _InitialView @@ -650,6 +671,7 @@ Token begin: 101 end: 104 PosValue: "PnReFe03SgNmXx" + order: 0 [θέλεις] POS sofa: _InitialView @@ -677,6 +699,7 @@ Token begin: 105 end: 111 PosValue: "VbMnIdPr02SgXxIpAvXx" + order: 0 [να] POS sofa: _InitialView @@ -704,6 +727,7 @@ Token begin: 112 end: 114 PosValue: "PtSj" + order: 0 [δεις] POS sofa: _InitialView @@ -731,6 +755,7 @@ Token begin: 115 end: 119 PosValue: "VbMnIdXx02SgXxPeAvXx" + order: 0 [στον] POS sofa: _InitialView @@ -758,6 +783,7 @@ Token begin: 120 end: 124 PosValue: "AsPpPaMaSgAc" + order: 0 [κόσμο] POS sofa: _InitialView @@ -785,6 +811,7 @@ Token begin: 125 end: 130 PosValue: "NoCmMaSgAc" + order: 0 [.] POS sofa: _InitialView @@ -812,6 +839,7 @@ Token begin: 131 end: 132 PosValue: "PTERM_P" + order: 0 -------- View _InitialView end ---------------------------------- ======== CAS 0 end ================================== \ No newline at end of file diff --git a/dkpro-core-io-xmi-asl/pom.xml b/dkpro-core-io-xmi-asl/pom.xml index 871d96521e..b2d8b7f88e 100644 --- a/dkpro-core-io-xmi-asl/pom.xml +++ b/dkpro-core-io-xmi-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.xmi-asl + dkpro-core-io-xmi-asl jar DKPro Core ASL - IO - UIMA XMI + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -40,29 +41,33 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.text-asl + eu.openminted.share.annotations + omtd-share-annotations-api + + + org.dkpro.core + dkpro-core-io-text-asl test - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.testing-asl + org.dkpro.core + dkpro-core-testing-asl test @@ -70,5 +75,10 @@ junit test + + org.assertj + assertj-core + test + diff --git a/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiReader.java b/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiReader.java deleted file mode 100644 index a27ca6ce95..0000000000 --- a/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiReader.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xmi; - -import java.io.IOException; -import java.io.InputStream; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.impl.XmiCasDeserializer; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.xml.sax.SAXException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; - -/** - * Reader for UIMA XMI files. - */ -@ResourceMetaData(name="UIMA XMI CAS Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_VND_XMI_XML, MimeTypes.APPLICATION_X_UIMA_XMI}) -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) -public class XmiReader - extends ResourceCollectionReaderBase -{ - /** - * In lenient mode, unknown types are ignored and do not cause an exception to be thrown. - */ - public static final String PARAM_LENIENT = "lenient"; - @ConfigurationParameter(name=PARAM_LENIENT, mandatory=true, defaultValue="false") - private boolean lenient; - - /** - * Add DKPro Core metadata if it is not already present in the document. - */ - public static final String PARAM_ADD_DOCUMENT_METADATA = "addDocumentMetadata"; - @ConfigurationParameter(name=PARAM_ADD_DOCUMENT_METADATA, mandatory=true, defaultValue="true") - private boolean addDocumentMetadata; - - /** - * Generate new DKPro Core document metadata (i.e. title, ID, URI) for the document instead - * of retaining what is already present in the XMI file. - */ - public static final String PARAM_OVERRIDE_DOCUMENT_METADATA = "overrideDocumentMetadata"; - @ConfigurationParameter(name=PARAM_OVERRIDE_DOCUMENT_METADATA, mandatory=true, defaultValue="false") - private boolean overrideDocumentMetadata; - - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - Resource res = nextFile(); - - // Read XMI file - try (InputStream is = CompressionUtils.getInputStream(res.getLocation(), - res.getInputStream())) { - XmiCasDeserializer.deserialize(is, aCAS, lenient); - } - catch (SAXException e) { - throw new IOException(e); - } - - // Handle DKPro Core DocumentMetaData - AnnotationFS docAnno = aCAS.getDocumentAnnotation(); - if (docAnno.getType().getName().equals(DocumentMetaData.class.getName())) { - if (overrideDocumentMetadata) { - // Unless the language is explicity set on the reader, try to retain the language - // already present in the XMI file. - String language = getLanguage(); - if (language == null) { - language = aCAS.getDocumentLanguage(); - } - aCAS.removeFsFromIndexes(docAnno); - - initCas(aCAS, res); - - aCAS.setDocumentLanguage(language); - } - } - else if (addDocumentMetadata) { - initCas(aCAS, res); - } - } -} diff --git a/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiWriter.java b/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiWriter.java deleted file mode 100644 index d668e63463..0000000000 --- a/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiWriter.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xmi; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CASRuntimeException; -import org.apache.uima.cas.impl.XmiCasSerializer; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.TypeSystemUtil; -import org.xml.sax.SAXException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; - -/** - * UIMA XMI format writer. - */ -@ResourceMetaData(name="UIMA XMI CAS Writer") -@MimeTypeCapability({MimeTypes.APPLICATION_VND_XMI_XML, MimeTypes.APPLICATION_X_UIMA_XMI}) -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) -public class XmiWriter - extends JCasFileWriter_ImplBase -{ - public static final String PARAM_PRETTY_PRINT = "prettyPrint"; - @ConfigurationParameter(name = PARAM_PRETTY_PRINT, mandatory = true, defaultValue = "true") - private boolean prettyPrint; - - /** - * Location to write the type system to. If this is not set, a file called typesystem.xml will - * be written to the XMI output path. If this is set, it is expected to be a file relative - * to the current work directory or an absolute file. - *
- * If this parameter is set, the {@link #PARAM_COMPRESSION} parameter has no effect on the - * type system. Instead, if the file name ends in ".gz", the file will be compressed, - * otherwise not. - */ - public static final String PARAM_TYPE_SYSTEM_FILE = "typeSystemFile"; - @ConfigurationParameter(name=PARAM_TYPE_SYSTEM_FILE, mandatory=false) - private File typeSystemFile; - - /** - * Specify the suffix of output files. Default value .xmi. If the suffix is not - * needed, provide an empty string as value. - */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xmi") - private String filenameSuffix; - - private boolean typeSystemWritten; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - typeSystemWritten = false; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { - XmiCasSerializer.serialize(aJCas.getCas(), null, docOS, prettyPrint, null); - - if (!typeSystemWritten) { - writeTypeSystem(aJCas); - typeSystemWritten = true; - } - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } - - private void writeTypeSystem(JCas aJCas) - throws IOException, CASRuntimeException, SAXException - { - @SuppressWarnings("resource") - OutputStream typeOS = null; - - try { - if (typeSystemFile != null) { - typeOS = CompressionUtils.getOutputStream(typeSystemFile); - } - else { - typeOS = getOutputStream("TypeSystem", ".xml"); - } - - TypeSystemUtil.typeSystem2TypeSystemDescription(aJCas.getTypeSystem()).toXML(typeOS); - } - finally { - closeQuietly(typeOS); - } - } -} diff --git a/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/package-info.java b/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/package-info.java deleted file mode 100644 index a5d81b6333..0000000000 --- a/dkpro-core-io-xmi-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for (de)serializing the CAS to/from XMI files. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.xmi; diff --git a/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/XmiReader.java b/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/XmiReader.java new file mode 100644 index 0000000000..f84e26f9e2 --- /dev/null +++ b/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/XmiReader.java @@ -0,0 +1,175 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xmi; + +import static java.util.Objects.nonNull; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.UIMAFramework; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.CASMgrSerializer; +import org.apache.uima.cas.impl.Serialization; +import org.apache.uima.cas.impl.XmiCasDeserializer; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.apache.uima.util.CasCreationUtils; +import org.apache.uima.util.InvalidXMLException; +import org.apache.uima.util.TypeSystemUtil; +import org.apache.uima.util.XMLInputSource; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.xml.sax.SAXException; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; + +/** + * Reader for UIMA XMI files. + */ +@ResourceMetaData(name = "UIMA XMI CAS Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, + ResourceCollectionReaderBase.PARAM_INCLUDE_HIDDEN, + ResourceCollectionReaderBase.PARAM_USE_DEFAULT_EXCLUDES, + ResourceCollectionReaderBase.PARAM_LOG_FREQ, + XmiReader.PARAM_TYPE_SYSTEM_FILE }) +@MimeTypeCapability({MimeTypes.APPLICATION_VND_XMI_XML, MimeTypes.APPLICATION_X_UIMA_XMI}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) +public class XmiReader + extends ResourceCollectionReaderBase +{ + /** + * In lenient mode, unknown types are ignored and do not cause an exception to be thrown. + */ + public static final String PARAM_LENIENT = "lenient"; + @ConfigurationParameter(name = PARAM_LENIENT, mandatory = true, defaultValue = "false") + private boolean lenient; + + /** + * Add DKPro Core metadata if it is not already present in the document. + */ + public static final String PARAM_ADD_DOCUMENT_METADATA = "addDocumentMetadata"; + @ConfigurationParameter(name = PARAM_ADD_DOCUMENT_METADATA, mandatory = true, defaultValue = "true") + private boolean addDocumentMetadata; + + /** + * Generate new DKPro Core document metadata (i.e. title, ID, URI) for the document instead + * of retaining what is already present in the XMI file. + */ + public static final String PARAM_OVERRIDE_DOCUMENT_METADATA = "overrideDocumentMetadata"; + @ConfigurationParameter(name = PARAM_OVERRIDE_DOCUMENT_METADATA, mandatory = true, defaultValue = "false") + private boolean overrideDocumentMetadata; + + /** + * Determines whether the type system from a currently read file should be merged + * with the current type system. + */ + public static final String PARAM_MERGE_TYPE_SYSTEM = "mergeTypeSystem"; + @ConfigurationParameter(name = PARAM_MERGE_TYPE_SYSTEM, mandatory = true, defaultValue = "false") + private boolean mergeTypeSystem; + + /** + * If a type system is specified, then the type system already in the CAS is replaced + * by this one. Except if {@link XmiReader#PARAM_MERGE_TYPE_SYSTEM} is enabled, in which + * case it will be merged with the type system already present in the CAS. + */ + public static final String PARAM_TYPE_SYSTEM_FILE = "typeSystemFile"; + @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_FILE, mandatory = false) + private File typeSystemFile; + + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + if (nonNull(typeSystemFile)) { + try { + List tsds = new ArrayList<>(); + if (mergeTypeSystem) { + tsds.add(TypeSystemUtil.typeSystem2TypeSystemDescription(aCAS.getTypeSystem())); + } + tsds.add(UIMAFramework.getXMLParser() + .parseTypeSystemDescription(new XMLInputSource(typeSystemFile))); + TypeSystemDescription merged = CasCreationUtils.mergeTypeSystems(tsds); + + // Create a temporary CAS with the merged TS + CAS mergedCas = CasCreationUtils.createCas(merged, null, null, null); + + // Create a holder for the CAS metadata + CASMgrSerializer casMgrSerializer = Serialization + .serializeCASMgr((CASImpl) mergedCas); + + // Reinitialize CAS with merged type system + ((CASImpl) aCAS).getBinaryCasSerDes() + .setupCasFromCasMgrSerializer(casMgrSerializer); + } + catch (InvalidXMLException | ResourceInitializationException e) { + throw new IOException(e); + } + } + + Resource res = nextFile(); + + // Read XMI file + try (InputStream is = CompressionUtils.getInputStream(res.getLocation(), + res.getInputStream())) { + XmiCasDeserializer.deserialize(is, aCAS, lenient); + } + catch (SAXException e) { + throw new IOException(e); + } + + // Handle DKPro Core DocumentMetaData + AnnotationFS docAnno = aCAS.getDocumentAnnotation(); + if (docAnno.getType().getName().equals(DocumentMetaData.class.getName())) { + if (overrideDocumentMetadata) { + // Unless the language is explicity set on the reader, try to retain the language + // already present in the XMI file. + String language = getLanguage(); + if (language == null) { + language = aCAS.getDocumentLanguage(); + } + aCAS.removeFsFromIndexes(docAnno); + + initCas(aCAS, res); + + aCAS.setDocumentLanguage(language); + } + } + else if (addDocumentMetadata) { + initCas(aCAS, res); + } + } +} diff --git a/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/XmiWriter.java b/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/XmiWriter.java new file mode 100644 index 0000000000..af5b38c20c --- /dev/null +++ b/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/XmiWriter.java @@ -0,0 +1,157 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xmi; + +import static java.util.Arrays.asList; +import static org.apache.commons.io.IOUtils.closeQuietly; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; + +import javax.xml.transform.OutputKeys; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CASRuntimeException; +import org.apache.uima.cas.impl.XmiCasSerializer; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.TypeSystemUtil; +import org.apache.uima.util.XMLSerializer; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.xml.sax.SAXException; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * UIMA XMI format writer. + */ +@ResourceMetaData(name = "UIMA XMI CAS Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_VND_XMI_XML, MimeTypes.APPLICATION_X_UIMA_XMI}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) +public class XmiWriter + extends JCasFileWriter_ImplBase +{ + /** + * Format and indent the XML. + */ + public static final String PARAM_PRETTY_PRINT = "prettyPrint"; + @ConfigurationParameter(name = PARAM_PRETTY_PRINT, mandatory = true, defaultValue = "true") + private boolean prettyPrint; + + /** + * Location to write the type system to. If this is not set, a file called typesystem.xml will + * be written to the XMI output path. If this is set, it is expected to be a file relative + * to the current work directory or an absolute file. + *
+ * If this parameter is set, the {@link #PARAM_COMPRESSION} parameter has no effect on the + * type system. Instead, if the file name ends in ".gz", the file will be compressed, + * otherwise not. + */ + public static final String PARAM_TYPE_SYSTEM_FILE = "typeSystemFile"; + @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_FILE, mandatory = false) + private File typeSystemFile; + + /** + * Specify the suffix of output files. Default value .xmi. If the suffix is not + * needed, provide an empty string as value. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xmi") + private String filenameSuffix; + + /** + * Defines the XML version used for serializing the data. The default is XML {@code "1.0"}. + * However, XML 1.0 does not support certain Unicode characters. To support a wider range of + * characters, you can switch this parameter to {@code "1.1"}. + */ + public static final String PARAM_VERSION = "version"; + @ConfigurationParameter(name = PARAM_VERSION, mandatory = true, defaultValue = "1.0") + private String version; + + + private boolean typeSystemWritten; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + if (!asList("1.0", "1.1").contains(version)) { + throw new ResourceInitializationException(new IllegalArgumentException( + "Invalid value for parameter version: [" + version + "]")); + } + + typeSystemWritten = false; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { + XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null); + XMLSerializer sax2xml = new XMLSerializer(docOS, prettyPrint); + sax2xml.setOutputProperty(OutputKeys.VERSION, version); + xmiCasSerializer.serialize(aJCas.getCas(), sax2xml.getContentHandler(), null, null, + null); + + if (!typeSystemWritten) { + writeTypeSystem(aJCas); + typeSystemWritten = true; + } + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + private void writeTypeSystem(JCas aJCas) + throws IOException, CASRuntimeException, SAXException + { + @SuppressWarnings("resource") + OutputStream typeOS = null; + + try { + if (typeSystemFile != null) { + typeOS = CompressionUtils.getOutputStream(typeSystemFile); + } + else { + typeOS = getOutputStream("TypeSystem", ".xml"); + } + + TypeSystemUtil.typeSystem2TypeSystemDescription(aJCas.getTypeSystem()).toXML(typeOS); + } + finally { + closeQuietly(typeOS); + } + } +} diff --git a/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/package-info.java b/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/package-info.java new file mode 100644 index 0000000000..346b517698 --- /dev/null +++ b/dkpro-core-io-xmi-asl/src/main/java/org/dkpro/core/io/xmi/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for (de)serializing the CAS to/from XMI files. + * + * @since 1.1.0 + */ +package org.dkpro.core.io.xmi; diff --git a/dkpro-core-io-xmi-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiWriterReaderTest.java b/dkpro-core-io-xmi-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiWriterReaderTest.java deleted file mode 100644 index b95178fe48..0000000000 --- a/dkpro-core-io-xmi-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiWriterReaderTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xmi; - -import static org.apache.commons.io.FileUtils.readFileToString; -import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.factory.AnalysisEngineFactory; -import org.apache.uima.fit.factory.CollectionReaderFactory; -import org.apache.uima.util.CasCreationUtils; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class XmiWriterReaderTest -{ - @Rule - public TemporaryFolder testFolder = new TemporaryFolder(); - - @Test - public void test() throws Exception - { - write(); - read(); - } - - public void write() throws Exception - { - CollectionReader textReader = CollectionReaderFactory.createReader( - TextReader.class, - ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts", - ResourceCollectionReaderBase.PARAM_PATTERNS, new String [] { - ResourceCollectionReaderBase.INCLUDE_PREFIX + "latin.txt" - }, - ResourceCollectionReaderBase.PARAM_LANGUAGE, "latin"); - - AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine( - XmiWriter.class, - XmiWriter.PARAM_TARGET_LOCATION, testFolder.getRoot().getPath()); - - runPipeline(textReader, xmiWriter); - - assertTrue(new File(testFolder.getRoot(), "latin.txt.xmi").exists()); - } - - public void read() throws Exception - { - CollectionReader xmiReader = CollectionReaderFactory.createReader( - XmiReader.class, - ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, testFolder.getRoot().getPath(), - ResourceCollectionReaderBase.PARAM_PATTERNS, new String [] { - ResourceCollectionReaderBase.INCLUDE_PREFIX+"*.xmi" - }); - - CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null); - xmiReader.getNext(cas); - - String refText = readFileToString(new File("src/test/resources/texts/latin.txt")); - assertEquals(refText, cas.getDocumentText()); - assertEquals("latin", cas.getDocumentLanguage()); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiReaderTest.java b/dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiReaderTest.java new file mode 100644 index 0000000000..58d6bb954c --- /dev/null +++ b/dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiReaderTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xmi; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.Assert.assertNull; + +import java.io.IOException; + +import org.apache.uima.cas.Type; +import org.apache.uima.cas.impl.XCASParsingException; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +public class XmiReaderTest +{ + @Test + public void testTypeSystemMerge() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + CollectionReader reader = createReader(XmiReader.class, + XmiReader.PARAM_SOURCE_LOCATION, "src/test/resources/xmi/english.xmi", + XmiReader.PARAM_TYPE_SYSTEM_FILE, "src/test/resources/ts/typesystem.xml", + XmiReader.PARAM_MERGE_TYPE_SYSTEM, true); + + reader.getNext(jcas.getCas()); + + Type spanType = jcas.getTypeSystem().getType("de.tudarmstadt.ukp.dkpro.core.io.xmi.Span"); + AnnotationFS span = jcas.getCas().createAnnotation(spanType, 0, 1); + jcas.getCas().addFsToIndexes(span); + } + + @Test + public void testNoTypeSystemMerge() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + CollectionReader reader = createReader(XmiReader.class, + XmiReader.PARAM_SOURCE_LOCATION, "src/test/resources/xmi/english.xmi"); + + reader.getNext(jcas.getCas()); + + Type spanType = jcas.getTypeSystem().getType("de.tudarmstadt.ukp.dkpro.core.io.xmi.Span"); + assertNull(spanType); + } + + @Test + public void testTypeSystemReplace() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + CollectionReader reader = createReader(XmiReader.class, + XmiReader.PARAM_SOURCE_LOCATION, "src/test/resources/xmi/english.xmi", + XmiReader.PARAM_TYPE_SYSTEM_FILE, "src/test/resources/ts/typesystem.xml"); + + assertThatThrownBy(() -> { + reader.getNext(jcas.getCas()); + }).isInstanceOf(IOException.class).hasCauseInstanceOf(XCASParsingException.class); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-xmi-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiReaderWriterTest.java b/dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiReaderWriterTest.java similarity index 97% rename from dkpro-core-io-xmi-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiReaderWriterTest.java rename to dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiReaderWriterTest.java index 1ffd9c6a77..b16c8418c0 100644 --- a/dkpro-core-io-xmi-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xmi/XmiReaderWriterTest.java +++ b/dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiReaderWriterTest.java @@ -15,12 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.xmi; +package org.dkpro.core.io.xmi; -import static de.tudarmstadt.ukp.dkpro.core.testing.IOTestRunner.testRoundTrip; import static org.apache.commons.io.FilenameUtils.separatorsToUnix; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; import static org.apache.uima.fit.util.FSUtil.getFeature; +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -28,11 +28,10 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.TOP; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class XmiReaderWriterTest { @Test diff --git a/dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiWriterReaderTest.java b/dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiWriterReaderTest.java new file mode 100644 index 0000000000..7bb67b83b0 --- /dev/null +++ b/dkpro-core-io-xmi-asl/src/test/java/org/dkpro/core/io/xmi/XmiWriterReaderTest.java @@ -0,0 +1,253 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xmi; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.commons.io.FileUtils.readFileToString; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.apache.uima.fit.factory.JCasFactory.createText; +import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.util.Collections; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.CasCreationUtils; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.xml.sax.XMLReader; +import org.xml.sax.helpers.XMLReaderFactory; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class XmiWriterReaderTest +{ + @Rule + public TemporaryFolder testFolder = new TemporaryFolder(); + + @Test + public void thatWritingAndReadingXML1_1works() throws Exception + { + File outputFolder = testContext.getTestOutputFolder(); + + JCas outDocument = createText( + readFileToString(new File("src/test/resources/texts/chinese.txt"), UTF_8), "zh"); + + DocumentMetaData dmd = DocumentMetaData.create(outDocument); + dmd.setDocumentId("output.xmi"); + + AnalysisEngine writer = createEngine(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, outputFolder, + XmiWriter.PARAM_STRIP_EXTENSION, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_OVERWRITE, true); + + writer.process(outDocument); + + JCas inDocument = JCasFactory.createJCas(); + + CollectionReader reader = createReader(XmiReader.class, + XmiReader.PARAM_SOURCE_LOCATION, new File(outputFolder, "output.xmi")); + reader.getNext(inDocument.getCas()); + + assertThat(outDocument.getDocumentText()).isEqualTo(inDocument.getDocumentText()); + } + + @Test + public void thatWritingAndReadingXML1_0ControlCharactersWorks() throws Exception + { + System.out.println(Collections.list( + getClass().getClassLoader().getResources("META-INF/services/org.xml.sax.driver"))); + XMLReader r = XMLReaderFactory.createXMLReader(); + System.out.printf("http://xml.org/sax/features/xml-1.1: %s%n", + r.getFeature("http://xml.org/sax/features/xml-1.1")); + + File outputFolder = testContext.getTestOutputFolder(); + + StringBuilder text = new StringBuilder(); + for (char ch = 0; ch < 0xFFFE; ch++) { + if ( + // These are rejected already by UIMA during serialization + (0x0000 <= ch && ch < 0x0009) || + (0x000B <= ch && ch < 0x000D) || + (0x000E <= ch && ch < 0x0020) || + (0xD800 <= ch && ch < 0xE000) + ) { + text.append(" "); + } + else { + text.append(ch); + } + } + + JCas outDocument = createText(text.toString(), "en"); + + DocumentMetaData dmd = DocumentMetaData.create(outDocument); + dmd.setDocumentId("output.xmi"); + + AnalysisEngine writer = createEngine(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, outputFolder, + XmiWriter.PARAM_STRIP_EXTENSION, true, + XmiWriter.PARAM_VERSION, "1.0", + XmiWriter.PARAM_OVERWRITE, true); + + writer.process(outDocument); + + JCas inDocument = JCasFactory.createJCas(); + + CollectionReader reader = createReader(XmiReader.class, + XmiReader.PARAM_SOURCE_LOCATION, new File(outputFolder, "output.xmi")); + reader.getNext(inDocument.getCas()); + + String expected = inDocument.getDocumentText(); + String actual = outDocument.getDocumentText(); + + assertThat(actual.length()) + .isEqualTo(expected.length()); + + for (int i = 0; i < expected.length(); i++) { + if (expected.charAt(i) != actual.charAt(i)) { + System.out.printf("[U+%04X] %d does not match expected %d%n", i, + (int) actual.charAt(i), (int) expected.charAt(i)); + } + } + + assertThat(outDocument.getDocumentText()).isEqualTo(inDocument.getDocumentText()); + } + @Test + public void thatWritingAndReadingXML1_1ControlCharactersWorks() throws Exception + { + System.out.println(Collections.list( + getClass().getClassLoader().getResources("META-INF/services/org.xml.sax.driver"))); + XMLReader r = XMLReaderFactory.createXMLReader(); + System.out.printf("http://xml.org/sax/features/xml-1.1: %s%n", + r.getFeature("http://xml.org/sax/features/xml-1.1")); + + File outputFolder = testContext.getTestOutputFolder(); + + StringBuilder text = new StringBuilder(); + for (char ch = 0; ch < 0xFFFE; ch++) { + if ( + // These are rejected already by UIMA during serialization + ch == 0x0000 || + (0xD800 <= ch && ch < 0xE000) || + // These are rejected during parsing by the XML parser + (0x007f <= ch && ch <= 0x0084) || + (0x0086 <= ch && ch <= 0x009F) || + // These are normalized to " " + ch == 0x0085 || ch == 0x2028 + ) { + text.append(" "); + } + else { + text.append(ch); + } + } + + JCas outDocument = createText(text.toString(), "en"); + + DocumentMetaData dmd = DocumentMetaData.create(outDocument); + dmd.setDocumentId("output.xmi"); + + AnalysisEngine writer = createEngine(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, outputFolder, + XmiWriter.PARAM_STRIP_EXTENSION, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_OVERWRITE, true); + + writer.process(outDocument); + + JCas inDocument = JCasFactory.createJCas(); + + CollectionReader reader = createReader(XmiReader.class, + XmiReader.PARAM_SOURCE_LOCATION, new File(outputFolder, "output.xmi")); + reader.getNext(inDocument.getCas()); + + String expected = inDocument.getDocumentText(); + String actual = outDocument.getDocumentText(); + + assertThat(actual.length()) + .isEqualTo(expected.length()); + + for (int i = 0; i < expected.length(); i++) { + if (expected.charAt(i) != actual.charAt(i)) { + System.out.printf("[U+%04X] %d does not match expected %d%n", i, + (int) actual.charAt(i), (int) expected.charAt(i)); + } + } + + assertThat(outDocument.getDocumentText()).isEqualTo(inDocument.getDocumentText()); + } + + @Test + public void test() throws Exception + { + write(); + read(); + } + + public void write() throws Exception + { + CollectionReader textReader = CollectionReaderFactory.createReader( + TextReader.class, + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts", + ResourceCollectionReaderBase.PARAM_PATTERNS, "latin.txt", + ResourceCollectionReaderBase.PARAM_LANGUAGE, "latin"); + + AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine( + XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, testFolder.getRoot().getPath()); + + runPipeline(textReader, xmiWriter); + + assertTrue(new File(testFolder.getRoot(), "latin.txt.xmi").exists()); + } + + public void read() throws Exception + { + CollectionReader xmiReader = CollectionReaderFactory.createReader( + XmiReader.class, + ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, testFolder.getRoot().getPath(), + ResourceCollectionReaderBase.PARAM_PATTERNS, "*.xmi"); + + CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null); + xmiReader.getNext(cas); + + String refText = readFileToString(new File("src/test/resources/texts/latin.txt")); + assertEquals(refText, cas.getDocumentText()); + assertEquals("latin", cas.getDocumentLanguage()); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-xmi-asl/src/test/resources/log4j.properties b/dkpro-core-io-xmi-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-io-xmi-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-io-xmi-asl/src/test/resources/log4j2.xml b/dkpro-core-io-xmi-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-io-xmi-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/dkpro-core-io-xmi-asl/src/test/resources/texts/chinese.txt b/dkpro-core-io-xmi-asl/src/test/resources/texts/chinese.txt new file mode 100644 index 0000000000..e0433463b0 --- /dev/null +++ b/dkpro-core-io-xmi-asl/src/test/resources/texts/chinese.txt @@ -0,0 +1 @@ +第四卷第一四二八页。   diff --git a/dkpro-core-io-xmi-asl/src/test/resources/ts/typesystem.xml b/dkpro-core-io-xmi-asl/src/test/resources/ts/typesystem.xml new file mode 100644 index 0000000000..9d1bf2c182 --- /dev/null +++ b/dkpro-core-io-xmi-asl/src/test/resources/ts/typesystem.xml @@ -0,0 +1,14 @@ + + + + + 1.0 + + + + de.tudarmstadt.ukp.dkpro.core.io.xmi.Span + + uima.tcas.Annotation + + + diff --git a/dkpro-core-io-xmi-asl/src/test/resources/xmi/english.xmi b/dkpro-core-io-xmi-asl/src/test/resources/xmi/english.xmi index 46a2c7f925..58697c5f30 100644 --- a/dkpro-core-io-xmi-asl/src/test/resources/xmi/english.xmi +++ b/dkpro-core-io-xmi-asl/src/test/resources/xmi/english.xmi @@ -1,250 +1,250 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dkpro-core-io-xml-asl/pom.xml b/dkpro-core-io-xml-asl/pom.xml index f9a13951c1..c9d82d0ac3 100644 --- a/dkpro-core-io-xml-asl/pom.xml +++ b/dkpro-core-io-xml-asl/pom.xml @@ -18,14 +18,15 @@ 4.0.0 - de.tudarmstadt.ukp.dkpro.core-asl - de.tudarmstadt.ukp.dkpro.core - 1.10.0-SNAPSHOT + dkpro-core-asl + org.dkpro.core + 2.3.0-SNAPSHOT ../dkpro-core-asl - de.tudarmstadt.ukp.dkpro.core.io.xml-asl + dkpro-core-io-xml-asl jar DKPro Core ASL - IO - XML + https://dkpro.github.io/dkpro-core/ org.apache.uima @@ -60,29 +61,42 @@ commons-io - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.io-asl + org.dkpro.core + dkpro-core-api-io-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.metadata-asl + org.dkpro.core + dkpro-core-api-metadata-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.resources-asl + org.dkpro.core + dkpro-core-api-resources-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.structure-asl + org.dkpro.core + dkpro-core-api-structure-asl - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.api.parameter-asl + org.dkpro.core + dkpro-core-api-parameter-asl + + + org.dkpro.core + dkpro-core-api-xml-asl + + + eu.openminted.share.annotations + omtd-share-annotations-api junit junit test + + org.dkpro.core + dkpro-core-testing-asl + test + diff --git a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/InlineXmlWriter.java b/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/InlineXmlWriter.java deleted file mode 100644 index 3a62500f24..0000000000 --- a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/InlineXmlWriter.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xml; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.net.URL; - -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerException; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.stream.StreamResult; -import javax.xml.transform.stream.StreamSource; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CASException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.CasToInlineXml; - -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; - -/** - * Writes an approximation of the content of a textual CAS as an inline XML file. Optionally applies - * an XSLT stylesheet. - *

- * Note this component inherits the restrictions from {@link CasToInlineXml}: - * - *

    - *
  • Features whose values are FeatureStructures are not represented.
  • - *
  • Feature values which are strings longer than 64 characters are truncated.
  • - *
  • Feature values which are arrays of primitives are represented by strings that look like [ - * xxx, xxx ]
  • - *
  • The Subject of analysis is presumed to be a text string.
  • - *
  • Some characters in the document's Subject-of-analysis are replaced by blanks, because the - * characters aren't valid in xml documents.
  • - *
  • It doesn't work for annotations which are overlapping, because these cannot be properly - * represented as properly - nested XML.
  • - *
- * - * @since 1.1.0 - */ -@ResourceMetaData(name="Inline XML Writer") -@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) -public class InlineXmlWriter - extends JCasFileWriter_ImplBase -{ - /** - * XSLT stylesheet to apply. - */ - public static final String PARAM_XSLT = "Xslt"; - @ConfigurationParameter(name=PARAM_XSLT, mandatory=false) - private String xslt; - - private CasToInlineXml cas2xml; - private Transformer transformer; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - if (xslt != null) { - TransformerFactory tf = TransformerFactory.newInstance(); - try { - URL url = ResourceUtils.resolveLocation(xslt, this, getContext()); - transformer = tf.newTransformer(new StreamSource(url.openStream())); - } catch (Exception e) { - throw new ResourceInitializationException(e); - } - } - - cas2xml = new CasToInlineXml(); - } - - @Override - public - void process(final JCas aJCas) throws AnalysisEngineProcessException - { - OutputStream docOS = null; - try { - docOS = getOutputStream(aJCas, ".xml"); - - final String xmlAnnotations = cas2xml.generateXML(aJCas.getCas()); - if (transformer != null) { - transformer.transform( - new StreamSource(new ByteArrayInputStream(xmlAnnotations.getBytes("UTF-8"))), - new StreamResult(docOS)); - } - else { - docOS.write(xmlAnnotations.getBytes("UTF-8")); - } - } - catch (final CASException e) { - throw new AnalysisEngineProcessException(e); - } - catch (final IOException e) { - throw new AnalysisEngineProcessException(e); - } - catch (TransformerException e) { - throw new AnalysisEngineProcessException(e); - } - finally { - closeQuietly(docOS); - } - } -} diff --git a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlReader.java b/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlReader.java deleted file mode 100644 index ceb2fc8006..0000000000 --- a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlReader.java +++ /dev/null @@ -1,466 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xml; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.LinkedList; -import java.util.Map; -import java.util.Set; - -import javax.xml.stream.XMLStreamException; - -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.component.CasCollectionReader_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; -import org.codehaus.stax2.XMLStreamReader2; - -import com.ctc.wstx.stax.WstxInputFactory; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field; - -/** - * Reader for XML files. - */ -@ResourceMetaData(name="XML Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field", - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) -public class XmlReader extends CasCollectionReader_ImplBase { - - /** - * Location from which the input is read. - */ - public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION; - @ConfigurationParameter(name=PARAM_SOURCE_LOCATION, mandatory=true) - private String inputDirectory; - - /** - * Set this as the language of the produced documents. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name=PARAM_LANGUAGE, mandatory=false) - private String language; - - /** - * optional, tags those should be worked on (if empty, then all tags - * except those ExcludeTags will be worked on) - */ - public static final String PARAM_INCLUDE_TAG = "IncludeTag"; - @ConfigurationParameter(name=PARAM_INCLUDE_TAG, mandatory=true, defaultValue={}) - private Set includeTags; - - /** - * optional, tags those should not be worked on. Out them should no - * text be extracted and also no Annotations be produced. - */ - public static final String PARAM_EXCLUDE_TAG = "ExcludeTag"; - @ConfigurationParameter(name=PARAM_EXCLUDE_TAG, mandatory=true, defaultValue={}) - private Set excludeTags; - - /** - * tag which contains the docId - */ - public static final String PARAM_DOC_ID_TAG = "DocIdTag"; - @ConfigurationParameter(name=PARAM_DOC_ID_TAG, mandatory=false) - private String docIdTag; - - /** - * The collection ID to set in the {@link DocumentMetaData}. - */ - public static final String PARAM_COLLECTION_ID = "collectionId"; - @ConfigurationParameter(name=PARAM_COLLECTION_ID, mandatory=false) - private String collectionId; - - private static final String MESSAGE_DIGEST = "de.tudarmstadt.ukp.dkpro.core.io.xml.XmlReader_Messages"; - private static final String INVALID_PATH_EXCEPTION = "invalid_path_error"; - private static final String EMPTY_DIRECTORY_EXCEPTION = "empty_directory_error"; - private static final String MISSING_DOC_ID_EXCEPTION = "missing_doc_id_error"; - private static final String EMPTY_DOC_ID_EXCEPTION = "empty_doc_id_error"; - private static final String MULTIPLE_DOC_ID_EXCEPTION = "multiple_doc_id_error"; - private static final String SUBSTITUTE_EXCEPTION = "substitute_error"; - - // mandatory, list of xml files to be readed in - private final ArrayList xmlFiles = new ArrayList(); - - // Xml stream reader - private XMLStreamReader2 xmlReader; - - // current be parsed file index - private int currentParsedFile; - - private int iDoc; - private boolean useSubstitution; - private Map substitution; - - private String docIdElementLocalName; - private String docIdAttributeName; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - // mandatory, directory where that those be parsed XML files are - File inDir = new File(inputDirectory); - // get all xml files from the input directory (ignore the - // subdirectories) - if (inDir.isDirectory()) { - File[] files = inDir.listFiles(); - for (File file : files) { - if (file.isFile() && (file.toString().endsWith(".xml") || file.toString().endsWith(".sgml"))) { - xmlFiles.add(file); - } - } - Collections.sort(xmlFiles); - } - else { - throw new ResourceInitializationException( - MESSAGE_DIGEST, - INVALID_PATH_EXCEPTION, - new Object[] {inDir}); - } - - // if xmlFiles is not empty, then initialize the Stax Reader - if (xmlFiles.isEmpty()) { - throw new ResourceInitializationException( - MESSAGE_DIGEST, - EMPTY_DIRECTORY_EXCEPTION, - new Object[] {inDir}); - } - - currentParsedFile = 0; - - if (docIdTag != null && docIdTag.contains("/@")) { - int split = docIdTag.indexOf("/@"); - docIdElementLocalName = docIdTag.substring(0, split); - docIdAttributeName = docIdTag.substring(split+2); - } - else { - docIdElementLocalName = docIdTag; - } - } - - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - JCas jcas; - try { - jcas = aCAS.getJCas(); - } - catch (CASException e) { - throw new CollectionException(e); - } - - // parse the xml file - try { - // if the last file is already done, then work on the next file - if (xmlReader == null) { - WstxInputFactory factory = new WstxInputFactory(); - xmlReader = factory.createXMLStreamReader(xmlFiles - .get(currentParsedFile)); - iDoc = 0; - } - - // ignore the root element of the file - // parse the second layer element, suppose they are all documents - // read in all elements under second layer - parseSubDocument(jcas); - - iDoc++; - if (xmlReader.getDepth() < 2) { - xmlReader.closeCompletely(); - xmlReader = null; - currentParsedFile++; - } - } catch (XMLStreamException e) { - e.printStackTrace(); - throw new CollectionException(e); - } catch (Exception e) { - e.printStackTrace(); - throw new CollectionException(e); - } - - } - - @Override - public Progress[] getProgress() - { - return new Progress[] { new ProgressImpl(currentParsedFile, xmlFiles - .size(), Progress.ENTITIES) }; - } - - @Override - public boolean hasNext() - throws IOException, CollectionException - { - if (xmlReader != null) { - // There is still more to parse in the current file - return true; - } - if (currentParsedFile >= 0 && currentParsedFile < xmlFiles.size()) { - // There are additional files to parse - return true; - } - else { - // There is nothing more - return false; - } - } - - @Override - public void close() - throws IOException - { - // Nothing to do - } - - private void parseSubDocument(JCas jcas) - throws XMLStreamException, IOException, CollectionException - { - // set the jcas document language if the parameter exists - if (language != null) { - jcas.setDocumentLanguage(language); - } - - LinkedList openTagStack = new LinkedList(); - - // get document tag - String docTag = seekSubDocumentRoot(); - - StringBuilder documentText = new StringBuilder(); - String docId = null; - while (xmlReader.hasNext() && xmlReader.getDepth() > 1) { - if (xmlReader.isStartElement()) { - String tagName = xmlReader.getName().getLocalPart(); - openTagStack.push(tagName); - - // If the docId is an attribute, try to fetch it now - String id = null; - if (isDocIdElement(tagName) && docIdAttributeName != null) { - id = xmlReader.getAttributeValue(null, docIdAttributeName); - } - - xmlReader.next(); - String elementText = collectText(); - if (elementText.length() > 0) { - // If the docId is an element value, we may capture it now - if (isDocIdElement(tagName) && docIdAttributeName == null) { - id = elementText; - } - - // Process the current span of text - processText(jcas, tagName, elementText, documentText); - } - - // If a docId has been captured, check if it valid and unique - if (id != null) { - if (docId != null) { - throw new CollectionException( - MULTIPLE_DOC_ID_EXCEPTION, - new Object[] { docIdTag }); - } - if (id.length() == 0) { - throw new CollectionException(EMPTY_DOC_ID_EXCEPTION, - new Object[] { docIdTag }); - } - docId = id; - } - } - else if(xmlReader.isCharacters()) { - String tagName = openTagStack.peek(); - - String elementText = collectText(); - if(elementText.length()==0) { - continue; - } - - // Process the current span of text - processText(jcas, tagName, elementText, documentText); - } - else if (xmlReader.isEndElement()) { - String tagName = xmlReader.getName().getLocalPart(); - - // if it is end of document then stop processing - if (docTag.equals(tagName)) { - xmlReader.nextTag(); - break; - } - - openTagStack.poll(); - xmlReader.next(); - } - } - jcas.setDocumentText(documentText.toString()); - - // Add Document MetaData - String fileName = xmlFiles.get(currentParsedFile).getName(); -// String fileExtension = ""; - int dotPlace = fileName.lastIndexOf ( '.' ); - if(docIdTag!=null) { - if(docId==null) { - throw new CollectionException( - MESSAGE_DIGEST, - MISSING_DOC_ID_EXCEPTION, - new Object[] {docIdTag}); - } - } else { - if ( dotPlace >= 0 ) { -// fileExtension = fileName.substring( dotPlace + 1 ); - docId = fileName.substring(0, dotPlace)+"-"+iDoc; - } - } - - String docUri = xmlFiles.get(currentParsedFile).toURI().toString(); - DocumentMetaData docMetaData = DocumentMetaData.create(jcas); - docMetaData.setDocumentId(docId); - docMetaData.setDocumentUri(docUri+"#"+docId); - docMetaData.setCollectionId(collectionId); - -// System.out.println("Fetched document: "+docUri+"#"+docId); - } - - /** - * Create a field annotation for the given element name at the given location. - * If substitutions are used, the field is created using the substituted name. - * - * @param jcas the JCas. - * @param localName the local name of the current XML element. - * @param begin the start offset. - * @param end the end offset. - */ - private void createFieldAnnotation(JCas jcas, String localName, int begin, int end) - { - String fieldName = null; - if (useSubstitution) { - fieldName = substitution.get(localName); - if (fieldName == null) { - fieldName = localName; - } - } - else { - fieldName = localName; - } - - Field field = new Field(jcas, begin, end); - field.setName(fieldName); - field.addToIndexes(); - } - - private boolean isIncluded(final String tagName) - { - boolean needToBeParsed = (includeTags.size() == 0) || includeTags.contains(tagName); - if (excludeTags.size() > 0 && excludeTags.contains(tagName)) { - needToBeParsed = false; - } - return needToBeParsed; - } - - /** - * Process the text found within the given element. If text from the given - * element should be included in the document, then it is added and a proper - * {@link Field} annotation is created. - * - * @param jcas the JCas. - * @param localName the element in which the text was found - * @param elementText the text - * @param documentText the document text buffer - */ - private void processText(JCas jcas, String localName, String elementText, - StringBuilder documentText) - { - if (isIncluded(localName)) { - int begin = documentText.length(); - documentText = documentText.append(elementText); - documentText = documentText.append("\n\n"); - int end = documentText.length()-1; - createFieldAnnotation(jcas, localName, begin, end); - } - } - - /** - * Collect all consecutive text starting at the current point. - * - * @return the concatenated consecutive text. - */ - private String collectText() throws XMLStreamException - { - StringBuilder elementText = new StringBuilder(); - while(xmlReader.isCharacters()) { - elementText.append(xmlReader.getText().replaceAll("\r", "").trim()); - xmlReader.next(); - } - return elementText.toString(); - } - - /** - * Seek to the root element of the next sub-document and return its local name. - * - * @return the local name of the sub-document root element. - */ - private String seekSubDocumentRoot() - throws XMLStreamException, IOException - { - // if this is not the first document in the file then the current - // element is the docTag - String docTag = null; - if (xmlReader.isStartElement() && xmlReader.getDepth() > 1) { - docTag = xmlReader.getName().getLocalPart(); - } - else { - while (xmlReader.hasNext() && xmlReader.getDepth() < 2) { - xmlReader.next(); - } - while (xmlReader.hasNext() && !xmlReader.isStartElement()) { - xmlReader.next(); - } - if (xmlReader.getDepth() == 2 && xmlReader.isStartElement()) { - docTag = xmlReader.getName().getLocalPart(); - } - else { - throw new IOException("file is empty: " - + xmlFiles.get(currentParsedFile)); - } - } - return docTag; - } - - private boolean isDocIdElement(String localName) - { - return docIdElementLocalName != null && docIdElementLocalName.equals(localName); - } -} diff --git a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlTextReader.java b/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlTextReader.java deleted file mode 100644 index 1f50463db5..0000000000 --- a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlTextReader.java +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xml; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.IOException; -import java.io.InputStream; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.util.Logger; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; - -/** - * @since 1.1.0 - */ -@ResourceMetaData(name="XML Text Reader") -@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) - -public class XmlTextReader - extends ResourceCollectionReaderBase -{ - @Override - public void getNext(CAS aCAS) - throws IOException, CollectionException - { - Resource res = nextFile(); - initCas(aCAS, res); - - InputStream is = null; - - try { - JCas jcas = aCAS.getJCas(); - - is = res.getInputStream(); - - // Create handler - Handler handler = newSaxHandler(); - handler.setJCas(jcas); - handler.setLogger(getLogger()); - - // Parser XML - SAXParserFactory pf = SAXParserFactory.newInstance(); - SAXParser parser = pf.newSAXParser(); - - InputSource source = new InputSource(is); - source.setPublicId(res.getLocation()); - source.setSystemId(res.getLocation()); - parser.parse(source, handler); - - // Set up language - if (getConfigParameterValue(PARAM_LANGUAGE) != null) { - aCAS.setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE)); - } - } - catch (CASException e) { - throw new CollectionException(e); - } - catch (ParserConfigurationException e) { - throw new CollectionException(e); - } - catch (SAXException e) { - throw new IOException(e); - } - finally { - closeQuietly(is); - } - } - - protected Handler newSaxHandler() - { - return new TextExtractor(); - } - - /** - */ - protected abstract static class Handler - extends DefaultHandler - { - private JCas jcas; - private Logger logger; - - public void setJCas(final JCas aJCas) - { - jcas = aJCas; - } - - protected JCas getJCas() - { - return jcas; - } - - public void setLogger(Logger aLogger) - { - logger = aLogger; - } - - public Logger getLogger() - { - return logger; - } - } - - /** - */ - public static class TextExtractor - extends Handler - { - private final StringBuilder buffer = new StringBuilder(); - - @Override - public void characters(char[] aCh, int aStart, int aLength) - throws SAXException - { - buffer.append(aCh, aStart, aLength); - } - - @Override - public void ignorableWhitespace(char[] aCh, int aStart, int aLength) - throws SAXException - { - buffer.append(aCh, aStart, aLength); - } - - @Override - public void endDocument() - throws SAXException - { - getJCas().setDocumentText(buffer.toString()); - } - - protected StringBuilder getBuffer() - { - return buffer; - } - } -} diff --git a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/package-info.java b/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/package-info.java deleted file mode 100644 index df4071e0ab..0000000000 --- a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Support for XML files (read-only). - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.io.xml; diff --git a/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/InlineXmlWriter.java b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/InlineXmlWriter.java new file mode 100644 index 0000000000..43def9f6bf --- /dev/null +++ b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/InlineXmlWriter.java @@ -0,0 +1,137 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import static org.apache.commons.io.IOUtils.closeQuietly; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.net.URL; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CASException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.CasToInlineXml; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.ResourceUtils; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Writes an approximation of the content of a textual CAS as an inline XML file. Optionally applies + * an XSLT stylesheet. + *

+ * Note this component inherits the restrictions from {@link CasToInlineXml}: + * + *

    + *
  • Features whose values are FeatureStructures are not represented.
  • + *
  • Feature values which are strings longer than 64 characters are truncated.
  • + *
  • Feature values which are arrays of primitives are represented by strings that look like [ + * xxx, xxx ]
  • + *
  • The Subject of analysis is presumed to be a text string.
  • + *
  • Some characters in the document's Subject-of-analysis are replaced by blanks, because the + * characters aren't valid in xml documents.
  • + *
  • It doesn't work for annotations which are overlapping, because these cannot be properly + * represented as properly - nested XML.
  • + *
+ * + * @since 1.1.0 + */ +@ResourceMetaData(name = "Inline XML Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) +public class InlineXmlWriter + extends JCasFileWriter_ImplBase +{ + /** + * XSLT stylesheet to apply. + */ + public static final String PARAM_XSLT = "Xslt"; + @ConfigurationParameter(name = PARAM_XSLT, mandatory = false) + private String xslt; + + private CasToInlineXml cas2xml; + private Transformer transformer; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + if (xslt != null) { + TransformerFactory tf = TransformerFactory.newInstance(); + try { + URL url = ResourceUtils.resolveLocation(xslt, this, getContext()); + transformer = tf.newTransformer(new StreamSource(url.openStream())); + } catch (Exception e) { + throw new ResourceInitializationException(e); + } + } + + cas2xml = new CasToInlineXml(); + } + + @Override + public void process(final JCas aJCas) throws AnalysisEngineProcessException + { + OutputStream docOS = null; + try { + docOS = getOutputStream(aJCas, ".xml"); + + final String xmlAnnotations = cas2xml.generateXML(aJCas.getCas()); + if (transformer != null) { + transformer.transform( + new StreamSource(new ByteArrayInputStream(xmlAnnotations.getBytes("UTF-8"))), + new StreamResult(docOS)); + } + else { + docOS.write(xmlAnnotations.getBytes("UTF-8")); + } + } + catch (final CASException e) { + throw new AnalysisEngineProcessException(e); + } + catch (final IOException e) { + throw new AnalysisEngineProcessException(e); + } + catch (TransformerException e) { + throw new AnalysisEngineProcessException(e); + } + finally { + closeQuietly(docOS); + } + } +} diff --git a/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlDocumentReader.java b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlDocumentReader.java new file mode 100644 index 0000000000..1e883f7be8 --- /dev/null +++ b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlDocumentReader.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.xml.CasXmlHandler; +import org.xml.sax.InputSource; + +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Simple XML reader which loads all text from the XML file into the CAS document text and generates + * XML annotations for all XML elements, attributes and text nodes. + * + * @see XmlDocumentWriter + */ +@Component(value = OperationType.READER) +@ResourceMetaData(name = "XML Document Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + XmlReader.PARAM_SOURCE_LOCATION }) +@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "org.dkpro.core.api.xml.type.XmlAttribute", + "org.dkpro.core.api.xml.type.XmlDocument", + "org.dkpro.core.api.xml.type.XmlElement", + "org.dkpro.core.api.xml.type.XmlNode", + "org.dkpro.core.api.xml.type.XmlTextNode" }) +public class XmlDocumentReader + extends JCasResourceCollectionReader_ImplBase +{ + @Override + public void getNext(JCas aJCas) throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + try (InputStream is = new BufferedInputStream( + CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) { + + // Create handler + CasXmlHandler handler = new CasXmlHandler(aJCas); + + // Parser XML + SAXParserFactory pf = SAXParserFactory.newInstance(); + SAXParser parser = pf.newSAXParser(); + + InputSource source = new InputSource(is); + source.setPublicId(res.getLocation()); + source.setSystemId(res.getLocation()); + parser.parse(source, handler); + } + catch (IOException e) { + throw e; + } + catch (Exception e) { + throw new CollectionException(e); + } + } +} diff --git a/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlDocumentWriter.java b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlDocumentWriter.java new file mode 100644 index 0000000000..a43856863b --- /dev/null +++ b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlDocumentWriter.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import static javax.xml.transform.OutputKeys.INDENT; +import static javax.xml.transform.OutputKeys.METHOD; +import static javax.xml.transform.OutputKeys.OMIT_XML_DECLARATION; + +import java.io.OutputStream; + +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.xml.Cas2SaxEvents; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Simple XML write takes the XML annotations for elements, attributes and text nodes and renders + * them into an XML file. + * + * @see XmlDocumentReader + */ +@ResourceMetaData(name = "XML Document Writer") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "org.dkpro.core.api.xml.type.XmlAttribute", + "org.dkpro.core.api.xml.type.XmlDocument", + "org.dkpro.core.api.xml.type.XmlElement", + "org.dkpro.core.api.xml.type.XmlNode", + "org.dkpro.core.api.xml.type.XmlTextNode"}) +public class XmlDocumentWriter + extends JCasFileWriter_ImplBase +{ + /** + * Specify the suffix of output files. Default value .txt. If the suffix is not + * needed, provide an empty string as value. + */ + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; + @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml") + private String filenameSuffix; + + /** + * Whether to omit the XML preamble. + */ + public static final String PARAM_OMIT_XML_DECLARATION = "omitXmlDeclaration"; + @ConfigurationParameter(name = PARAM_OMIT_XML_DECLARATION, mandatory = true, defaultValue = "true") + private boolean omitXmlDeclaration; + + /** + * Output method. + */ + public static final String PARAM_OUTPUT_METHOD = "outputMethod"; + @ConfigurationParameter(name = PARAM_OUTPUT_METHOD, mandatory = true, defaultValue = "xml") + private String outputMethod; + + /** + * Indent output . + */ + public static final String PARAM_INDENT = "indent"; + @ConfigurationParameter(name = PARAM_INDENT, mandatory = true, defaultValue = "false") + private boolean indent; + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { + SAXTransformerFactory tf = (SAXTransformerFactory) TransformerFactory.newInstance(); + tf.setFeature("http://javax.xml.XMLConstants/feature/secure-processing", true); + TransformerHandler th = tf.newTransformerHandler(); + if (omitXmlDeclaration) { + th.getTransformer().setOutputProperty(OMIT_XML_DECLARATION, "yes"); + } + th.getTransformer().setOutputProperty(METHOD, outputMethod); + th.getTransformer().setOutputProperty(INDENT, indent ? "yes" : "no"); + th.setResult(new StreamResult(docOS)); + + Cas2SaxEvents serializer = new Cas2SaxEvents(th); + serializer.process(aJCas); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlReader.java b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlReader.java new file mode 100644 index 0000000000..ae8759b6ac --- /dev/null +++ b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlReader.java @@ -0,0 +1,480 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedList; +import java.util.Map; +import java.util.Set; + +import javax.xml.stream.XMLStreamException; + +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.CasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.codehaus.stax2.XMLStreamReader2; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; + +import com.ctc.wstx.stax.WstxInputFactory; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Very basic reader to load texts from a XML file. + *

+ * The XML file is expected to contain one or more elements under its root note and each of these + * is treated as a separate document. Each of these child elements may contain further children + * containing text which may or may not be included into the CAS document text, depending on + * {@link #PARAM_INCLUDE_TAG} and {@link #PARAM_EXCLUDE_TAG}. + *

+ * If you are looking for a more generic XML reader which imports the structure of an XML file into + * a CAS, please look at {@link XmlDocumentReader}. + */ +@Component(value = OperationType.READER) +@ResourceMetaData(name = "XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@Parameters( + exclude = { + XmlReader.PARAM_SOURCE_LOCATION }) +@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field", + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) +public class XmlReader + extends CasCollectionReader_ImplBase +{ + /** + * Location from which the input is read. + */ + public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION; + @ConfigurationParameter(name = PARAM_SOURCE_LOCATION, mandatory = true) + private String inputDirectory; + + /** + * Set this as the language of the produced documents. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + /** + * optional, tags those should be worked on (if empty, then all tags except those ExcludeTags + * will be worked on) + */ + public static final String PARAM_INCLUDE_TAG = "IncludeTag"; + @ConfigurationParameter(name = PARAM_INCLUDE_TAG, mandatory = true, defaultValue = {}) + private Set includeTags; + + /** + * optional, tags those should not be worked on. Out them should no text be extracted and also + * no Annotations be produced. + */ + public static final String PARAM_EXCLUDE_TAG = "ExcludeTag"; + @ConfigurationParameter(name = PARAM_EXCLUDE_TAG, mandatory = true, defaultValue = {}) + private Set excludeTags; + + /** + * tag which contains the docId + */ + public static final String PARAM_DOC_ID_TAG = "DocIdTag"; + @ConfigurationParameter(name = PARAM_DOC_ID_TAG, mandatory = false) + private String docIdTag; + + /** + * The collection ID to set in the {@link DocumentMetaData}. + */ + public static final String PARAM_COLLECTION_ID = "collectionId"; + @ConfigurationParameter(name = PARAM_COLLECTION_ID, mandatory = false) + private String collectionId; + + private static final String MESSAGE_DIGEST = "de.tudarmstadt.ukp.dkpro.core.io.xml.XmlReader_Messages"; + private static final String INVALID_PATH_EXCEPTION = "invalid_path_error"; + private static final String EMPTY_DIRECTORY_EXCEPTION = "empty_directory_error"; + private static final String MISSING_DOC_ID_EXCEPTION = "missing_doc_id_error"; + private static final String EMPTY_DOC_ID_EXCEPTION = "empty_doc_id_error"; + private static final String MULTIPLE_DOC_ID_EXCEPTION = "multiple_doc_id_error"; + private static final String SUBSTITUTE_EXCEPTION = "substitute_error"; + + // mandatory, list of xml files to be readed in + private final ArrayList xmlFiles = new ArrayList(); + + // Xml stream reader + private XMLStreamReader2 xmlReader; + + // current be parsed file index + private int currentParsedFile; + + private int iDoc; + private boolean useSubstitution; + private Map substitution; + + private String docIdElementLocalName; + private String docIdAttributeName; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + // mandatory, directory where that those be parsed XML files are + File inDir = new File(inputDirectory); + // get all xml files from the input directory (ignore the + // subdirectories) + if (inDir.isDirectory()) { + File[] files = inDir.listFiles(); + for (File file : files) { + if (file.isFile() && (file.toString().endsWith(".xml") + || file.toString().endsWith(".sgml"))) { + xmlFiles.add(file); + } + } + Collections.sort(xmlFiles); + } + else { + throw new ResourceInitializationException( + MESSAGE_DIGEST, + INVALID_PATH_EXCEPTION, + new Object[] {inDir}); + } + + // if xmlFiles is not empty, then initialize the Stax Reader + if (xmlFiles.isEmpty()) { + throw new ResourceInitializationException( + MESSAGE_DIGEST, + EMPTY_DIRECTORY_EXCEPTION, + new Object[] {inDir}); + } + + currentParsedFile = 0; + + if (docIdTag != null && docIdTag.contains("/@")) { + int split = docIdTag.indexOf("/@"); + docIdElementLocalName = docIdTag.substring(0, split); + docIdAttributeName = docIdTag.substring(split + 2); + } + else { + docIdElementLocalName = docIdTag; + } + } + + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + JCas jcas; + try { + jcas = aCAS.getJCas(); + } + catch (CASException e) { + throw new CollectionException(e); + } + + // parse the xml file + try { + // if the last file is already done, then work on the next file + if (xmlReader == null) { + WstxInputFactory factory = new WstxInputFactory(); + xmlReader = factory.createXMLStreamReader(xmlFiles + .get(currentParsedFile)); + iDoc = 0; + } + + // ignore the root element of the file + // parse the second layer element, suppose they are all documents + // read in all elements under second layer + parseSubDocument(jcas); + + iDoc++; + if (xmlReader.getDepth() < 2) { + xmlReader.closeCompletely(); + xmlReader = null; + currentParsedFile++; + } + } + catch (Exception e) { + throw new CollectionException(e); + } + } + + @Override + public Progress[] getProgress() + { + return new Progress[] { new ProgressImpl(currentParsedFile, xmlFiles + .size(), Progress.ENTITIES) }; + } + + @Override + public boolean hasNext() + throws IOException, CollectionException + { + if (xmlReader != null) { + // There is still more to parse in the current file + return true; + } + if (currentParsedFile >= 0 && currentParsedFile < xmlFiles.size()) { + // There are additional files to parse + return true; + } + else { + // There is nothing more + return false; + } + } + + @Override + public void close() + throws IOException + { + // Nothing to do + } + + private void parseSubDocument(JCas jcas) + throws XMLStreamException, IOException, CollectionException + { + // set the jcas document language if the parameter exists + if (language != null) { + jcas.setDocumentLanguage(language); + } + + LinkedList openTagStack = new LinkedList(); + + // get document tag + String docTag = seekSubDocumentRoot(); + + StringBuilder documentText = new StringBuilder(); + String docId = null; + while (xmlReader.hasNext() && xmlReader.getDepth() > 1) { + if (xmlReader.isStartElement()) { + String tagName = xmlReader.getName().getLocalPart(); + openTagStack.push(tagName); + + // If the docId is an attribute, try to fetch it now + String id = null; + if (isDocIdElement(tagName) && docIdAttributeName != null) { + id = xmlReader.getAttributeValue(null, docIdAttributeName); + } + + xmlReader.next(); + String elementText = collectText(); + if (elementText.length() > 0) { + // If the docId is an element value, we may capture it now + if (isDocIdElement(tagName) && docIdAttributeName == null) { + id = elementText; + } + + // Process the current span of text + processText(jcas, tagName, elementText, documentText); + } + + // If a docId has been captured, check if it valid and unique + if (id != null) { + if (docId != null) { + throw new CollectionException( + MULTIPLE_DOC_ID_EXCEPTION, + new Object[] { docIdTag }); + } + if (id.length() == 0) { + throw new CollectionException(EMPTY_DOC_ID_EXCEPTION, + new Object[] { docIdTag }); + } + docId = id; + } + } + else if (xmlReader.isCharacters()) { + String tagName = openTagStack.peek(); + + String elementText = collectText(); + if (elementText.length() == 0) { + continue; + } + + // Process the current span of text + processText(jcas, tagName, elementText, documentText); + } + else if (xmlReader.isEndElement()) { + String tagName = xmlReader.getName().getLocalPart(); + + // if it is end of document then stop processing + if (docTag.equals(tagName)) { + xmlReader.nextTag(); + break; + } + + openTagStack.poll(); + xmlReader.next(); + } + } + jcas.setDocumentText(documentText.toString()); + + // Add Document MetaData + String fileName = xmlFiles.get(currentParsedFile).getName(); +// String fileExtension = ""; + int dotPlace = fileName.lastIndexOf ( '.' ); + if (docIdTag != null) { + if (docId == null) { + throw new CollectionException(MESSAGE_DIGEST, MISSING_DOC_ID_EXCEPTION, + new Object[] { docIdTag }); + } + } + else { + if (dotPlace >= 0) { + // fileExtension = fileName.substring( dotPlace + 1 ); + docId = fileName.substring(0, dotPlace) + "-" + iDoc; + } + } + + String docUri = xmlFiles.get(currentParsedFile).toURI().toString(); + DocumentMetaData docMetaData = DocumentMetaData.create(jcas); + docMetaData.setDocumentId(docId); + docMetaData.setDocumentUri(docUri + "#" + docId); + docMetaData.setCollectionId(collectionId); + +// System.out.println("Fetched document: "+docUri+"#"+docId); + } + + /** + * Create a field annotation for the given element name at the given location. + * If substitutions are used, the field is created using the substituted name. + * + * @param jcas the JCas. + * @param localName the local name of the current XML element. + * @param begin the start offset. + * @param end the end offset. + */ + private void createFieldAnnotation(JCas jcas, String localName, int begin, int end) + { + String fieldName = null; + if (useSubstitution) { + fieldName = substitution.get(localName); + if (fieldName == null) { + fieldName = localName; + } + } + else { + fieldName = localName; + } + + Field field = new Field(jcas, begin, end); + field.setName(fieldName); + field.addToIndexes(); + } + + private boolean isIncluded(final String tagName) + { + boolean needToBeParsed = (includeTags.size() == 0) || includeTags.contains(tagName); + if (excludeTags.size() > 0 && excludeTags.contains(tagName)) { + needToBeParsed = false; + } + return needToBeParsed; + } + + /** + * Process the text found within the given element. If text from the given + * element should be included in the document, then it is added and a proper + * {@link Field} annotation is created. + * + * @param jcas the JCas. + * @param localName the element in which the text was found + * @param elementText the text + * @param documentText the document text buffer + */ + private void processText(JCas jcas, String localName, String elementText, + StringBuilder documentText) + { + if (isIncluded(localName)) { + int begin = documentText.length(); + documentText = documentText.append(elementText); + documentText = documentText.append("\n\n"); + int end = documentText.length() - 1; + createFieldAnnotation(jcas, localName, begin, end); + } + } + + /** + * Collect all consecutive text starting at the current point. + * + * @return the concatenated consecutive text. + */ + private String collectText() throws XMLStreamException + { + StringBuilder elementText = new StringBuilder(); + while (xmlReader.isCharacters()) { + elementText.append(xmlReader.getText().replaceAll("\r", "").trim()); + xmlReader.next(); + } + return elementText.toString(); + } + + /** + * Seek to the root element of the next sub-document and return its local name. + * + * @return the local name of the sub-document root element. + */ + private String seekSubDocumentRoot() + throws XMLStreamException, IOException + { + // if this is not the first document in the file then the current + // element is the docTag + String docTag = null; + if (xmlReader.isStartElement() && xmlReader.getDepth() > 1) { + docTag = xmlReader.getName().getLocalPart(); + } + else { + while (xmlReader.hasNext() && xmlReader.getDepth() < 2) { + xmlReader.next(); + } + while (xmlReader.hasNext() && !xmlReader.isStartElement()) { + xmlReader.next(); + } + if (xmlReader.getDepth() == 2 && xmlReader.isStartElement()) { + docTag = xmlReader.getName().getLocalPart(); + } + else { + throw new IOException("file is empty: " + + xmlFiles.get(currentParsedFile)); + } + } + return docTag; + } + + private boolean isDocIdElement(String localName) + { + return docIdElementLocalName != null && docIdElementLocalName.equals(localName); + } +} diff --git a/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlTextReader.java b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlTextReader.java new file mode 100644 index 0000000000..4c3d142b9f --- /dev/null +++ b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlTextReader.java @@ -0,0 +1,172 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import static org.apache.commons.io.IOUtils.closeQuietly; + +import java.io.IOException; +import java.io.InputStream; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.Logger; +import org.dkpro.core.api.io.ResourceCollectionReaderBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * @since 1.1.0 + */ +@ResourceMetaData(name = "XML Text Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") +@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) + +public class XmlTextReader + extends ResourceCollectionReaderBase +{ + @Override + public void getNext(CAS aCAS) + throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aCAS, res); + + InputStream is = null; + + try { + JCas jcas = aCAS.getJCas(); + + is = res.getInputStream(); + + // Create handler + Handler handler = newSaxHandler(); + handler.setJCas(jcas); + handler.setLogger(getLogger()); + + // Parser XML + SAXParserFactory pf = SAXParserFactory.newInstance(); + SAXParser parser = pf.newSAXParser(); + + InputSource source = new InputSource(is); + source.setPublicId(res.getLocation()); + source.setSystemId(res.getLocation()); + parser.parse(source, handler); + + // Set up language + if (getConfigParameterValue(PARAM_LANGUAGE) != null) { + aCAS.setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE)); + } + } + catch (CASException e) { + throw new CollectionException(e); + } + catch (ParserConfigurationException e) { + throw new CollectionException(e); + } + catch (SAXException e) { + throw new IOException(e); + } + finally { + closeQuietly(is); + } + } + + protected Handler newSaxHandler() + { + return new TextExtractor(); + } + + /** + */ + protected abstract static class Handler + extends DefaultHandler + { + private JCas jcas; + private Logger logger; + + public void setJCas(final JCas aJCas) + { + jcas = aJCas; + } + + protected JCas getJCas() + { + return jcas; + } + + public void setLogger(Logger aLogger) + { + logger = aLogger; + } + + public Logger getLogger() + { + return logger; + } + } + + /** + */ + public static class TextExtractor + extends Handler + { + private final StringBuilder buffer = new StringBuilder(); + + @Override + public void characters(char[] aCh, int aStart, int aLength) + throws SAXException + { + buffer.append(aCh, aStart, aLength); + } + + @Override + public void ignorableWhitespace(char[] aCh, int aStart, int aLength) + throws SAXException + { + buffer.append(aCh, aStart, aLength); + } + + @Override + public void endDocument() + throws SAXException + { + getJCas().setDocumentText(buffer.toString()); + } + + protected StringBuilder getBuffer() + { + return buffer; + } + } +} diff --git a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlXPathReader.java b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlXPathReader.java similarity index 97% rename from dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlXPathReader.java rename to dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlXPathReader.java index 09251eac51..e373daa3ad 100644 --- a/dkpro-core-io-xml-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlXPathReader.java +++ b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/XmlXPathReader.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.io.xml; +package org.dkpro.core.io.xml; import java.io.File; import java.io.FileInputStream; @@ -48,14 +48,15 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; +import org.dkpro.core.api.io.FileSetCollectionReaderBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; -import de.tudarmstadt.ukp.dkpro.core.api.io.FileSetCollectionReaderBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field; +import eu.openminted.share.annotations.api.DocumentationResource; /** * A component reader for XML files implemented with XPath. @@ -66,7 +67,8 @@ *

* If your expression evaluates to leaf nodes, empty CASes will be created. */ -@ResourceMetaData(name="XPath-based XML Reader") +@ResourceMetaData(name = "XPath-based XML Reader") +@DocumentationResource("${docbase}/format-reference.html#format-${command}") @MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) @TypeCapability( outputs = { diff --git a/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/package-info.java b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/package-info.java new file mode 100644 index 0000000000..8ef1ccca6e --- /dev/null +++ b/dkpro-core-io-xml-asl/src/main/java/org/dkpro/core/io/xml/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for XML files (read-only). + * + * @since 1.1.0 + */ +package org.dkpro.core.io.xml; diff --git a/dkpro-core-io-xml-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlReader_Messages.properties b/dkpro-core-io-xml-asl/src/main/resources/org/dkpro/core/io/xml/XmlReader_Messages.properties similarity index 100% rename from dkpro-core-io-xml-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/io/xml/XmlReader_Messages.properties rename to dkpro-core-io-xml-asl/src/main/resources/org/dkpro/core/io/xml/XmlReader_Messages.properties diff --git a/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/InlineXmlWriterTest.java b/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/InlineXmlWriterTest.java deleted file mode 100644 index 8966867497..0000000000 --- a/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/InlineXmlWriterTest.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xml; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.junit.Assert.fail; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; - -public class InlineXmlWriterTest -{ - @Rule - public TemporaryFolder workspace = new TemporaryFolder(); - - @Test - public void testInlineXmlCasConsumer() - throws Exception - { - String testDocument = "This is a test."; - - AnalysisEngine consumer = createEngine(InlineXmlWriter.class, - InlineXmlWriter.PARAM_TARGET_LOCATION, workspace.getRoot().getPath(), - InlineXmlWriter.PARAM_STRIP_EXTENSION, true); - - JCas jcas = consumer.newJCas(); - jcas.setDocumentText(testDocument); - - DocumentMetaData meta = DocumentMetaData.create(jcas); - meta.setDocumentId("testId"); - meta.setDocumentTitle("title"); - meta.setDocumentBaseUri(workspace.getRoot().toURI().toString()); - meta.setDocumentUri(new File(workspace.getRoot(), "test.txt").toURI().toString()); - - JCas view = jcas.createView("plainTextDocument"); - view.setDocumentText(testDocument); - - consumer.process(jcas); - - File writtenFile = new File(workspace.getRoot(), "test.xml"); - if (!writtenFile.exists()) { - fail("File not correctly written."); - } - } -} diff --git a/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XPathXmlReaderFeatureTest.java b/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XPathXmlReaderFeatureTest.java deleted file mode 100644 index 9ad317f268..0000000000 --- a/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XPathXmlReaderFeatureTest.java +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xml; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; - -import java.io.IOException; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.component.CasDumpWriter; -import org.junit.Test; - -public class XPathXmlReaderFeatureTest -{ - private static final String VALID_DOCS_ROOT = "src/test/resources/input/valid_docs"; - - @Test - public void abbreviatedFormatTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]abbr*.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - // Should find one file - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/abbr_format_reading.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void fullFormatTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]full*.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/topic", - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - // Should find one file - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/full_format_reading.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void heteroFormatsTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]full*.xml", "[+]abbr*.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/topic | /topics/top", - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - // Should find one file - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/hetero_formats_reading.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void recursiveReadingTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]**/abbr*.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - // Should find two files - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/recursive_reading.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void tagFilteringTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]abbr*.*" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - // read only num and EN-title tags - XmlXPathReader.PARAM_INCLUDE_TAGS, new String[] { "EN-title", "num" }, - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/tag_filtering.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void substitutionTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]abbr*.*" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - // Subtitute "EN-title" tag with "title" and "EN-narr" with "narration" - XmlXPathReader.PARAM_SUBSTITUTE_TAGS, new String[] { "EN-title", "title", "EN-narr", "narration" }, - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/substitution.txt" - ); - - runPipeline(reader, writer); - } - - -} diff --git a/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XPathXmlReaderIdValidationTest.java b/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XPathXmlReaderIdValidationTest.java deleted file mode 100644 index 2a51e3f39f..0000000000 --- a/dkpro-core-io-xml-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/io/xml/XPathXmlReaderIdValidationTest.java +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.io.xml; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; - -import java.io.IOException; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.component.CasDumpWriter; -import org.junit.Test; - -public class XPathXmlReaderIdValidationTest -{ - private static final String VALID_DOCS_ROOT = "src/test/resources/input/valid_docs"; - private static final String INVALID_DOCS_ROOT = "src/test/resources/input/invalid_docs"; - - // Valid docs - - @Test - public void idValidationTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]**/abbr*.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_LANGUAGE, "en", - XmlXPathReader.PARAM_DOC_ID_TAG, "num" - ); - - // Should find two files - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/id_validation.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void heteroFormatsIdValidationTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]full*.xml", "[+]abbr*.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/topic | /topics/top", - XmlXPathReader.PARAM_LANGUAGE, "en", - XmlXPathReader.PARAM_DOC_ID_TAG, "identifier | num" - ); - - // Should find two files - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/hetero_formats_id_validation.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void attributeIdTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]attribute_id.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_DOC_ID_TAG, "@num" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/attribute_id.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void deepTagIdTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]deep_tag_id.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_DOC_ID_TAG, "EN-title/num" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/deep_tag_id.txt" - ); - - runPipeline(reader, writer); - } - - - @Test - public void deepAttributeIdTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]deep_attribute_id.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_DOC_ID_TAG, "EN-title/@num" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/deep_attribute_id.txt" - ); - - runPipeline(reader, writer); - } - - - // Invalid docs - - @Test(expected = IllegalArgumentException.class) - public void invalidSubstitutionParameterTest() throws UIMAException, IOException - { - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]*.*" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_SUBSTITUTE_TAGS, new String[] { "EN-title" }, // User should provide even number parameters - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/invalid_subst_param.txt" - ); - - runPipeline(reader, writer); - } - - - @Test(expected = IllegalStateException.class) - public void emptyIdTest() throws UIMAException, IOException - { - // Doc contains ID tag but no value is provided within the tag. - // E.g. - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]empty_id.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_DOC_ID_TAG, "num", - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/empty_id.txt" - ); - - runPipeline(reader, writer); - } - - - @Test(expected = IllegalStateException.class) - public void noIdTagTest() throws UIMAException, IOException - { - // Doc doesn't contain ID tag at all - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]no_id_tag.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_DOC_ID_TAG, "num", - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/no_id_tag.txt" - ); - - runPipeline(reader, writer); - } - - - @Test(expected = IllegalStateException.class) - public void nonUniqueIdTagTest() throws UIMAException, IOException - { - // A single doc contains ID tag twice - // E.g. - // 01 - // 01 - // ..... - // ... - // </top> - CollectionReader reader = createReader( - XmlXPathReader.class, - XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT, - XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]duplicated_id_tags.xml" }, - XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", - XmlXPathReader.PARAM_DOC_ID_TAG, "num", - XmlXPathReader.PARAM_LANGUAGE, "en" - ); - - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/duplicated_id_tags.txt" - ); - - runPipeline(reader, writer); - } - - -} diff --git a/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/InlineXmlWriterTest.java b/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/InlineXmlWriterTest.java new file mode 100644 index 0000000000..f92c3664cb --- /dev/null +++ b/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/InlineXmlWriterTest.java @@ -0,0 +1,68 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.junit.Assert.fail; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.xml.InlineXmlWriter; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class InlineXmlWriterTest +{ + @Rule + public TemporaryFolder workspace = new TemporaryFolder(); + + @Test + public void testInlineXmlCasConsumer() + throws Exception + { + String testDocument = "This is a test."; + + AnalysisEngine consumer = createEngine(InlineXmlWriter.class, + InlineXmlWriter.PARAM_TARGET_LOCATION, workspace.getRoot().getPath(), + InlineXmlWriter.PARAM_STRIP_EXTENSION, true); + + JCas jcas = consumer.newJCas(); + jcas.setDocumentText(testDocument); + + DocumentMetaData meta = DocumentMetaData.create(jcas); + meta.setDocumentId("testId"); + meta.setDocumentTitle("title"); + meta.setDocumentBaseUri(workspace.getRoot().toURI().toString()); + meta.setDocumentUri(new File(workspace.getRoot(), "test.txt").toURI().toString()); + + JCas view = jcas.createView("plainTextDocument"); + view.setDocumentText(testDocument); + + consumer.process(jcas); + + File writtenFile = new File(workspace.getRoot(), "test.xml"); + if (!writtenFile.exists()) { + fail("File not correctly written."); + } + } +} diff --git a/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/SimpleXmlReaderWriterTest.java b/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/SimpleXmlReaderWriterTest.java new file mode 100644 index 0000000000..749d7e2c11 --- /dev/null +++ b/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/SimpleXmlReaderWriterTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; + +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +public class SimpleXmlReaderWriterTest +{ + @Test + public void testBasic() throws Exception + { + testRoundTrip(XmlDocumentReader.class, XmlDocumentWriter.class, + "xml/basic.xml"); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/XPathXmlReaderFeatureTest.java b/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/XPathXmlReaderFeatureTest.java new file mode 100644 index 0000000000..14a6abeba5 --- /dev/null +++ b/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/XPathXmlReaderFeatureTest.java @@ -0,0 +1,166 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; + +import java.io.IOException; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.component.CasDumpWriter; +import org.dkpro.core.io.xml.XmlXPathReader; +import org.junit.Test; + +public class XPathXmlReaderFeatureTest +{ + private static final String VALID_DOCS_ROOT = "src/test/resources/input/valid_docs"; + + @Test + public void abbreviatedFormatTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]abbr*.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + // Should find one file + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/abbr_format_reading.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void fullFormatTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]full*.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/topic", + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + // Should find one file + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/full_format_reading.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void heteroFormatsTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]full*.xml", "[+]abbr*.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/topic | /topics/top", + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + // Should find one file + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/hetero_formats_reading.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void recursiveReadingTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]**/abbr*.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + // Should find two files + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/recursive_reading.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void tagFilteringTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]abbr*.*" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + // read only num and EN-title tags + XmlXPathReader.PARAM_INCLUDE_TAGS, new String[] { "EN-title", "num" }, + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/tag_filtering.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void substitutionTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]abbr*.*" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + // Subtitute "EN-title" tag with "title" and "EN-narr" with "narration" + XmlXPathReader.PARAM_SUBSTITUTE_TAGS, new String[] { + "EN-title", "title", "EN-narr", "narration" }, + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/substitution.txt" + ); + + runPipeline(reader, writer); + } + + +} diff --git a/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/XPathXmlReaderIdValidationTest.java b/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/XPathXmlReaderIdValidationTest.java new file mode 100644 index 0000000000..b27e008693 --- /dev/null +++ b/dkpro-core-io-xml-asl/src/test/java/org/dkpro/core/io/xml/XPathXmlReaderIdValidationTest.java @@ -0,0 +1,241 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.xml; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; + +import java.io.IOException; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.component.CasDumpWriter; +import org.dkpro.core.io.xml.XmlXPathReader; +import org.junit.Test; + +public class XPathXmlReaderIdValidationTest +{ + private static final String VALID_DOCS_ROOT = "src/test/resources/input/valid_docs"; + private static final String INVALID_DOCS_ROOT = "src/test/resources/input/invalid_docs"; + + // Valid docs + + @Test + public void idValidationTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]**/abbr*.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_LANGUAGE, "en", + XmlXPathReader.PARAM_DOC_ID_TAG, "num" + ); + + // Should find two files + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/id_validation.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void heteroFormatsIdValidationTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]full*.xml", "[+]abbr*.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/topic | /topics/top", + XmlXPathReader.PARAM_LANGUAGE, "en", + XmlXPathReader.PARAM_DOC_ID_TAG, "identifier | num" + ); + + // Should find two files + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/hetero_formats_id_validation.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void attributeIdTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]attribute_id.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_DOC_ID_TAG, "@num" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/attribute_id.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void deepTagIdTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]deep_tag_id.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_DOC_ID_TAG, "EN-title/num" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/deep_tag_id.txt" + ); + + runPipeline(reader, writer); + } + + + @Test + public void deepAttributeIdTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]deep_attribute_id.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_DOC_ID_TAG, "EN-title/@num" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/deep_attribute_id.txt" + ); + + runPipeline(reader, writer); + } + + + // Invalid docs + + @Test(expected = IllegalArgumentException.class) + public void invalidSubstitutionParameterTest() throws UIMAException, IOException + { + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]*.*" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + // User should provide even number parameters + XmlXPathReader.PARAM_SUBSTITUTE_TAGS, new String[] { "EN-title" }, + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/invalid_subst_param.txt" + ); + + runPipeline(reader, writer); + } + + + @Test(expected = IllegalStateException.class) + public void emptyIdTest() throws UIMAException, IOException + { + // Doc contains ID tag but no value is provided within the tag. + // E.g. <num></num> + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]empty_id.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_DOC_ID_TAG, "num", + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/empty_id.txt" + ); + + runPipeline(reader, writer); + } + + + @Test(expected = IllegalStateException.class) + public void noIdTagTest() throws UIMAException, IOException + { + // Doc doesn't contain ID tag at all + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]no_id_tag.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_DOC_ID_TAG, "num", + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/no_id_tag.txt" + ); + + runPipeline(reader, writer); + } + + + @Test(expected = IllegalStateException.class) + public void nonUniqueIdTagTest() throws UIMAException, IOException + { + // A single doc contains ID tag twice + // E.g. <top> + // <num>01</num> + // <num>01</num> + // <title>..... + // ... + // </top> + CollectionReader reader = createReader( + XmlXPathReader.class, + XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT, + XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]duplicated_id_tags.xml" }, + XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top", + XmlXPathReader.PARAM_DOC_ID_TAG, "num", + XmlXPathReader.PARAM_LANGUAGE, "en" + ); + + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_OUTPUT_FILE, "target/output/duplicated_id_tags.txt" + ); + + runPipeline(reader, writer); + } + + +} diff --git a/dkpro-core-io-xml-asl/src/test/resources/xml/basic.xml b/dkpro-core-io-xml-asl/src/test/resources/xml/basic.xml new file mode 100644 index 0000000000..35e9d7dbc8 --- /dev/null +++ b/dkpro-core-io-xml-asl/src/test/resources/xml/basic.xml @@ -0,0 +1,3 @@ +<root xmlns="http://defaultNamespace" xmlns:style="http://styleNamespace"> + This is <style:b a="1" style:b="2">bold</style:b> text. +</root> diff --git a/dkpro-core-ixa-asl/pom.xml b/dkpro-core-ixa-asl/pom.xml index 0caeca195a..a8f90f0c89 100644 --- a/dkpro-core-ixa-asl/pom.xml +++ b/dkpro-core-ixa-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.ixa-asl</artifactId> + <artifactId>dkpro-core-ixa-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - IXA</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -46,7 +47,7 @@ <dependency> <groupId>eus.ixa</groupId> <artifactId>ixa-pipe-pos</artifactId> - <version>1.5.1</version> + <version>1.5.3</version> </dependency> <!-- <dependency> @@ -74,46 +75,50 @@ </dependency> --> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <!-- <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.ner-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-ner-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> --> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -135,9 +140,9 @@ <dependencyManagement> <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <type>pom</type> <scope>import</scope> </dependency> diff --git a/dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaLemmatizer.java b/dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaLemmatizer.java deleted file mode 100644 index 60de78d636..0000000000 --- a/dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaLemmatizer.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ixa; - -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.toText; -import static org.apache.uima.util.Level.INFO; - -import java.io.InputStream; -import java.util.Collection; -import java.util.Map; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.ixa.internal.IxaLemmatizerTagsetDescriptionProvider; -import eus.ixa.ixa.pipe.lemma.LemmatizerME; -import eus.ixa.ixa.pipe.lemma.LemmatizerModel; - -/** - * Lemmatizer using the OpenNLP-based Ixa implementation. - */ -@ResourceMetaData(name="IXA Lemmatizer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) -public class IxaLemmatizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - private CasConfigurableProviderBase<LemmatizerME> modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<LemmatizerME>(this, "lemmatizer") - { - @Override - protected LemmatizerME produceResource(InputStream aStream) - throws Exception - { - LemmatizerModel model = new LemmatizerModel(aStream); - - // Extract tagset information from the model - IxaLemmatizerTagsetDescriptionProvider tsdp = new IxaLemmatizerTagsetDescriptionProvider( - getResourceMetaData().getProperty("pos.tagset"), POS.class, - model.getLemmatizerSequenceModel(), "t0"); - addTagset(tsdp, false); - - if (printTagSet) { - getContext().getLogger().log(INFO, tsdp.toString()); - } - - return new LemmatizerME(model); - } - - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - modelProvider.configure(aJCas.getCas()); - LemmatizerME analyzer = modelProvider.getResource(); - - // Iterate over all sentences - Map<Sentence, Collection<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); - for (Sentence sentence : select(aJCas, Sentence.class)) { - Collection<Token> tokens = index.get(sentence); - String[] tokenTexts = toText(tokens).toArray(new String[tokens.size()]); - String[] tokenPos = tokens.stream() - .map(t -> { return t.getPos().getPosValue(); }) - .toArray(s -> { return new String[tokens.size()]; }); - - String[] encodedLemmas = analyzer.lemmatize(tokenTexts, tokenPos); - String[] lemmas = analyzer.decodeLemmas(tokenTexts, encodedLemmas); - - int i = 0; - for (Token t : tokens) { - String lemmaString = lemmas[i]; - if (lemmaString == null) { - lemmaString = t.getText(); - } - Lemma l = new Lemma(aJCas, t.getBegin(), t.getEnd()); - l.setValue(lemmaString); - l.addToIndexes(); - - t.setLemma(l); - i++; - } - } - } -} diff --git a/dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaPosTagger.java b/dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaPosTagger.java deleted file mode 100644 index c20a991305..0000000000 --- a/dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaPosTagger.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ixa; - -import org.apache.uima.fit.descriptor.ResourceMetaData; - -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; - -/** - * Part-of-Speech annotator using OpenNLP with IXA extensions. - */ -@ResourceMetaData(name="IXA POS-Tagger") -public class IxaPosTagger - extends OpenNlpPosTagger -{ - // The IXA POS tagger models make use of IXA classes. But they do so from within OpenNLP. - // From the outside, it looks and works exactly like an OpenNLP POS tagger. So we just - // derive from the OpenNlpPosTagger in side this module. This has the effect that through - // the module dependencies, we have the required IXA JARs on the classpath. It also has - // the effect that the package for the models changes from ...opennlp.lib to ...ixa.lib. -} diff --git a/dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/IxaLemmatizer.java b/dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/IxaLemmatizer.java new file mode 100644 index 0000000000..1b637b3f79 --- /dev/null +++ b/dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/IxaLemmatizer.java @@ -0,0 +1,184 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ixa; + +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.apache.uima.util.Level.INFO; + +import java.io.InputStream; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.ixa.internal.IxaLemmatizerTagsetDescriptionProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import eus.ixa.ixa.pipe.lemma.LemmatizerME; +import eus.ixa.ixa.pipe.lemma.LemmatizerModel; + +/** + * Lemmatizer using the OpenNLP-based Ixa implementation. + */ +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "IXA Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) +public class IxaLemmatizer + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + private CasConfigurableProviderBase<LemmatizerME> modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<LemmatizerME>(this, "lemmatizer") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/ixa/lib/lemmatizer-${language}-${variant}.properties"); + } + + @Override + protected LemmatizerME produceResource(InputStream aStream) + throws Exception + { + LemmatizerModel model = new LemmatizerModel(aStream); + + // Extract tagset information from the model + IxaLemmatizerTagsetDescriptionProvider tsdp = + new IxaLemmatizerTagsetDescriptionProvider( + getResourceMetaData().getProperty("pos.tagset"), POS.class, + model.getLemmatizerSequenceModel(), "t0"); + addTagset(tsdp, false); + + if (printTagSet) { + getContext().getLogger().log(INFO, tsdp.toString()); + } + + return new LemmatizerME(model); + } + + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + modelProvider.configure(aJCas.getCas()); + LemmatizerME analyzer = modelProvider.getResource(); + + // Iterate over all sentences + Map<Sentence, List<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); + for (Sentence sentence : select(aJCas, Sentence.class)) { + Collection<Token> tokens = index.get(sentence); + String[] tokenTexts = toText(tokens).toArray(new String[tokens.size()]); + String[] tokenPos = tokens.stream() + .map(t -> { return t.getPos().getPosValue(); }) + .toArray(s -> { return new String[tokens.size()]; }); + + String[] encodedLemmas = analyzer.lemmatize(tokenTexts, tokenPos); + String[] lemmas = analyzer.decodeLemmas(tokenTexts, encodedLemmas); + + int i = 0; + for (Token t : tokens) { + String lemmaString = lemmas[i]; + if (lemmaString == null) { + lemmaString = t.getText(); + } + Lemma l = new Lemma(aJCas, t.getBegin(), t.getEnd()); + l.setValue(lemmaString); + l.addToIndexes(); + + t.setLemma(l); + i++; + } + } + } +} diff --git a/dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/IxaPosTagger.java b/dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/IxaPosTagger.java new file mode 100644 index 0000000000..9e1c48cba8 --- /dev/null +++ b/dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/IxaPosTagger.java @@ -0,0 +1,54 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ixa; + +import static org.dkpro.core.api.resources.ResourceObjectProviderBase.LOCATION; + +import org.apache.uima.UimaContext; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.opennlp.OpenNlpPosTagger; + +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Part-of-Speech annotator using OpenNLP with IXA extensions. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "IXA POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class IxaPosTagger + extends OpenNlpPosTagger +{ + // The IXA POS tagger models make use of IXA classes. But they do so from within OpenNLP. + // From the outside, it looks and works exactly like an OpenNLP POS tagger. So we just + // derive from the OpenNlpPosTagger in side this module. This has the effect that through + // the module dependencies, we have the required IXA JARs on the classpath. It also has + // the effect that the package for the models changes from ...opennlp.lib to ...ixa.lib. + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider.setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/ixa/lib/tagger-${language}-${variant}.properties"); + } +} diff --git a/dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/internal/IxaLemmatizerTagsetDescriptionProvider.java b/dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/internal/IxaLemmatizerTagsetDescriptionProvider.java similarity index 95% rename from dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/internal/IxaLemmatizerTagsetDescriptionProvider.java rename to dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/internal/IxaLemmatizerTagsetDescriptionProvider.java index c044de4cc9..62b2d82d28 100644 --- a/dkpro-core-ixa-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ixa/internal/IxaLemmatizerTagsetDescriptionProvider.java +++ b/dkpro-core-ixa-asl/src/main/java/org/dkpro/core/ixa/internal/IxaLemmatizerTagsetDescriptionProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.ixa.internal; +package org.dkpro.core.ixa.internal; import static java.util.Collections.singletonMap; @@ -26,8 +26,8 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.reflect.FieldUtils; +import org.dkpro.core.api.metadata.TagsetBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.TagsetBase; import opennlp.tools.ml.model.AbstractModel; import opennlp.tools.ml.model.SequenceClassificationModel; diff --git a/dkpro-core-ixa-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/ixa/lib/lemmatizer-default-variants.map b/dkpro-core-ixa-asl/src/main/resources/org/dkpro/core/ixa/lib/lemmatizer-default-variants.map similarity index 100% rename from dkpro-core-ixa-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/ixa/lib/lemmatizer-default-variants.map rename to dkpro-core-ixa-asl/src/main/resources/org/dkpro/core/ixa/lib/lemmatizer-default-variants.map diff --git a/dkpro-core-ixa-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/ixa/lib/tagger-default-variants.map b/dkpro-core-ixa-asl/src/main/resources/org/dkpro/core/ixa/lib/tagger-default-variants.map similarity index 100% rename from dkpro-core-ixa-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/ixa/lib/tagger-default-variants.map rename to dkpro-core-ixa-asl/src/main/resources/org/dkpro/core/ixa/lib/tagger-default-variants.map diff --git a/dkpro-core-ixa-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaLemmatizerTest.java b/dkpro-core-ixa-asl/src/test/java/org/dkpro/core/ixa/IxaLemmatizerTest.java similarity index 97% rename from dkpro-core-ixa-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaLemmatizerTest.java rename to dkpro-core-ixa-asl/src/test/java/org/dkpro/core/ixa/IxaLemmatizerTest.java index 4e9fb3c7b5..a84c213120 100644 --- a/dkpro-core-ixa-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaLemmatizerTest.java +++ b/dkpro-core-ixa-asl/src/test/java/org/dkpro/core/ixa/IxaLemmatizerTest.java @@ -15,22 +15,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.ixa; +package org.dkpro.core.ixa; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.ixa.IxaLemmatizer; +import org.dkpro.core.ixa.IxaPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class IxaLemmatizerTest { @@ -212,7 +214,8 @@ public void testSpanish() "VSSI3S0", "VSSP1S0", "VSSP2S0", "VSSP3P0", "VSSP3S0", "W", "Z", "Zm", "Zp", "_" }; AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); - // AssertAnnotations.assertTagset(IxaPosTagger.class, POS.class, "ancora-ixa", posTags, jcas); + // AssertAnnotations.assertTagset(IxaPosTagger.class, POS.class, "ancora-ixa", + // posTags, jcas); AssertAnnotations.assertTagset(IxaLemmatizer.class, POS.class, "ancora-ixa", posTags, jcas); } diff --git a/dkpro-core-ixa-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaPosTaggerTest.java b/dkpro-core-ixa-asl/src/test/java/org/dkpro/core/ixa/IxaPosTaggerTest.java similarity index 98% rename from dkpro-core-ixa-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaPosTaggerTest.java rename to dkpro-core-ixa-asl/src/test/java/org/dkpro/core/ixa/IxaPosTaggerTest.java index e9f8c41347..d03a255635 100644 --- a/dkpro-core-ixa-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ixa/IxaPosTaggerTest.java +++ b/dkpro-core-ixa-asl/src/test/java/org/dkpro/core/ixa/IxaPosTaggerTest.java @@ -15,21 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.ixa; +package org.dkpro.core.ixa; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.ixa.IxaPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class IxaPosTaggerTest { diff --git a/dkpro-core-ixa-asl/src/test/resources/log4j.properties b/dkpro-core-ixa-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-ixa-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-ixa-asl/src/test/resources/log4j2.xml b/dkpro-core-ixa-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-ixa-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-jazzy-asl/pom.xml b/dkpro-core-jazzy-asl/pom.xml index 8c82c072b8..fafccc6d4d 100644 --- a/dkpro-core-jazzy-asl/pom.xml +++ b/dkpro-core-jazzy-asl/pom.xml @@ -1,11 +1,11 @@ <!-- - Copyright 2017 - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + Licensed to the Technische Universität Darmstadt under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The Technische Universität Darmstadt + licenses this file to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. http://www.apache.org/licenses/LICENSE-2.0 @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.jazzy-asl</artifactId> + <artifactId>dkpro-core-jazzy-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Jazzy (v ${jazzy.version}) (LGPL)</name> + <url>https://dkpro.github.io/dkpro-core/</url> <description>http://jazzy.sourceforge.net/</description> <properties> <jazzy.version>0.5.2</jazzy.version> @@ -53,28 +54,32 @@ <version>${jazzy.version}</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-frequency-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.frequency-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-anomaly-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.anomaly-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-ngrams-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.ngrams-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -82,4 +87,21 @@ <scope>test</scope> </dependency> </dependencies> + <build> + <plugins> + <plugin> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-maven-plugin</artifactId> + <configuration> + <!-- + The following components must be configured via external resources which is not + possible on the OpenMinTeD platform. + --> + <uimaDescriptorExcludes> + <exclude>**/CorrectionsContextualizer.xml</exclude> + </uimaDescriptorExcludes> + </configuration> + </plugin> + </plugins> + </build> </project> diff --git a/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/JazzyChecker.java b/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/JazzyChecker.java deleted file mode 100644 index 4dfe2f6995..0000000000 --- a/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/JazzyChecker.java +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.jazzy; - -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URL; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.FSArray; -import org.apache.uima.resource.ResourceInitializationException; - -import com.swabunga.spell.engine.SpellDictionary; -import com.swabunga.spell.engine.SpellDictionaryHashMap; -import com.swabunga.spell.engine.Word; - -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly; -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.AnnotationChecker; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * This annotator uses Jazzy for the decision whether a word is spelled correctly or not. - */ -@ResourceMetaData(name="Jazzy Spellchecker") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly", - "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction"}) - -public class JazzyChecker - extends JCasAnnotator_ImplBase -{ - /** - * Location from which the model is read. The model file is a simple word-list with one word - * per line. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) - private String dictPath; - - /** - * The character encoding used by the model. - */ - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") - private String dictEncoding; - - /** - * Determines the maximum edit distance (as an int value) that a suggestion for a spelling error may have. - * E.g. if set to one suggestions are limited to words within edit distance 1 to the original word. - */ - public static final String PARAM_SCORE_THRESHOLD = "scoreThreshold"; - @ConfigurationParameter(name = PARAM_SCORE_THRESHOLD, mandatory = true, defaultValue = "1") - private int scoreThreshold; - - private SpellDictionary dict; - - @Override - public void initialize(final UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - InputStream is = null; - try { - URL url = ResourceUtils.resolveLocation(dictPath, this, context); - this.getLogger().debug("Loading dictionary from " + url); - is = url.openStream(); - dict = new SpellDictionaryHashMap(new InputStreamReader(is, dictEncoding)); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - finally { - closeQuietly(is); - } - } - - @Override - public void process(final JCas jcas) - throws AnalysisEngineProcessException - { - - AnnotationChecker.requireExists(this, jcas, this.getLogger(), Token.class); - AnnotationChecker.requireNotExists(this, jcas, this.getLogger(), - SpellingAnomaly.class, SuggestedAction.class); - - for (Token t : select(jcas, Token.class)) { - String tokenText = t.getText(); - if (tokenText.matches("[\\.\\?\\!]")) { - continue; - } - if (!dict.isCorrect(tokenText)) { - SpellingAnomaly anomaly = new SpellingAnomaly(jcas, t.getBegin(), t.getEnd()); - - // only try to correct single character tokens if they are letters - if (tokenText.length() == 1 && !Character.isLetter(tokenText.charAt(0))) { - continue; - } - - @SuppressWarnings("unchecked") - List<Word> suggestions = dict.getSuggestions(tokenText, scoreThreshold); - - SuggestionCostTuples tuples = new SuggestionCostTuples(); - for (Word suggestion : suggestions) { - String suggestionString = suggestion.getWord(); - int cost = suggestion.getCost(); - - if (suggestionString != null) { - tuples.addTuple(suggestionString, cost); - } - } - - if (tuples.size() > 0) { - FSArray actions = new FSArray(jcas, tuples.size()); - int i=0; - for (SuggestionCostTuple tuple : tuples) { - SuggestedAction action = new SuggestedAction(jcas); - action.setReplacement(tuple.getSuggestion()); - action.setCertainty(tuple.getNormalizedCost(tuples.getMaxCost())); - - actions.set(i, action); - i++; - } - anomaly.setSuggestions(actions); - anomaly.addToIndexes(); - } - } - } - } - - class SuggestionCostTuples implements Iterable<SuggestionCostTuple> { - private final List<SuggestionCostTuple> tuples; - private int maxCost; - - public SuggestionCostTuples() - { - tuples = new ArrayList<SuggestionCostTuple>(); - maxCost = 0; - } - - public void addTuple(String suggestion, int cost) { - tuples.add(new SuggestionCostTuple(suggestion, cost)); - - if (cost > maxCost) { - maxCost = cost; - } - } - - public int getMaxCost() { - return maxCost; - } - - public int size() { - return tuples.size(); - } - - @Override - public Iterator<SuggestionCostTuple> iterator() - { - return tuples.iterator(); - } - } - - class SuggestionCostTuple { - private final String suggestion; - private final Integer cost; - - public SuggestionCostTuple(String suggestion, Integer cost) - { - this.suggestion = suggestion; - this.cost = cost; - } - - public String getSuggestion() - { - return suggestion; - } - - public Integer getCost() - { - return cost; - } - - public float getNormalizedCost(int maxCost) - { - if (maxCost > 0) { - return (float) cost / maxCost; - } - else { - return 0f; - } - } - } -} \ No newline at end of file diff --git a/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/package-info.java b/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/package-info.java deleted file mode 100644 index 67a53071df..0000000000 --- a/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Spelling correction based on <a href="http://jazzy.sourceforge.net/">Jazzy</a>. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.jazzy; diff --git a/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/util/ContextualizerUtils.java b/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/util/ContextualizerUtils.java deleted file mode 100644 index 5b99f68e25..0000000000 --- a/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/util/ContextualizerUtils.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.jazzy.util; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.jcas.tcas.Annotation; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -// NOTE: these utils were copied from DKPro Spelling as I don't want to make core dependent on it -public class ContextualizerUtils -{ - public static int getCandidatePosition(Annotation candidate, List<Token> tokens) - { - int position = -1; - - for (int i=0; i<tokens.size(); i++) { - if (tokens.get(i).getBegin() == candidate.getBegin() && - tokens.get(i).getEnd() == candidate.getEnd()) - { - position = i; - } - } - - return position; - } - - public static List<String> getChangedWords(String edit, List<String> words, int offset) { - List<String> changedWords = new ArrayList<String>(words); - changedWords.set(offset, edit); - - return changedWords; - } - - public static List<String> limitToContextWindow(List<String> words, int offset, int windowSize) { - int minOffset = offset - windowSize; - if (minOffset < 0) { - minOffset = 0; - } - - int maxOffset = offset + windowSize; - if (maxOffset >= words.size()) { - maxOffset = words.size()-1; - } - - List<String> changedWords = words.subList(minOffset, maxOffset+1); - - return changedWords; - } - - public static String getTrigram(String s1, String s2, String s3) { - StringBuilder sb = new StringBuilder(); - sb.append(s1); - sb.append(" "); - sb.append(s2); - sb.append(" "); - sb.append(s3); - return sb.toString(); - } -} diff --git a/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/CorrectionsContextualizer.java b/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/CorrectionsContextualizer.java similarity index 76% rename from dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/CorrectionsContextualizer.java rename to dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/CorrectionsContextualizer.java index 9d2e8aac7f..2b5314c594 100644 --- a/dkpro-core-jazzy-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jazzy/CorrectionsContextualizer.java +++ b/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/CorrectionsContextualizer.java @@ -1,184 +1,193 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.jazzy; - -import static de.tudarmstadt.ukp.dkpro.core.jazzy.util.ContextualizerUtils.getCandidatePosition; -import static de.tudarmstadt.ukp.dkpro.core.jazzy.util.ContextualizerUtils.getChangedWords; -import static de.tudarmstadt.ukp.dkpro.core.jazzy.util.ContextualizerUtils.getTrigram; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ExternalResource; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.FSArray; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly; -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringIterable; - -/** - * This component assumes that some spell checker has already been applied upstream (e.g. Jazzy). - * It then uses ngram frequencies from a frequency provider in order to rank the provided corrections. - */ -@ResourceMetaData(name="Corrections Contextualizer") -public class CorrectionsContextualizer - extends JCasAnnotator_ImplBase -{ - private static final String BOS ="<S>"; - - public final static String FREQUENCY_PROVIDER_RESOURCE = "FrequencyProvider"; - @ExternalResource(key = FREQUENCY_PROVIDER_RESOURCE) - private FrequencyCountProvider provider; - - protected Map<String,Long> countCache; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - countCache = new HashMap<String,Long>(); - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { - List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, sentence); - List<String> tokenStrings = JCasUtil.toText(tokens); - for (SpellingAnomaly anomaly : JCasUtil.selectCovered(jcas, SpellingAnomaly.class, sentence)) { - - FSArray suggestedActions = anomaly.getSuggestions(); - int n = suggestedActions.size(); - FSArray newActions = new FSArray(jcas, n + 1); - for (int i=0; i<n; i++) { - SuggestedAction action = (SuggestedAction) suggestedActions.get(i); - - List<String> changedWords = getChangedWords(action.getReplacement(), tokenStrings, getCandidatePosition(anomaly, tokens)); - - double probability = getSentenceProbability(changedWords); - - action.setCertainty((float) probability); - newActions.set(i, action); - - } - - // add the original word as a possibility - // might turn out that it fits in well according to ngram model - SuggestedAction newAction = new SuggestedAction(jcas); - newAction.setReplacement(anomaly.getCoveredText()); - newAction.setCertainty((float) getSentenceProbability(tokenStrings)); - newActions.set(n, newAction); - - anomaly.setSuggestions(newActions); - } - } - } - - protected double getSentenceProbability(List<String> words) throws AnalysisEngineProcessException { - double sentenceProbability = 0.0; - - if (words.size() < 1) { - return 0.0; - } - - long nrOfUnigrams; - try { - nrOfUnigrams = provider.getNrOfTokens(); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - - List<String> trigrams = new ArrayList<String>(); - - // in the google n-grams this is not represented (only single BOS markers) - // but I leave it in place in case we add another n-gram provider - trigrams.add(getTrigram(BOS, BOS, words.get(0))); - - if (words.size() > 1) { - trigrams.add(getTrigram(BOS, words.get(0), words.get(1))); - } - - for (String trigram : new NGramStringIterable(words, 3, 3)) { - trigrams.add(trigram); - } - - // FIXME - implement backoff or linear interpolation - - for (String trigram : trigrams) { - long trigramFreq = getNGramCount(trigram); - - String[] parts = StringUtils.split(trigram, " "); - - String bigram = StringUtils.join(Arrays.copyOfRange(parts, 0, 2), " "); - long bigramFreq = getNGramCount(bigram); - - String unigram = StringUtils.join(Arrays.copyOfRange(parts, 0, 1), " "); - long unigramFreq = getNGramCount(unigram); - - if (trigramFreq < 1) { - trigramFreq = 1; - } - if (bigramFreq < 1) { - bigramFreq = 1; - } - if (unigramFreq < 1) { - unigramFreq = 1; - } - - double trigramProb = Math.log( (double) trigramFreq / bigramFreq); - double bigramProb = Math.log( (double) bigramFreq / unigramFreq); - double unigramProb = Math.log( (double) unigramFreq / nrOfUnigrams); - - double interpolated = (trigramProb + bigramProb + unigramProb) / 3.0; - - sentenceProbability += interpolated; - } - - return Math.exp(sentenceProbability); - } - - protected long getNGramCount(String ngram) throws AnalysisEngineProcessException { - if (!countCache.containsKey(ngram)) { - try { - countCache.put(ngram, provider.getFrequency(ngram)); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } - - return countCache.get(ngram); - } -} \ No newline at end of file +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.jazzy; + +import static org.dkpro.core.jazzy.util.ContextualizerUtils.getCandidatePosition; +import static org.dkpro.core.jazzy.util.ContextualizerUtils.getChangedWords; +import static org.dkpro.core.jazzy.util.ContextualizerUtils.getTrigram; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.ngrams.util.NGramStringIterable; + +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly; +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * This component assumes that some spell checker has already been applied upstream (e.g. Jazzy). It + * then uses n-gram frequencies from a frequency provider in order to rank the provided corrections. + */ +@ResourceMetaData(name = "Corrections Contextualizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class CorrectionsContextualizer + extends JCasAnnotator_ImplBase +{ + private static final String BOS = "<S>"; + + /** + * Resource providing the frequency counts. + */ + public final static String RES_FREQUENCY_PROVIDER = "FrequencyProvider"; + @ExternalResource(key = RES_FREQUENCY_PROVIDER) + private FrequencyCountProvider provider; + + protected Map<String,Long> countCache; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + countCache = new HashMap<String,Long>(); + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { + List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, sentence); + List<String> tokenStrings = JCasUtil.toText(tokens); + for (SpellingAnomaly anomaly : JCasUtil.selectCovered(jcas, SpellingAnomaly.class, + sentence)) { + + FSArray suggestedActions = anomaly.getSuggestions(); + int n = suggestedActions.size(); + FSArray newActions = new FSArray(jcas, n + 1); + for (int i = 0; i < n; i++) { + SuggestedAction action = (SuggestedAction) suggestedActions.get(i); + + List<String> changedWords = getChangedWords(action.getReplacement(), + tokenStrings, getCandidatePosition(anomaly, tokens)); + + double probability = getSentenceProbability(changedWords); + + action.setCertainty((float) probability); + newActions.set(i, action); + + } + + // add the original word as a possibility + // might turn out that it fits in well according to ngram model + SuggestedAction newAction = new SuggestedAction(jcas); + newAction.setReplacement(anomaly.getCoveredText()); + newAction.setCertainty((float) getSentenceProbability(tokenStrings)); + newActions.set(n, newAction); + + anomaly.setSuggestions(newActions); + } + } + } + + protected double getSentenceProbability(List<String> words) + throws AnalysisEngineProcessException + { + double sentenceProbability = 0.0; + + if (words.size() < 1) { + return 0.0; + } + + long nrOfUnigrams; + try { + nrOfUnigrams = provider.getNrOfTokens(); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + + List<String> trigrams = new ArrayList<String>(); + + // in the google n-grams this is not represented (only single BOS markers) + // but I leave it in place in case we add another n-gram provider + trigrams.add(getTrigram(BOS, BOS, words.get(0))); + + if (words.size() > 1) { + trigrams.add(getTrigram(BOS, words.get(0), words.get(1))); + } + + for (String trigram : new NGramStringIterable(words, 3, 3)) { + trigrams.add(trigram); + } + + // FIXME - implement backoff or linear interpolation + + for (String trigram : trigrams) { + long trigramFreq = getNGramCount(trigram); + + String[] parts = StringUtils.split(trigram, " "); + + String bigram = StringUtils.join(Arrays.copyOfRange(parts, 0, 2), " "); + long bigramFreq = getNGramCount(bigram); + + String unigram = StringUtils.join(Arrays.copyOfRange(parts, 0, 1), " "); + long unigramFreq = getNGramCount(unigram); + + if (trigramFreq < 1) { + trigramFreq = 1; + } + if (bigramFreq < 1) { + bigramFreq = 1; + } + if (unigramFreq < 1) { + unigramFreq = 1; + } + + double trigramProb = Math.log( (double) trigramFreq / bigramFreq); + double bigramProb = Math.log( (double) bigramFreq / unigramFreq); + double unigramProb = Math.log( (double) unigramFreq / nrOfUnigrams); + + double interpolated = (trigramProb + bigramProb + unigramProb) / 3.0; + + sentenceProbability += interpolated; + } + + return Math.exp(sentenceProbability); + } + + protected long getNGramCount(String ngram) throws AnalysisEngineProcessException { + if (!countCache.containsKey(ngram)) { + try { + countCache.put(ngram, provider.getFrequency(ngram)); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + return countCache.get(ngram); + } +} diff --git a/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/JazzyChecker.java b/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/JazzyChecker.java new file mode 100644 index 0000000000..f08c6a0403 --- /dev/null +++ b/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/JazzyChecker.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.jazzy; + +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.AnnotationChecker; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ResourceUtils; + +import com.swabunga.spell.engine.SpellDictionary; +import com.swabunga.spell.engine.SpellDictionaryHashMap; +import com.swabunga.spell.engine.Word; + +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly; +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * This annotator uses Jazzy for the decision whether a word is spelled correctly or not. + */ +@Component(OperationType.SPELLING_CHECKER) +@ResourceMetaData(name = "Jazzy Spellchecker") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly", + "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction"}) + +public class JazzyChecker + extends JCasAnnotator_ImplBase +{ + /** + * Location from which the model is read. The model file is a simple word-list with one word + * per line. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) + private String dictPath; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") + private String dictEncoding; + + /** + * Determines the maximum edit distance (as an int value) that a suggestion for a spelling error + * may have. E.g. if set to one suggestions are limited to words within edit distance 1 to the + * original word. + */ + public static final String PARAM_SCORE_THRESHOLD = "scoreThreshold"; + @ConfigurationParameter(name = PARAM_SCORE_THRESHOLD, mandatory = true, defaultValue = "1") + private int scoreThreshold; + + private SpellDictionary dict; + + @Override + public void initialize(final UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + InputStream is = null; + try { + URL url = ResourceUtils.resolveLocation(dictPath, this, context); + this.getLogger().debug("Loading dictionary from " + url); + is = url.openStream(); + dict = new SpellDictionaryHashMap(new InputStreamReader(is, dictEncoding)); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + finally { + closeQuietly(is); + } + } + + @Override + public void process(final JCas jcas) + throws AnalysisEngineProcessException + { + + AnnotationChecker.requireExists(this, jcas, this.getLogger(), Token.class); + AnnotationChecker.requireNotExists(this, jcas, this.getLogger(), + SpellingAnomaly.class, SuggestedAction.class); + + for (Token t : select(jcas, Token.class)) { + String tokenText = t.getText(); + if (tokenText.matches("[\\.\\?\\!]")) { + continue; + } + if (!dict.isCorrect(tokenText)) { + SpellingAnomaly anomaly = new SpellingAnomaly(jcas, t.getBegin(), t.getEnd()); + + // only try to correct single character tokens if they are letters + if (tokenText.length() == 1 && !Character.isLetter(tokenText.charAt(0))) { + continue; + } + + @SuppressWarnings("unchecked") + List<Word> suggestions = dict.getSuggestions(tokenText, scoreThreshold); + + SuggestionCostTuples tuples = new SuggestionCostTuples(); + for (Word suggestion : suggestions) { + String suggestionString = suggestion.getWord(); + int cost = suggestion.getCost(); + + if (suggestionString != null) { + tuples.addTuple(suggestionString, cost); + } + } + + if (tuples.size() > 0) { + FSArray actions = new FSArray(jcas, tuples.size()); + int i = 0; + for (SuggestionCostTuple tuple : tuples) { + SuggestedAction action = new SuggestedAction(jcas); + action.setReplacement(tuple.getSuggestion()); + action.setCertainty(tuple.getNormalizedCost(tuples.getMaxCost())); + + actions.set(i, action); + i++; + } + anomaly.setSuggestions(actions); + anomaly.addToIndexes(); + } + } + } + } + + class SuggestionCostTuples implements Iterable<SuggestionCostTuple> { + private final List<SuggestionCostTuple> tuples; + private int maxCost; + + public SuggestionCostTuples() + { + tuples = new ArrayList<SuggestionCostTuple>(); + maxCost = 0; + } + + public void addTuple(String suggestion, int cost) { + tuples.add(new SuggestionCostTuple(suggestion, cost)); + + if (cost > maxCost) { + maxCost = cost; + } + } + + public int getMaxCost() { + return maxCost; + } + + public int size() { + return tuples.size(); + } + + @Override + public Iterator<SuggestionCostTuple> iterator() + { + return tuples.iterator(); + } + } + + class SuggestionCostTuple { + private final String suggestion; + private final Integer cost; + + public SuggestionCostTuple(String suggestion, Integer cost) + { + this.suggestion = suggestion; + this.cost = cost; + } + + public String getSuggestion() + { + return suggestion; + } + + public Integer getCost() + { + return cost; + } + + public float getNormalizedCost(int maxCost) + { + if (maxCost > 0) { + return (float) cost / maxCost; + } + else { + return 0f; + } + } + } +} diff --git a/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/package-info.java b/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/package-info.java new file mode 100644 index 0000000000..5ca52e7e26 --- /dev/null +++ b/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Spelling correction based on <a href="http://jazzy.sourceforge.net/">Jazzy</a>. + * + * @since 1.1.0 + */ +package org.dkpro.core.jazzy; diff --git a/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/util/ContextualizerUtils.java b/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/util/ContextualizerUtils.java new file mode 100644 index 0000000000..2ad3126a81 --- /dev/null +++ b/dkpro-core-jazzy-asl/src/main/java/org/dkpro/core/jazzy/util/ContextualizerUtils.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.jazzy.util; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.jcas.tcas.Annotation; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +// NOTE: these utils were copied from DKPro Spelling as I don't want to make core dependent on it +public class ContextualizerUtils +{ + public static int getCandidatePosition(Annotation candidate, List<Token> tokens) + { + int position = -1; + + for (int i = 0; i < tokens.size(); i++) { + if (tokens.get(i).getBegin() == candidate.getBegin() && + tokens.get(i).getEnd() == candidate.getEnd()) + { + position = i; + } + } + + return position; + } + + public static List<String> getChangedWords(String edit, List<String> words, int offset) { + List<String> changedWords = new ArrayList<String>(words); + changedWords.set(offset, edit); + + return changedWords; + } + + public static List<String> limitToContextWindow(List<String> words, int offset, int windowSize) + { + int minOffset = offset - windowSize; + if (minOffset < 0) { + minOffset = 0; + } + + int maxOffset = offset + windowSize; + if (maxOffset >= words.size()) { + maxOffset = words.size() - 1; + } + + List<String> changedWords = words.subList(minOffset, maxOffset + 1); + + return changedWords; + } + + public static String getTrigram(String s1, String s2, String s3) { + StringBuilder sb = new StringBuilder(); + sb.append(s1); + sb.append(" "); + sb.append(s2); + sb.append(" "); + sb.append(s3); + return sb.toString(); + } +} diff --git a/dkpro-core-jazzy-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/jazzy/JazzyCheckerTest.java b/dkpro-core-jazzy-asl/src/test/java/org/dkpro/core/jazzy/JazzyCheckerTest.java similarity index 79% rename from dkpro-core-jazzy-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/jazzy/JazzyCheckerTest.java rename to dkpro-core-jazzy-asl/src/test/java/org/dkpro/core/jazzy/JazzyCheckerTest.java index 414e7213aa..9f11b11d52 100644 --- a/dkpro-core-jazzy-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/jazzy/JazzyCheckerTest.java +++ b/dkpro-core-jazzy-asl/src/test/java/org/dkpro/core/jazzy/JazzyCheckerTest.java @@ -1,24 +1,25 @@ /* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.jazzy; +package org.dkpro.core.jazzy; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; import static org.apache.uima.fit.util.JCasUtil.select; import static org.junit.Assert.assertEquals; @@ -26,14 +27,13 @@ import java.util.List; import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.fit.factory.ExternalResourceFactory; import org.apache.uima.fit.testing.factory.TokenBuilder; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ExternalResourceDescription; +import org.dkpro.core.api.frequency.TestFrequencyCountResource; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.TestFrequencyCountResource; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; @@ -78,15 +78,17 @@ public void contextualizedSpellCheckerTest() { String testDocumentEnglish = "The cat sta on the mat ."; - ExternalResourceDescription resource = ExternalResourceFactory.createExternalResourceDescription(TestFrequencyCountResource.class); + ExternalResourceDescription resource = createResourceDescription( + TestFrequencyCountResource.class); // String context = DkproContext.getContext().getWorkspace("web1t").getAbsolutePath(); // String workspace = "en"; -// ExternalResourceDescription resource = ExternalResourceFactory.createExternalResourceDescription( +// ExternalResourceDescription resource = createExternalResourceDescription( // Web1TFrequencyCountResource.class, // Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", // Web1TFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "3", -// Web1TFrequencyCountResource.PARAM_INDEX_PATH, new File(context, workspace).getAbsolutePath() +// Web1TFrequencyCountResource.PARAM_INDEX_PATH, +// new File(context, workspace).getAbsolutePath() // ); AnalysisEngine engine = createEngine( @@ -98,7 +100,7 @@ public void contextualizedSpellCheckerTest() ), createEngineDescription( CorrectionsContextualizer.class, - CorrectionsContextualizer.FREQUENCY_PROVIDER_RESOURCE, resource + CorrectionsContextualizer.RES_FREQUENCY_PROVIDER, resource ) ) ); diff --git a/dkpro-core-jieba-asl/LICENSE.txt b/dkpro-core-jieba-asl/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/dkpro-core-jieba-asl/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dkpro-core-jieba-asl/pom.xml b/dkpro-core-jieba-asl/pom.xml new file mode 100644 index 0000000000..63c3d69db5 --- /dev/null +++ b/dkpro-core-jieba-asl/pom.xml @@ -0,0 +1,65 @@ +<!-- + Licensed to the Technische Universität Darmstadt under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The Technische Universität Darmstadt + licenses this file to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> + <relativePath>../dkpro-core-asl</relativePath> + </parent> + <artifactId>dkpro-core-jieba-asl</artifactId> + <name>DKPro Core ASL - Jieba (v ${jieba.version}) (ASL)</name> + <url>https://dkpro.github.io/dkpro-core/</url> + <properties> + <jieba.version>1.0.2</jieba.version> + </properties> + <dependencies> + <dependency> + <groupId>com.huaban</groupId> + <artifactId>jieba-analysis</artifactId> + <version>${jieba.version}</version> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimaj-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-core</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> + <scope>test</scope> + </dependency> + </dependencies> +</project> \ No newline at end of file diff --git a/dkpro-core-jieba-asl/src/main/java/org/dkpro/core/jieba/JiebaSegmenter.java b/dkpro-core-jieba-asl/src/main/java/org/dkpro/core/jieba/JiebaSegmenter.java new file mode 100644 index 0000000000..6d7fce7d15 --- /dev/null +++ b/dkpro-core-jieba-asl/src/main/java/org/dkpro/core/jieba/JiebaSegmenter.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.jieba; + +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; +import com.huaban.analysis.jieba.SegToken; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Segmenter for Japanese using <a href="https://github.com/huaban/jieba-analysis">Jieba</a>. + */ +@ResourceMetaData(name = "Jieba Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("zh") +@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) +public class JiebaSegmenter + extends SegmenterBase +{ + private com.huaban.analysis.jieba.JiebaSegmenter jieba; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + jieba = new com.huaban.analysis.jieba.JiebaSegmenter(); + } + + @Override + protected void process(JCas aJCas, String text, int zoneBegin) + throws AnalysisEngineProcessException + { + int sentenceBegin = 0; + int sentenceEnd = text.indexOf("。"); + while (sentenceEnd > sentenceBegin) { + String stext = text.substring(sentenceBegin, sentenceEnd + 1); + + processSentence(aJCas, stext, zoneBegin + sentenceBegin); + + sentenceBegin = sentenceEnd + 1; + sentenceEnd = text.indexOf("。", sentenceBegin); + } + + if (sentenceBegin < text.length()) { + String stext = text.substring(sentenceBegin, text.length()); + processSentence(aJCas, stext, zoneBegin + sentenceBegin); + } + } + + private Sentence processSentence(JCas aJCas, String text, int zoneBegin) + { + String innerText = text; + boolean addFinalToken = false; + if (innerText.endsWith("。")) { + innerText = text.substring(0, text.length() - 1); + addFinalToken = true; + } + + Annotation firstToken = null; + Annotation lastToken = null; + + List<SegToken> tokens = jieba.process(innerText, SegMode.SEARCH); + for (SegToken t : tokens) { + Annotation ut = createToken(aJCas, t.startOffset + zoneBegin, t.endOffset + zoneBegin); + + // Tokenizer reports whitespace as tokens - we don't add whitespace-only tokens. + if (ut == null) { + continue; + } + + if (firstToken == null) { + firstToken = ut; + } + + lastToken = ut; + } + + if (addFinalToken) { + lastToken = createToken(aJCas, zoneBegin + text.length() - 1, + zoneBegin + text.length()); + } + + if (firstToken != null && lastToken != null) { + return createSentence(aJCas, firstToken.getBegin(), lastToken.getEnd()); + } + else { + return null; + } + } +} diff --git a/dkpro-core-jieba-asl/src/test/java/org/dkpro/core/jieba/JiebaSegmenterTest.java b/dkpro-core-jieba-asl/src/test/java/org/dkpro/core/jieba/JiebaSegmenterTest.java new file mode 100644 index 0000000000..a6fb386eb2 --- /dev/null +++ b/dkpro-core-jieba-asl/src/test/java/org/dkpro/core/jieba/JiebaSegmenterTest.java @@ -0,0 +1,60 @@ +package org.dkpro.core.jieba; +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class JiebaSegmenterTest +{ + @Test + public void testChinese() throws Exception + { + JCas jcas = JCasFactory.createText("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python" + + "和C++。我不喜欢日本和服。", "zh"); + + AnalysisEngine aed = createEngine(JiebaSegmenter.class); + aed.process(jcas); + + String[] tokens = { "这是", "一个", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "孙悟空", + ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。", "我", "不", + "喜欢", "日本", "和服", "。" }; + + assertToken(tokens, select(jcas, Token.class)); + + String[] sentences = { "这是一个伸手不见五指的黑夜。", "我叫孙悟空,我爱北京,我爱Python和C++。", + "我不喜欢日本和服。" }; + + assertSentence(sentences, select(jcas, Sentence.class)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-jtok-asl/pom.xml b/dkpro-core-jtok-asl/pom.xml index 84b759ed1d..004b53833b 100644 --- a/dkpro-core-jtok-asl/pom.xml +++ b/dkpro-core-jtok-asl/pom.xml @@ -18,18 +18,27 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.jtok-asl</artifactId> + <artifactId>dkpro-core-jtok-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - JTok (v ${jtok.version}) (LGPL)</name> - <description>http://jazzy.sourceforge.net/</description> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> - <jtok.version>2.1.18</jtok.version> + <jtok.version>2.1.19</jtok.version> </properties> + <repositories> + <repository> + <id>mlt.jfrog.io</id> + <url>https://mlt.jfrog.io/artifactory/mlt-mvn-releases-local/</url> + <snapshots> + <enabled>false</enabled> + </snapshots> + </repository> + </repositories> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -43,14 +52,21 @@ <groupId>de.dfki.lt.jtok</groupId> <artifactId>jtok-core</artifactId> <version>${jtok.version}</version> + <exclusions> + <!-- Libraries should not depend on a specific logging backend --> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -58,8 +74,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> </dependencies> diff --git a/dkpro-core-jtok-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jtok/JTokSegmenter.java b/dkpro-core-jtok-asl/src/main/java/org/dkpro/core/jtok/JTokSegmenter.java similarity index 90% rename from dkpro-core-jtok-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jtok/JTokSegmenter.java rename to dkpro-core-jtok-asl/src/main/java/org/dkpro/core/jtok/JTokSegmenter.java index 0de7664d59..a12881c101 100644 --- a/dkpro-core-jtok-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/jtok/JTokSegmenter.java +++ b/dkpro-core-jtok-asl/src/main/java/org/dkpro/core/jtok/JTokSegmenter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.jtok; +package org.dkpro.core.jtok; import java.io.IOException; import java.io.InputStream; @@ -32,6 +32,8 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.segmentation.SegmenterBase; import de.dfki.lt.tools.tokenizer.FileTools; import de.dfki.lt.tools.tokenizer.JTok; @@ -40,13 +42,11 @@ import de.dfki.lt.tools.tokenizer.output.Paragraph; import de.dfki.lt.tools.tokenizer.output.TextUnit; import de.dfki.lt.tools.tokenizer.output.Token; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; /** * JTok segmenter. */ -@ResourceMetaData(name="JTok Segmenter") +@ResourceMetaData(name = "JTok Segmenter") @LanguageCapability({"en", "de", "it"}) @TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", @@ -59,11 +59,14 @@ public class JTokSegmenter * Create {@link Paragraph} annotations. */ public static final String PARAM_WRITE_PARAGRAPH = ComponentParameters.PARAM_WRITE_PARAGRAPH; - @ConfigurationParameter(name=PARAM_WRITE_PARAGRAPH, mandatory=true, defaultValue="true") + @ConfigurationParameter(name = PARAM_WRITE_PARAGRAPH, mandatory = true, defaultValue = "true") private boolean writeParagraph; - + + /** + * Use PTB-escaping when setting the token form. + */ public static final String PARAM_PTB_ESCAPING = "ptbEscaping"; - @ConfigurationParameter(name=PARAM_PTB_ESCAPING, mandatory=true, defaultValue="false") + @ConfigurationParameter(name = PARAM_PTB_ESCAPING, mandatory = true, defaultValue = "false") private boolean ptbEscaping; private JTok tokenizer; diff --git a/dkpro-core-jtok-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/jtok/JTokSegmenterTest.java b/dkpro-core-jtok-asl/src/test/java/org/dkpro/core/jtok/JTokSegmenterTest.java similarity index 90% rename from dkpro-core-jtok-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/jtok/JTokSegmenterTest.java rename to dkpro-core-jtok-asl/src/test/java/org/dkpro/core/jtok/JTokSegmenterTest.java index c7ce977093..426205b48b 100644 --- a/dkpro-core-jtok-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/jtok/JTokSegmenterTest.java +++ b/dkpro-core-jtok-asl/src/test/java/org/dkpro/core/jtok/JTokSegmenterTest.java @@ -15,15 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.jtok; +package org.dkpro.core.jtok; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; - public class JTokSegmenterTest { @Test diff --git a/dkpro-core-kuromoji-asl/pom.xml b/dkpro-core-kuromoji-asl/pom.xml index 63fcac18cc..fd15e9571b 100644 --- a/dkpro-core-kuromoji-asl/pom.xml +++ b/dkpro-core-kuromoji-asl/pom.xml @@ -15,19 +15,20 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <groupId>org.dkpro.core</groupId> <artifactId>dkpro-core-kuromoji-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Kuromoji (v ${kuromoji.version})</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> <kuromoji.version>0.9.0</kuromoji.version> </properties> @@ -46,8 +47,8 @@ <version>${kuromoji.version}</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -55,8 +56,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> </dependencies> diff --git a/dkpro-core-kuromoji-asl/src/main/java/org/dkpro/core/kuromoji/KuromojiSegmenter.java b/dkpro-core-kuromoji-asl/src/main/java/org/dkpro/core/kuromoji/KuromojiSegmenter.java index c3a3a54ede..ca9ebce594 100644 --- a/dkpro-core-kuromoji-asl/src/main/java/org/dkpro/core/kuromoji/KuromojiSegmenter.java +++ b/dkpro-core-kuromoji-asl/src/main/java/org/dkpro/core/kuromoji/KuromojiSegmenter.java @@ -18,6 +18,7 @@ package org.dkpro.core.kuromoji; import java.util.List; + import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.LanguageCapability; @@ -26,17 +27,19 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.segmentation.SegmenterBase; import com.atilika.kuromoji.ipadic.Token; import com.atilika.kuromoji.ipadic.Tokenizer; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Segmenter for Japanese using <a href="https://github.com/atilika/kuromoji">Kuromojo</a>. */ -@ResourceMetaData(name="Kuromoji Segmenter") +@ResourceMetaData(name = "Kuromoji Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @LanguageCapability("ja") @TypeCapability( outputs = { diff --git a/dkpro-core-kuromoji-asl/src/test/java/org/dkpro/core/kuromoji/KuromojiSegmenterTest.java b/dkpro-core-kuromoji-asl/src/test/java/org/dkpro/core/kuromoji/KuromojiSegmenterTest.java index 096a51a4cc..6464a508c6 100644 --- a/dkpro-core-kuromoji-asl/src/test/java/org/dkpro/core/kuromoji/KuromojiSegmenterTest.java +++ b/dkpro-core-kuromoji-asl/src/test/java/org/dkpro/core/kuromoji/KuromojiSegmenterTest.java @@ -25,7 +25,8 @@ import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.dkpro.core.kuromoji.KuromojiSegmenter; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; @@ -33,8 +34,6 @@ import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class KuromojiSegmenterTest { diff --git a/dkpro-core-kuromoji-asl/src/test/resources/log4j.properties b/dkpro-core-kuromoji-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9f0bdd6149..0000000000 --- a/dkpro-core-kuromoji-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,12 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO - -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasWriter = WARN -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase = WARN \ No newline at end of file diff --git a/dkpro-core-kuromoji-asl/src/test/resources/log4j2.xml b/dkpro-core-kuromoji-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-kuromoji-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-lancaster-asl/pom.xml b/dkpro-core-lancaster-asl/pom.xml deleted file mode 100644 index 3664acc941..0000000000 --- a/dkpro-core-lancaster-asl/pom.xml +++ /dev/null @@ -1,117 +0,0 @@ -<!-- - Licensed to the Technische Universität Darmstadt under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The Technische Universität Darmstadt - licenses this file to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> - <relativePath>../dkpro-core-asl</relativePath> - </parent> - <groupId>org.dkpro.core</groupId> - <artifactId>dkpro-core-lancaster-asl</artifactId> - <packaging>jar</packaging> - <name>DKPro Core ASL - Lancaster</name> - <description>http://haifengl.github.io/smile</description> - <dependencies> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimaj-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimafit-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> - <dependency> - <groupId>com.github.haifengl</groupId> - <artifactId>smile-nlp</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent</artifactId> - <scope>test</scope> - </dependency> - </dependencies> - <dependencyManagement> - <dependencies> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> - <type>pom</type> - <scope>import</scope> - </dependency> - </dependencies> - </dependencyManagement> - <build> - <pluginManagement> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-dependency-plugin</artifactId> - <configuration> - <usedDependencies> - <!-- Models not detected by byte-code analysis --> - <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent</usedDependency> - </usedDependencies> - </configuration> - </plugin> - </plugins> - </pluginManagement> - </build> -</project> diff --git a/dkpro-core-lancaster-asl/src/main/java/org/dkpro/core/lancaster/LancasterStemmer.java b/dkpro-core-lancaster-asl/src/main/java/org/dkpro/core/lancaster/LancasterStemmer.java deleted file mode 100644 index 71e820c258..0000000000 --- a/dkpro-core-lancaster-asl/src/main/java/org/dkpro/core/lancaster/LancasterStemmer.java +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Licensed to the Technische Universität Darmstadt under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.lancaster; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.Collections; -import java.util.Locale; -import java.util.Set; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIterator; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.cas.text.AnnotationIndex; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.CasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * This Paice/Husk Lancaster stemmer implementation only works with the English language so far. - */ -@ResourceMetaData(name="Lancaster Stemmer") -@LanguageCapability("en") -@TypeCapability( - inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, - outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem" }) -public class LancasterStemmer - extends FeaturePathAnnotatorBase -{ - private static final String MESSAGE_DIGEST = LancasterStemmer.class.getName() + "_Messages"; - - /** - * True if the stemmer will strip prefix such as kilo, micro, milli, intra, ultra, mega, nano, - * pico, pseudo. - */ - public static final String PARAM_STRIP_PREFIXES = "stripPrefix"; - @ConfigurationParameter(name = PARAM_STRIP_PREFIXES, mandatory = true, defaultValue = "false") - private boolean stripPrefix; - - /** - * Specifies an URL that should resolve to a location from where to load custom rules. If the - * location starts with {@code classpath:} the location is interpreted as a classpath location, - * e.g. "classpath:my/path/to/the/rules". Otherwise it is tried as an URL, file and at last UIMA - * resource. - * - * @see ResourceUtils - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - private String modelLocation; - - /** - * Specifies the language supported by the stemming model. Default value is "en" (English). - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true, defaultValue = "en") - protected String language; - - /** - * The stemmer only has to be initialized once since it's used like a pure function with the given - * configuration parameters. - */ - private smile.nlp.stemmer.LancasterStemmer stemmer; - - @Override - protected Set<String> getDefaultPaths() - { - return Collections.singleton(Token.class.getName()); - } - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - language = language.toLowerCase(); - - if (modelLocation != null) { - try { - - URL url = ResourceUtils.resolveLocation(modelLocation, this, aContext); - stemmer = new smile.nlp.stemmer.LancasterStemmer(url.openStream(), stripPrefix); - } catch (MalformedURLException e) { - throw new ResourceInitializationException(e); - } catch (IOException e) { - throw new ResourceInitializationException(e); - } - } else { - stemmer = new smile.nlp.stemmer.LancasterStemmer(stripPrefix); - } - } - - @Override - protected void generateAnnotations(JCas jcas) - throws FeaturePathException, AnalysisEngineProcessException - { - // CAS is necessary to retrieve values - CAS currCAS = jcas.getCas(); - - // Try language set in CAS. - String lang = jcas.getDocumentLanguage(); - - if (StringUtils.isBlank(lang)) { - throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null); - } - - lang = lang.toLowerCase(Locale.US); - - if (!language.equals(lang)) { // Only specified language is supported - throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "unsupported_language_error", - new Object[] { lang }); - } - - - for (String path : paths) { - // Separate Typename and featurepath - String[] segments = path.split("/", 2); - String typeName = segments[0]; - - // Try to get the type from the typesystem of the CAS - Type t = CasUtil.getType(currCAS, typeName); - if (t == null) { - throw new IllegalStateException("Type [" + typeName + "] not found in type system"); - } - - // get an fpi object and initialize it - // initialize the FeaturePathInfo with the corresponding part - initializeFeaturePathInfoFrom(fp, segments); - - // get the annotations - AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t); - FSIterator<?> iterator = idx.iterator(); - - while (iterator.hasNext()) { - AnnotationFS fs = (AnnotationFS) iterator.next(); - - if (this.filterFeaturePath != null) { - // check annotation filter condition - if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { - createStemAnnotation(jcas, stemmer, fs); - } - } - else { // no annotation filter specified - createStemAnnotation(jcas, stemmer, fs); - } - } - } - - } - - private void createStemAnnotation(JCas jcas, smile.nlp.stemmer.LancasterStemmer stemmer, - AnnotationFS fs) - throws AnalysisEngineProcessException - { - // Check for blank text, it makes no sense to add a stem then (and raised an exception) - String value = fp.getValue(fs); - if (!StringUtils.isBlank(value)) { - Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); - - stemAnnot.setValue(stemmer.stem(value)); - stemAnnot.addToIndexes(jcas); - - // Try setting the "stem" feature on Tokens. - Feature feat = fs.getType().getFeatureByBaseName("stem"); - if (feat != null && feat.getRange() != null - && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { - fs.setFeatureValue(feat, stemAnnot); - } - } - } - -} diff --git a/dkpro-core-lancaster-asl/src/main/java/org/dkpro/core/lancaster/package-info.java b/dkpro-core-lancaster-asl/src/main/java/org/dkpro/core/lancaster/package-info.java deleted file mode 100644 index feeaf625e4..0000000000 --- a/dkpro-core-lancaster-asl/src/main/java/org/dkpro/core/lancaster/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Technische Universität Darmstadt under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Lancaster stemmer based on the <a href="http://haifengl.github.io/smile">Smile</a> machine learning package. - * - * @since 1.9.0 - */ -package org.dkpro.core.lancaster; diff --git a/dkpro-core-lancaster-asl/src/test/java/org/dkpro/core/lancaster/LancasterStemmerTest.java b/dkpro-core-lancaster-asl/src/test/java/org/dkpro/core/lancaster/LancasterStemmerTest.java deleted file mode 100644 index a81d152828..0000000000 --- a/dkpro-core-lancaster-asl/src/test/java/org/dkpro/core/lancaster/LancasterStemmerTest.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Technische Universität Darmstadt under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.lancaster; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -public class LancasterStemmerTest -{ - @Test - public void testEnglish() - throws Exception - { - runTest("en", "computers Computers deliberately", - new String[] {"comput", "comput", "delib"} ); - - runTest("en", "We need a very complicated example sentence , which " + - "contains as many constituents and dependencies as possible .", - new String[] { "we", "need", "a", "very", "comply", "exampl", "sent", "", - "which", "contain", "as", "many", "constitu", "and", "depend", "as", "poss", - "" }); - } - - @Test - public void testEnglishWithDefaultRulesConfiguration() - throws Exception - { - runTest("en", "proceed", - new String[] {"process"} - ); - } - - @Test - public void testEnglishWithClassPathRulesConfiguration() - throws Exception - { - runTest("en", "proceed", - new String[] {"procee"}, // using default rules the expected would be process - LancasterStemmer.PARAM_MODEL_LOCATION, "classpath:Lancaster_test_rules.txt" - ); - } - - @Test - public void testEnglishWithFilePathRulesConfiguration() - throws Exception - { - runTest("en", "proceed", - new String[] {"procee"}, // using default rules the expected would be process - LancasterStemmer.PARAM_MODEL_LOCATION, "file:src/test/resources/Lancaster_test_rules.txt" - ); - } - - @Test - public void testAlternativeLanguageConfiguration() - throws Exception - { - runTest("dl", "proceed", - new String[] {"procee"}, // using default rules the expected would be process - LancasterStemmer.PARAM_MODEL_LOCATION, "classpath:Lancaster_test_rules.txt", - LancasterStemmer.PARAM_LANGUAGE, "dl" - ); - } - - @Test - public void testEnglishCaseInsensitive() - throws Exception - { - runTest("en", "EDUCATIONAL Educational educational", - new String[] {"educ", "educ", "educ"}); - } - - @Test - public void testEnglishCaseFiltered() - throws Exception - { - String[] stems = { "educ" }; - String[] pos = { "NNS", "JJ", "NN", "NNS" }; - - AnalysisEngineDescription aggregate = createEngineDescription( - createEngineDescription(OpenNlpPosTagger.class), - createEngineDescription(LancasterStemmer.class, - LancasterStemmer.PARAM_FILTER_FEATUREPATH, "pos/PosValue", - LancasterStemmer.PARAM_FILTER_CONDITION_OPERATOR, "EQUALS", - LancasterStemmer.PARAM_FILTER_CONDITION_VALUE, "JJ")); - - JCas result = TestRunner.runTest(aggregate, "en", "Babies educational sleep .s"); - - AssertAnnotations.assertStem(stems, select(result, Stem.class)); - AssertAnnotations.assertPOS(null, pos, select(result, POS.class)); - } - - private JCas runTest(String aLanguage, String aText, String[] aStems, Object... aParams) - throws Exception - { - JCas result = TestRunner.runTest(createEngineDescription(LancasterStemmer.class, aParams), - aLanguage, aText); - - AssertAnnotations.assertStem(aStems, select(result, Stem.class)); - - return result; - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-lancaster-asl/src/test/resources/log4j.properties b/dkpro-core-lancaster-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-lancaster-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-langdetect-asl/pom.xml b/dkpro-core-langdetect-asl/pom.xml index ed1c4d0e64..53093de0d7 100644 --- a/dkpro-core-langdetect-asl/pom.xml +++ b/dkpro-core-langdetect-asl/pom.xml @@ -19,14 +19,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.langdetect-asl</artifactId> + <artifactId>dkpro-core-langdetect-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - langdetect (v ${langdetect.version})</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> <langdetect.version>1.1-20120112</langdetect.version> </properties> @@ -40,26 +41,30 @@ <artifactId>uimafit-core</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> <groupId>com.cybozu.labs</groupId> <artifactId>langdetect</artifactId> <version>${langdetect.version}</version> </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> diff --git a/dkpro-core-langdetect-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/langdetect/package-info.java b/dkpro-core-langdetect-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/langdetect/package-info.java deleted file mode 100644 index 2ba532ef72..0000000000 --- a/dkpro-core-langdetect-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/langdetect/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Language detector based on n-gram frequency counts, e.g. as provided by Web1T. - * - * @since 1.5.0 - */ -package de.tudarmstadt.ukp.dkpro.core.langdetect; diff --git a/dkpro-core-langdetect-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/langdetect/LangDetectLanguageIdentifier.java b/dkpro-core-langdetect-asl/src/main/java/org/dkpro/core/langdetect/LangDetectLanguageIdentifier.java similarity index 78% rename from dkpro-core-langdetect-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/langdetect/LangDetectLanguageIdentifier.java rename to dkpro-core-langdetect-asl/src/main/java/org/dkpro/core/langdetect/LangDetectLanguageIdentifier.java index 4b49bade2b..b60c945253 100644 --- a/dkpro-core-langdetect-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/langdetect/LangDetectLanguageIdentifier.java +++ b/dkpro-core-langdetect-asl/src/main/java/org/dkpro/core/langdetect/LangDetectLanguageIdentifier.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.langdetect; +package org.dkpro.core.langdetect; import java.io.File; import java.io.IOException; @@ -28,15 +28,18 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; import com.cybozu.labs.langdetect.Detector; import com.cybozu.labs.langdetect.DetectorFactory; import com.cybozu.labs.langdetect.LangDetectException; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Langdetect language identifier based on character n-grams. @@ -45,7 +48,9 @@ * instantiated multiple times with different model locations. Only a single model location * can be active at a time over <b>all</b> instances of this component. */ -@ResourceMetaData(name="LangDetect") +@Component(OperationType.LANGUAGE_IDENTIFIER) +@ResourceMetaData(name = "LangDetect") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class LangDetectLanguageIdentifier extends JCasAnnotator_ImplBase { @@ -57,6 +62,20 @@ public class LangDetectLanguageIdentifier @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -83,6 +102,7 @@ public void initialize(UimaContext context) { setContextObject(LangDetectLanguageIdentifier.this); + setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/langdetect"); setDefault(ARTIFACT_ID, "${groupId}.langdetect-model-${language}-${variant}"); setDefault(LOCATION, "classpath:/${package}/lib/languageidentifier-${language}-${variant}.properties"); diff --git a/dkpro-core-langdetect-asl/src/main/java/org/dkpro/core/langdetect/package-info.java b/dkpro-core-langdetect-asl/src/main/java/org/dkpro/core/langdetect/package-info.java new file mode 100644 index 0000000000..8a0590debb --- /dev/null +++ b/dkpro-core-langdetect-asl/src/main/java/org/dkpro/core/langdetect/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Language detector based on n-gram frequency counts, e.g. as provided by Web1T. + * + * @since 1.5.0 + */ +package org.dkpro.core.langdetect; diff --git a/dkpro-core-langdetect-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/langdetect/LangDetectLanguageIdentifierTest.java b/dkpro-core-langdetect-asl/src/test/java/org/dkpro/core/langdetect/LangDetectLanguageIdentifierTest.java similarity index 89% rename from dkpro-core-langdetect-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/langdetect/LangDetectLanguageIdentifierTest.java rename to dkpro-core-langdetect-asl/src/test/java/org/dkpro/core/langdetect/LangDetectLanguageIdentifierTest.java index a4b4b5b9e6..9b80fbe86a 100644 --- a/dkpro-core-langdetect-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/langdetect/LangDetectLanguageIdentifierTest.java +++ b/dkpro-core-langdetect-asl/src/test/java/org/dkpro/core/langdetect/LangDetectLanguageIdentifierTest.java @@ -15,20 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.langdetect; +package org.dkpro.core.langdetect; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.junit.Assert.assertEquals; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.langdetect.LangDetectLanguageIdentifier; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - public class LangDetectLanguageIdentifierTest { @Test diff --git a/dkpro-core-langdetect-asl/src/test/resources/log4j.properties b/dkpro-core-langdetect-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-langdetect-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-langdetect-asl/src/test/resources/log4j2.xml b/dkpro-core-langdetect-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-langdetect-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-languagetool-asl/pom.xml b/dkpro-core-languagetool-asl/pom.xml index a93a0d0568..06a0e931b1 100644 --- a/dkpro-core-languagetool-asl/pom.xml +++ b/dkpro-core-languagetool-asl/pom.xml @@ -18,17 +18,18 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.languagetool-asl</artifactId> + <artifactId>dkpro-core-languagetool-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - LanguageTool (v ${languagetool.version}) (LGPL)</name> + <url>https://dkpro.github.io/dkpro-core/</url> <description>Grammar checker based on LanguageTool (LGPL)</description> <properties> - <languagetool.version>3.9</languagetool.version> + <languagetool.version>5.2</languagetool.version> </properties> <dependencies> <dependency> @@ -45,33 +46,39 @@ <version>1.0.1</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.anomaly-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-anomaly-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.transform-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-transform-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.frequency-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-frequency-asl</artifactId> </dependency> <dependency> <groupId>org.languagetool</groupId> <artifactId>languagetool-core</artifactId> <version>${languagetool.version}</version> + <exclusions> + <exclusion> + <groupId>ch.qos.logback</groupId> + <artifactId>logback-classic</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> <groupId>org.languagetool</groupId> @@ -79,12 +86,16 @@ <version>${languagetool.version}</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -98,8 +109,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -111,9 +122,9 @@ <dependencyManagement> <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <type>pom</type> <scope>import</scope> </dependency> diff --git a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/CjfNormalizer.java b/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/CjfNormalizer.java deleted file mode 100644 index 6dbdcec663..0000000000 --- a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/CjfNormalizer.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; - -import cn.com.cjf.CJFBeanFactory; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformer_ImplBase; - -/** - * Converts traditional Chinese to simplified Chinese or vice-versa. - */ -@ResourceMetaData(name="Chinese Traditional/Simplified Converter") -@LanguageCapability("zh") -public class CjfNormalizer - extends JCasTransformer_ImplBase -{ - public static enum Direction { - TO_TRADITIONAL, - TO_SIMPLIFIED - }; - - public static final String PARAM_DIRECTION = "direction"; - @ConfigurationParameter(name = PARAM_DIRECTION, mandatory=true, defaultValue="TO_SIMPLIFIED") - private Direction direction; - - @Override - public void process(JCas aInput, JCas aOutput) - throws AnalysisEngineProcessException - { - String originalText = aInput.getDocumentText(); - String newText; - - switch (direction) { - case TO_SIMPLIFIED: - newText = CJFBeanFactory.getChineseJF().chineseFan2Jan(originalText); - break; - case TO_TRADITIONAL: - newText = CJFBeanFactory.getChineseJF().chineseJan2Fan(originalText); - break; - default: - throw new IllegalArgumentException("Unknown directon [" + direction + "]"); - } - - aOutput.setDocumentText(newText); - } -} diff --git a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolChecker.java b/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolChecker.java deleted file mode 100644 index 8355a4894b..0000000000 --- a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolChecker.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; - -import java.io.IOException; -import java.net.URL; -import java.util.List; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Level; -import org.languagetool.JLanguageTool; -import org.languagetool.Language; -import org.languagetool.Languages; -import org.languagetool.rules.RuleMatch; - -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; - -/** - * Detect grammatical errors in text using LanguageTool a rule based grammar checker. - * - */ -@ResourceMetaData(name="LanguageTool Grammar Checker") -@LanguageCapability({ "en", "fa", "fr", "de", "pl", "ca", "it", "br", "nl", "pt", "ru", "be", "zh", - "da", "eo", "gl", "el", "is", "ja", "km", "lt", "ml", "ro", "sk", "sl", "es", "sv", "ta", - "tl", "uk" }) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly" }) -public class LanguageToolChecker - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - private String language; - - private ModelProviderBase<JLanguageTool> modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<JLanguageTool>() { - { - setContextObject(LanguageToolChecker.this); - setDefault(LOCATION, NOT_REQUIRED); - - setOverride(LANGUAGE, language); - } - - @Override - protected JLanguageTool produceResource(URL aUrl) - throws IOException - { - Properties props = getAggregatedProperties(); - Language lang = Languages.getLanguageForShortCode(props.getProperty(LANGUAGE)); - - if (lang == null) { - throw new IOException("The language code '" - + props.getProperty(LANGUAGE) + "' is not supported by LanguageTool."); - } - - Language defaultVariant = lang.getDefaultLanguageVariant(); - if (defaultVariant != null) { - getLogger().info( - "Using default variant [" - + defaultVariant.getShortCodeWithCountryAndVariant() - + "] for language [" + props.getProperty(LANGUAGE) + "]"); - lang = defaultVariant; - } - - return new JLanguageTool(lang); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - modelProvider.configure(aJCas.getCas()); - - // get document text - String docText = aJCas.getDocumentText(); - - try { - List<RuleMatch> matches = modelProvider.getResource().check(docText); - for (RuleMatch match : matches) { - // create annotation - GrammarAnomaly annotation = new GrammarAnomaly(aJCas); - annotation.setBegin(match.getFromPos()); - annotation.setEnd(match.getToPos()); - annotation.setDescription(match.getMessage()); - annotation.addToIndexes(); - getContext().getLogger().log(Level.FINEST, "Found: " + annotation); - } - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } -} diff --git a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolLemmatizer.java b/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolLemmatizer.java deleted file mode 100644 index 8cc5d1d097..0000000000 --- a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolLemmatizer.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.fit.util.JCasUtil.toText; - -import java.io.IOException; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.languagetool.AnalyzedSentence; -import org.languagetool.AnalyzedToken; -import org.languagetool.AnalyzedTokenReadings; -import org.languagetool.Language; -import org.languagetool.Languages; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Naive lexicon-based lemmatizer. The words are looked up using the wordform lexicons of - * LanguageTool. Multiple readings are produced. The annotator simply takes the most frequent - * lemma from those readings. If no readings could be found, the original text is assigned as - * lemma. - */ -@ResourceMetaData(name="LanguageTool Lemmatizer") -@LanguageCapability({ "en", "fa", "fr", "de", "pl", "ca", "it", "br", "nl", "pt", "ru", "be", "zh", - "da", "eo", "gl", "el", "is", "ja", "km", "lt", "ml", "ro", "sk", "sl", "es", "sv", "ta", - "tl", "uk" }) -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) -public class LanguageToolLemmatizer - extends JCasAnnotator_ImplBase -{ - public static final String PARAM_SANITIZE = "sanitize"; - @ConfigurationParameter(name=PARAM_SANITIZE, mandatory=true, defaultValue="true") - private boolean sanitize; - - public static final String PARAM_SANTIZE_CHARS = "sanitizeChars"; - @ConfigurationParameter(name = PARAM_SANTIZE_CHARS, mandatory = true, defaultValue = { "(", - ")", "[", "]" }) - private String[] sanitizeChars; - - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - mappingProvider = new MappingProvider(); - mappingProvider.setDefault(MappingProvider.VARIANT, "default"); - mappingProvider.setDefaultVariantsLocation( - "de/tudarmstadt/ukp/dkpro/core/languagetool/lib/language-tagset.map"); - mappingProvider.setDefault(MappingProvider.LOCATION, - "classpath:/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/${language}-${variant}.map"); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - mappingProvider.configure(aJCas.getCas()); - - try { - Language lang = Languages.getLanguageForShortCode(aJCas.getDocumentLanguage()); - Language defaultVariant = lang.getDefaultLanguageVariant(); - if (defaultVariant != null) { - getLogger().info( - "Using default variant [" - + defaultVariant.getShortCodeWithCountryAndVariant() - + "] for language [" + aJCas.getDocumentLanguage() + "]"); - lang = defaultVariant; - } - - for (Sentence s : select(aJCas, Sentence.class)) { - // Get the tokens from the sentence - List<Token> tokens = selectCovered(Token.class, s); - List<String> tokenText = toText(tokens); - - // Let LanguageTool analyze the tokens - List<AnalyzedTokenReadings> rawTaggedTokens = lang.getTagger().tag(tokenText); - AnalyzedSentence as = new AnalyzedSentence( - rawTaggedTokens.toArray(new AnalyzedTokenReadings[rawTaggedTokens.size()])); - as = lang.getDisambiguator().disambiguate(as); - - for (int i = 0; i < tokens.size(); i++) { - Token token = tokens.get(i); - - String l = null; - - // Try using the POS to disambiguate the lemma - if (token.getPos() != null) { - l = getByPos(token.getPos(), as.getTokens()[i]); - } - - // Get the most frequent lemma - if (l == null) { - l = getMostFrequentLemma(as.getTokens()[i]); - } - - // Sanitize if we have a lemma by now - if (sanitize && l != null) { - l = sanitizeLemma(token.getText(), l); - } - - if (l == null) { - l = token.getText(); - } - - // Create the annotation - Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(l); - lemma.addToIndexes(); - token.setLemma(lemma); - } - } - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - - private String getByPos(POS aPos, AnalyzedTokenReadings aReadings) - { - String tag = aPos.getPosValue(); - //System.out.printf("%s %n", tag); - for (AnalyzedToken t : aReadings.getReadings()) { - //System.out.printf("-- %s %s ", t.getPOSTag(), t.getLemma()); - - if (t.getPOSTag() == null) { - return null; - } - - // Lets see if we have mapped tagsets - try { - String typeName = mappingProvider.getTagType(t.getPOSTag()).getName(); - if (aPos.getClass().getName().equals(typeName)) { - //System.out.printf("- mapped match%n"); - return t.getLemma(); - } - } - catch (IllegalStateException e) { - // Type could not be looked up. Go on with other types of matching - } - - // Full match... feeling lucky ;) This is quite unlikely to happen because the tagset - // used by LanguageTool is most certainly different from tagset used by POS tagger. - if (tag.equals(t.getPOSTag())) { - //System.out.printf("- full match%n"); - return t.getLemma(); - } - - // Some tagsets used by LanguageTool use ':' as separator. If we are lucky, the string - // before the first ':' matches our POS tag. - - if (t.getPOSTag().length() > 1 && tag.equals(t.getPOSTag().split(":")[0])) { - //System.out.printf("- first element match%n"); - return t.getLemma(); - } - - //System.out.printf("- no match%n"); - } - - //System.out.printf("- no reading matches%n"); - return null; - } - - private String getMostFrequentLemma(AnalyzedTokenReadings aReadings) - { - FrequencyDistribution<String> freq = new FrequencyDistribution<String>(); - for (AnalyzedToken t : aReadings.getReadings()) { - if (t.getLemma() != null) { - freq.inc(t.getLemma()); - } - } - - String best = null; - for (String l : freq.getKeys()) { - if (best == null) { - best = l; - } - else if (freq.getCount(best) < freq.getCount(l)) { - best = l; - } - } - - return best; - } - - private String sanitizeLemma(String aWordForm, String aLemma) - { - String sanitized = aLemma; - for (String c : sanitizeChars) { - if (!aWordForm.contains(c)) { - sanitized = sanitized.replace(c, ""); - } - } - return sanitized; - } -} diff --git a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolSegmenter.java b/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolSegmenter.java deleted file mode 100644 index 9d2e92ee5e..0000000000 --- a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolSegmenter.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; - -import java.util.List; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.languagetool.Language; -import org.languagetool.Languages; - -import cn.com.cjf.CJFBeanFactory; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; - -/** - * Segmenter using LanguageTool to do the heavy lifting. LanguageTool internally uses different - * strategies for tokenization. - */ -@ResourceMetaData(name="LanguageTool Segmenter") -@LanguageCapability({ "en", "fa", "fr", "de", "pl", "ca", "it", "br", "nl", "pt", "ru", "be", "zh", - "da", "eo", "gl", "el", "is", "ja", "km", "lt", "ml", "ro", "sk", "sl", "es", "sv", "ta", - "tl", "uk" }) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class LanguageToolSegmenter extends SegmenterBase -{ - @Override - protected void process(JCas aJCas, String aText, int aZoneBegin) - throws AnalysisEngineProcessException - { - Language lang = Languages.getLanguageForShortCode(getLanguage(aJCas)); - Language defaultVariant = lang.getDefaultLanguageVariant(); - if (defaultVariant != null) { - getLogger().debug( - "Using default variant [" + defaultVariant.getShortCodeWithCountryAndVariant() - + "] for language [" + getLanguage(aJCas) + "]"); - lang = defaultVariant; - } - - List<String> sentences = lang.getSentenceTokenizer().tokenize(aText); - - int lastSStart = 0; - for (String s : sentences) { - int sStart = aText.indexOf(s, lastSStart); - int sEnd = sStart + s.length(); - lastSStart = sEnd; - - sStart += aZoneBegin; - sEnd += aZoneBegin; - - createSentence(aJCas, sStart, sEnd); - - List<String> tokens = lang.getWordTokenizer().tokenize(s); - int lastTStart = 0; - for (String t : tokens) { - int tStart = s.indexOf(t, lastTStart); - - // The Chinese tokenizer adds some /xxx suffixes, try to remove that - if ("zh".equals(getLanguage(aJCas)) && tStart == -1) { - int suffix = t.indexOf('/'); - if (suffix != -1) { - t = t.substring(0, suffix); - } - tStart = s.indexOf(t, lastTStart); - - } - - // The Chinese tokenizer normalizes from traditional to simplified Chinese. - // Maybe we have to undo this transformation. - if ("zh".equals(getLanguage(aJCas)) && tStart == -1) { - String trad = CJFBeanFactory.getChineseJF().chineseJan2Fan(t); - tStart = s.indexOf(trad, lastTStart); - } - - if (tStart == -1) { - throw new IllegalStateException("Token [" + t + "] not found in sentence [" + s - + "]"); - } - - int tEnd = tStart + t.length(); - lastTStart = tEnd; - - tStart += sStart; - tEnd += sStart; - - createToken(aJCas, tStart, tEnd); - } - } - } -} diff --git a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/package-info.java b/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/package-info.java deleted file mode 100644 index 122b51e01a..0000000000 --- a/dkpro-core-languagetool-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/languagetool/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Grammar and style checker based on <a href="http://www.languagetool.org/">LanguageTool</a>. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; diff --git a/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/CjfNormalizer.java b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/CjfNormalizer.java new file mode 100644 index 0000000000..1468dc82c5 --- /dev/null +++ b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/CjfNormalizer.java @@ -0,0 +1,75 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.languagetool; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.JCasTransformer_ImplBase; + +import cn.com.cjf.CJFBeanFactory; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Converts traditional Chinese to simplified Chinese or vice-versa. + */ +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Chinese Traditional/Simplified Converter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("zh") +public class CjfNormalizer + extends JCasTransformer_ImplBase +{ + public static enum Direction { + TO_TRADITIONAL, + TO_SIMPLIFIED + }; + + /** + * Direction in which to perform the conversion ({@link Direction#TO_TRADITIONAL} or + * {@link Direction#TO_SIMPLIFIED}); + */ + public static final String PARAM_DIRECTION = "direction"; + @ConfigurationParameter(name = PARAM_DIRECTION, mandatory = true, defaultValue = "TO_SIMPLIFIED") + private Direction direction; + + @Override + public void process(JCas aInput, JCas aOutput) + throws AnalysisEngineProcessException + { + String originalText = aInput.getDocumentText(); + String newText; + + switch (direction) { + case TO_SIMPLIFIED: + newText = CJFBeanFactory.getChineseJF().chineseFan2Jan(originalText); + break; + case TO_TRADITIONAL: + newText = CJFBeanFactory.getChineseJF().chineseJan2Fan(originalText); + break; + default: + throw new IllegalArgumentException("Unknown directon [" + direction + "]"); + } + + aOutput.setDocumentText(newText); + } +} diff --git a/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolChecker.java b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolChecker.java new file mode 100644 index 0000000000..551179fe20 --- /dev/null +++ b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolChecker.java @@ -0,0 +1,143 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.languagetool; + +import static org.apache.uima.fit.util.FSCollectionFactory.createFSArray; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Level; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.languagetool.JLanguageTool; +import org.languagetool.Language; +import org.languagetool.Languages; +import org.languagetool.rules.RuleMatch; + +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly; +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Detect grammatical errors in text using LanguageTool a rule based grammar checker. + */ +@Component(OperationType.GRAMMAR_CHECKER) +@ResourceMetaData(name = "LanguageTool Grammar Checker") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability({ "en", "fa", "fr", "de", "pl", "ca", "it", "br", "nl", "pt", "ru", "be", "zh", + "da", "eo", "gl", "el", "is", "ja", "km", "lt", "ml", "ro", "sk", "sl", "es", "sv", "ta", + "tl", "uk" }) +@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly" }) +public class LanguageToolChecker + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + private ModelProviderBase<JLanguageTool> modelProvider; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<JLanguageTool>() + { + { + setContextObject(LanguageToolChecker.this); + setDefault(LOCATION, NOT_REQUIRED); + + setOverride(LANGUAGE, language); + } + + @Override + protected JLanguageTool produceResource(URL aUrl) throws IOException + { + Properties props = getAggregatedProperties(); + Language lang = Languages.getLanguageForShortCode(props.getProperty(LANGUAGE)); + + if (lang == null) { + throw new IOException("The language code '" + props.getProperty(LANGUAGE) + + "' is not supported by LanguageTool."); + } + + Language defaultVariant = lang.getDefaultLanguageVariant(); + if (defaultVariant != null) { + getLogger().info("Using default variant [" + + defaultVariant.getShortCodeWithCountryAndVariant() + + "] for language [" + props.getProperty(LANGUAGE) + "]"); + lang = defaultVariant; + } + + return new JLanguageTool(lang); + } + }; + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + modelProvider.configure(aJCas.getCas()); + + // get document text + String docText = aJCas.getDocumentText(); + + try { + List<RuleMatch> matches = modelProvider.getResource().check(docText); + for (RuleMatch match : matches) { + // create annotation + GrammarAnomaly annotation = new GrammarAnomaly(aJCas); + annotation.setBegin(match.getFromPos()); + annotation.setEnd(match.getToPos()); + annotation.setDescription(match.getMessage()); + List<SuggestedAction> suggestions = new ArrayList<>(); + for (String replacement : match.getSuggestedReplacements()) { + SuggestedAction action = new SuggestedAction(aJCas, annotation.getBegin(), + annotation.getEnd()); + action.setReplacement(replacement); + suggestions.add(action); + } + annotation.setSuggestions(createFSArray(aJCas, suggestions)); + annotation.addToIndexes(); + getContext().getLogger().log(Level.FINEST, "Found: " + annotation); + } + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolLemmatizer.java b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolLemmatizer.java new file mode 100644 index 0000000000..009df24986 --- /dev/null +++ b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolLemmatizer.java @@ -0,0 +1,246 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.languagetool; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.fit.util.JCasUtil.toText; + +import java.io.IOException; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.frequency.util.FrequencyDistribution; +import org.dkpro.core.api.resources.MappingProvider; +import org.languagetool.AnalyzedSentence; +import org.languagetool.AnalyzedToken; +import org.languagetool.AnalyzedTokenReadings; +import org.languagetool.Language; +import org.languagetool.Languages; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Naive lexicon-based lemmatizer. The words are looked up using the wordform lexicons of + * LanguageTool. Multiple readings are produced. The annotator simply takes the most frequent + * lemma from those readings. If no readings could be found, the original text is assigned as + * lemma. + */ +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "LanguageTool Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability({ "en", "fa", "fr", "de", "pl", "ca", "it", "br", "nl", "pt", "ru", "be", "zh", + "da", "eo", "gl", "el", "is", "ja", "km", "lt", "ml", "ro", "sk", "sl", "es", "sv", "ta", + "tl", "uk" }) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) +public class LanguageToolLemmatizer + extends JCasAnnotator_ImplBase +{ + /** + * Remove characters specified in {@link #PARAM_SANTIZE_CHARS} from lemmas. + */ + public static final String PARAM_SANITIZE = "sanitize"; + @ConfigurationParameter(name = PARAM_SANITIZE, mandatory = true, defaultValue = "true") + private boolean sanitize; + + /** + * Characters to remove from lemmas if {@link #PARAM_SANITIZE} is enabled. + */ + public static final String PARAM_SANTIZE_CHARS = "sanitizeChars"; + @ConfigurationParameter(name = PARAM_SANTIZE_CHARS, mandatory = true, defaultValue = { "(", + ")", "[", "]" }) + private String[] sanitizeChars; + + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + mappingProvider = new MappingProvider(); + mappingProvider.setContextObject(this); + mappingProvider.setDefault(MappingProvider.VARIANT, "default"); + mappingProvider.setDefaultVariantsLocation("${package}/lib/language-tagset.map"); + mappingProvider.setDefault(MappingProvider.LOCATION, + "classpath:/org/dkpro/core/api/lexmorph/tagset/${language}-${variant}.map"); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + mappingProvider.configure(aJCas.getCas()); + + try { + Language lang = Languages.getLanguageForShortCode(aJCas.getDocumentLanguage()); + Language defaultVariant = lang.getDefaultLanguageVariant(); + if (defaultVariant != null) { + getLogger().info( + "Using default variant [" + + defaultVariant.getShortCodeWithCountryAndVariant() + + "] for language [" + aJCas.getDocumentLanguage() + "]"); + lang = defaultVariant; + } + + for (Sentence s : select(aJCas, Sentence.class)) { + // Get the tokens from the sentence + List<Token> tokens = selectCovered(Token.class, s); + List<String> tokenText = toText(tokens); + + // Let LanguageTool analyze the tokens + List<AnalyzedTokenReadings> rawTaggedTokens = lang.getTagger().tag(tokenText); + AnalyzedSentence as = new AnalyzedSentence( + rawTaggedTokens.toArray(new AnalyzedTokenReadings[rawTaggedTokens.size()])); + as = lang.getDisambiguator().disambiguate(as); + + for (int i = 0; i < tokens.size(); i++) { + Token token = tokens.get(i); + + String l = null; + + // Try using the POS to disambiguate the lemma + if (token.getPos() != null) { + l = getByPos(token.getPos(), as.getTokens()[i]); + } + + // Get the most frequent lemma + if (l == null) { + l = getMostFrequentLemma(as.getTokens()[i]); + } + + // Sanitize if we have a lemma by now + if (sanitize && l != null) { + l = sanitizeLemma(token.getText(), l); + } + + if (l == null) { + l = token.getText(); + } + + // Create the annotation + Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); + lemma.setValue(l); + lemma.addToIndexes(); + token.setLemma(lemma); + } + } + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + private String getByPos(POS aPos, AnalyzedTokenReadings aReadings) + { + String tag = aPos.getPosValue(); + //System.out.printf("%s %n", tag); + for (AnalyzedToken t : aReadings.getReadings()) { + //System.out.printf("-- %s %s ", t.getPOSTag(), t.getLemma()); + + if (t.getPOSTag() == null) { + return null; + } + + // Lets see if we have mapped tagsets + try { + String typeName = mappingProvider.getTagType(t.getPOSTag()).getName(); + if (aPos.getClass().getName().equals(typeName)) { + //System.out.printf("- mapped match%n"); + return t.getLemma(); + } + } + catch (IllegalStateException e) { + // Type could not be looked up. Go on with other types of matching + } + + // Full match... feeling lucky ;) This is quite unlikely to happen because the tagset + // used by LanguageTool is most certainly different from tagset used by POS tagger. + if (tag.equals(t.getPOSTag())) { + //System.out.printf("- full match%n"); + return t.getLemma(); + } + + // Some tagsets used by LanguageTool use ':' as separator. If we are lucky, the string + // before the first ':' matches our POS tag. + + if (t.getPOSTag().length() > 1 && tag.equals(t.getPOSTag().split(":")[0])) { + //System.out.printf("- first element match%n"); + return t.getLemma(); + } + + //System.out.printf("- no match%n"); + } + + //System.out.printf("- no reading matches%n"); + return null; + } + + private String getMostFrequentLemma(AnalyzedTokenReadings aReadings) + { + FrequencyDistribution<String> freq = new FrequencyDistribution<String>(); + for (AnalyzedToken t : aReadings.getReadings()) { + if (t.getLemma() != null) { + freq.inc(t.getLemma()); + } + } + + String best = null; + for (String l : freq.getKeys()) { + if (best == null) { + best = l; + } + else if (freq.getCount(best) < freq.getCount(l)) { + best = l; + } + } + + return best; + } + + private String sanitizeLemma(String aWordForm, String aLemma) + { + String sanitized = aLemma; + for (String c : sanitizeChars) { + if (!aWordForm.contains(c)) { + sanitized = sanitized.replace(c, ""); + } + } + return sanitized; + } +} diff --git a/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolSegmenter.java b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolSegmenter.java new file mode 100644 index 0000000000..bf40380d75 --- /dev/null +++ b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/LanguageToolSegmenter.java @@ -0,0 +1,112 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.languagetool; + +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.segmentation.SegmenterBase; +import org.languagetool.Language; +import org.languagetool.Languages; + +import cn.com.cjf.CJFBeanFactory; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Segmenter using LanguageTool to do the heavy lifting. LanguageTool internally uses different + * strategies for tokenization. + */ +@ResourceMetaData(name = "LanguageTool Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability({ "en", "fa", "fr", "de", "pl", "ca", "it", "br", "nl", "pt", "ru", "be", "zh", + "da", "eo", "gl", "el", "is", "ja", "km", "lt", "ml", "ro", "sk", "sl", "es", "sv", "ta", + "tl", "uk" }) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) +public class LanguageToolSegmenter extends SegmenterBase +{ + @Override + protected void process(JCas aJCas, String aText, int aZoneBegin) + throws AnalysisEngineProcessException + { + Language lang = Languages.getLanguageForShortCode(getLanguage(aJCas)); + Language defaultVariant = lang.getDefaultLanguageVariant(); + if (defaultVariant != null) { + getLogger().debug( + "Using default variant [" + defaultVariant.getShortCodeWithCountryAndVariant() + + "] for language [" + getLanguage(aJCas) + "]"); + lang = defaultVariant; + } + + List<String> sentences = lang.getSentenceTokenizer().tokenize(aText); + + int lastSStart = 0; + for (String s : sentences) { + int sStart = aText.indexOf(s, lastSStart); + int sEnd = sStart + s.length(); + lastSStart = sEnd; + + sStart += aZoneBegin; + sEnd += aZoneBegin; + + createSentence(aJCas, sStart, sEnd); + + List<String> tokens = lang.getWordTokenizer().tokenize(s); + int lastTStart = 0; + for (String t : tokens) { + int tStart = s.indexOf(t, lastTStart); + + // The Chinese tokenizer adds some /xxx suffixes, try to remove that + if ("zh".equals(getLanguage(aJCas)) && tStart == -1) { + int suffix = t.indexOf('/'); + if (suffix != -1) { + t = t.substring(0, suffix); + } + tStart = s.indexOf(t, lastTStart); + + } + + // The Chinese tokenizer normalizes from traditional to simplified Chinese. + // Maybe we have to undo this transformation. + if ("zh".equals(getLanguage(aJCas)) && tStart == -1) { + String trad = CJFBeanFactory.getChineseJF().chineseJan2Fan(t); + tStart = s.indexOf(trad, lastTStart); + } + + if (tStart == -1) { + throw new IllegalStateException("Token [" + t + "] not found in sentence [" + s + + "]"); + } + + int tEnd = tStart + t.length(); + lastTStart = tEnd; + + tStart += sStart; + tEnd += sStart; + + createToken(aJCas, tStart, tEnd); + } + } + } +} diff --git a/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/package-info.java b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/package-info.java new file mode 100644 index 0000000000..e5833e36b1 --- /dev/null +++ b/dkpro-core-languagetool-asl/src/main/java/org/dkpro/core/languagetool/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Grammar and style checker based on <a href="http://www.languagetool.org/">LanguageTool</a>. + * + * @since 1.1.0 + */ +package org.dkpro.core.languagetool; diff --git a/dkpro-core-languagetool-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/languagetool/lib/language-tagset.map b/dkpro-core-languagetool-asl/src/main/resources/org/dkpro/core/languagetool/lib/language-tagset.map similarity index 100% rename from dkpro-core-languagetool-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/languagetool/lib/language-tagset.map rename to dkpro-core-languagetool-asl/src/main/resources/org/dkpro/core/languagetool/lib/language-tagset.map diff --git a/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolCheckerTest.java b/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolCheckerTest.java deleted file mode 100644 index 9cfe4741b8..0000000000 --- a/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolCheckerTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.fit.testing.factory.TokenBuilder; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class LanguageToolCheckerTest -{ - @Test - public void grammarCheckerTest() - throws Exception - { - String testDocument = "A sentence with a error in the Hitchhiker's Guide tot he Galaxy ."; - - AnalysisEngine engine = createEngine(LanguageToolChecker.class, - LanguageToolChecker.PARAM_LANGUAGE, "en"); - JCas aJCas = engine.newJCas(); - - TokenBuilder<Token, Sentence> tb = new TokenBuilder<Token, Sentence>(Token.class, Sentence.class); - tb.buildTokens(aJCas, testDocument); - - engine.process(aJCas); - - // copy input match type annotations to an array - int count = 0; - for (GrammarAnomaly ga : select(aJCas, GrammarAnomaly.class)) { - System.out.println("Error " + (count + 1) + " (" + ga.getBegin() + ", " + ga.getEnd() - + "):" + ga.getDescription()); - count++; - } - assertEquals(count, 3); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolLemmatizerTest.java b/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolLemmatizerTest.java deleted file mode 100644 index e810de8d3c..0000000000 --- a/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolLemmatizerTest.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class LanguageToolLemmatizerTest -{ - @Test - public void testGerman() - throws Exception - { - runTest("de", "Das ist ein Test .", - new String[] { "der", "sein", "ein", "Test", "." }); - - runTest("de", "besitzt", - new String[] { "besitzen" }); - } - - @Test - public void testGerman2() - throws Exception - { - JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches " - + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); - - String[] lemmas = new String[] { "ich", "brauchen", "ein", "sehr", "kompliziert", - "Beispiel", ",", "welch", "möglichst", "viel", "Konstituente", "und", - "Dependenzen", "beinhalten", "." }; - - AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); - } - - @Test - public void testEnglish() - throws Exception - { - runTest("en", "This is a test .", - new String[] { "this", "be", "a", "test", "." }); - - runTest("en", "A neural net .", - new String[] { "a", "neural", "net", "." }); - - runTest("en", "John is purchasing oranges .", - new String[] { "John", "be", "purchase", "orange", "." }); - } - - private JCas runTest(String aLanguage, String aText) - throws Exception - { - AnalysisEngineDescription lemma = createEngineDescription(LanguageToolLemmatizer.class); - - return TestRunner.runTest(lemma, aLanguage, aText); - } - - private void runTest(String language, String testDocument, String[] aLemma) - throws Exception - { - AnalysisEngineDescription engine = createEngineDescription( - createEngineDescription(OpenNlpPosTagger.class), - createEngineDescription(LanguageToolLemmatizer.class)); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertLemma(aLemma, select(jcas, Lemma.class)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/CjfNormalizerTest.java b/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/CjfNormalizerTest.java similarity index 85% rename from dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/CjfNormalizerTest.java rename to dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/CjfNormalizerTest.java index d57e0e5bcd..d10d43d407 100644 --- a/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/CjfNormalizerTest.java +++ b/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/CjfNormalizerTest.java @@ -15,18 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; +package org.dkpro.core.languagetool; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.languagetool.CjfNormalizer.Direction; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.languagetool.CjfNormalizer.Direction; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class CjfNormalizerTest { @Test diff --git a/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolCheckerTest.java b/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolCheckerTest.java new file mode 100644 index 0000000000..5bb905e3c8 --- /dev/null +++ b/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolCheckerTest.java @@ -0,0 +1,99 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.languagetool; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly; +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class LanguageToolCheckerTest +{ + @Test + public void grammarCheckerTest() throws Exception + { + String testDocument = "A sentence with a error in the Hitchhiker's Guide tot he Galaxy ."; + + AnalysisEngine engine = createEngine(LanguageToolChecker.class, + LanguageToolChecker.PARAM_LANGUAGE, "en"); + JCas aJCas = engine.newJCas(); + + TokenBuilder<Token, Sentence> tb = new TokenBuilder<>(Token.class, Sentence.class); + tb.buildTokens(aJCas, testDocument); + + engine.process(aJCas); + + // copy input match type annotations to an array + int count = 0; + for (GrammarAnomaly ga : select(aJCas, GrammarAnomaly.class)) { + System.out.println("Error " + (count + 1) + " (" + ga.getBegin() + ", " + ga.getEnd() + + "):" + ga.getDescription()); + for (SuggestedAction action : JCasUtil.select(ga.getSuggestions(), + SuggestedAction.class)) { + System.out.printf("-> %s (score %f)%n", action.getReplacement(), + action.getCertainty()); + } + count++; + } + assertEquals(count, 3); + } + + @Test + public void grammarCheckerTestFrench() throws Exception + { + String testDocument = "comment modifer un compte"; + + AnalysisEngine engine = createEngine(LanguageToolChecker.class, + LanguageToolChecker.PARAM_LANGUAGE, "fr"); + JCas aJCas = engine.newJCas(); + + TokenBuilder<Token, Sentence> tb = new TokenBuilder<>(Token.class, Sentence.class); + tb.buildTokens(aJCas, testDocument); + + engine.process(aJCas); + + // copy input match type annotations to an array + int count = 0; + for (GrammarAnomaly ga : select(aJCas, GrammarAnomaly.class)) { + System.out.println("Error " + (count + 1) + " (" + ga.getBegin() + ", " + ga.getEnd() + + "):" + ga.getDescription()); + for (SuggestedAction action : JCasUtil.select(ga.getSuggestions(), + SuggestedAction.class)) { + System.out.printf("-> %s (score %f)%n", action.getReplacement(), + action.getCertainty()); + } + count++; + } + assertEquals(count, 2); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolLemmatizerTest.java b/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolLemmatizerTest.java new file mode 100644 index 0000000000..add246c005 --- /dev/null +++ b/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolLemmatizerTest.java @@ -0,0 +1,97 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.languagetool; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; + +public class LanguageToolLemmatizerTest +{ + @Test + public void testGerman() + throws Exception + { + runTest("de", "Das ist ein Test .", + new String[] { "der", "sein", "ein", "Test", "." }); + + runTest("de", "besitzt", + new String[] { "besitzen" }); + } + + @Test + public void testGerman2() + throws Exception + { + JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches " + + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); + + String[] lemmas = new String[] { "ich", "brauchen", "ein", "sehr", "kompliziert", + "Beispiel", ",", "welch", "möglichst", "viel", "Konstituente", "und", + "Dependenzen", "beinhalten", "." }; + + AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); + } + + @Test + public void testEnglish() + throws Exception + { + runTest("en", "This is a test .", + new String[] { "this", "be", "a", "test", "." }); + + runTest("en", "A neural net .", + new String[] { "a", "neural", "net", "." }); + + runTest("en", "John is purchasing oranges .", + new String[] { "John", "be", "purchase", "orange", "." }); + } + + private JCas runTest(String aLanguage, String aText) + throws Exception + { + AnalysisEngineDescription lemma = createEngineDescription(LanguageToolLemmatizer.class); + + return TestRunner.runTest(lemma, aLanguage, aText); + } + + private void runTest(String language, String testDocument, String[] aLemma) + throws Exception + { + AnalysisEngineDescription engine = createEngineDescription( + createEngineDescription(OpenNlpPosTagger.class), + createEngineDescription(LanguageToolLemmatizer.class)); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertLemma(aLemma, select(jcas, Lemma.class)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolSegmenterTest.java b/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolSegmenterTest.java similarity index 92% rename from dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolSegmenterTest.java rename to dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolSegmenterTest.java index 199024be3a..cb188f68ea 100644 --- a/dkpro-core-languagetool-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/languagetool/LanguageToolSegmenterTest.java +++ b/dkpro-core-languagetool-asl/src/test/java/org/dkpro/core/languagetool/LanguageToolSegmenterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.languagetool; +package org.dkpro.core.languagetool; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; @@ -28,6 +28,9 @@ import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; @@ -35,9 +38,6 @@ import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; public class LanguageToolSegmenterTest { @@ -100,7 +100,7 @@ public void testTraditionalChinese() throws Exception AnalysisEngine aed = createEngine(LanguageToolSegmenter.class); aed.process(jcas); - String[] tokens = { "毛澤東", "住", "在", "北京" }; + String[] tokens = { "毛", "澤東", "住", "在", "北京" }; AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); } diff --git a/dkpro-core-languagetool-asl/src/test/resources/log4j.properties b/dkpro-core-languagetool-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-languagetool-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-languagetool-asl/src/test/resources/log4j2.xml b/dkpro-core-languagetool-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-languagetool-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-lbj-asl/pom.xml b/dkpro-core-lbj-asl/pom.xml index d0c8475797..0bcbff8c50 100644 --- a/dkpro-core-lbj-asl/pom.xml +++ b/dkpro-core-lbj-asl/pom.xml @@ -15,21 +15,23 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> <artifactId>de.tudarmstadt.ukp.dkpro.core.lbj-asl</artifactId> <packaging>jar</packaging> - <name>DKPro Core ASL - Illinois Cognitive Computation Group NLP (v ${illinois-cogcomp-nlp.version})</name> + <name>DKPro Core ASL - Illinois Cognitive Computation Group NLP (v ${illinois-cogcomp-nlp.version}) (academic use)</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> - <illinois-cogcomp-nlp.version>3.0.72</illinois-cogcomp-nlp.version> - <maven.surefire.heap>4g</maven.surefire.heap> + <illinois-cogcomp-nlp.version>4.0.7</illinois-cogcomp-nlp.version> + <maven.surefire.heap>6g</maven.surefire.heap> </properties> <dependencies> <dependency> @@ -44,6 +46,10 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + </dependency> <dependency> <groupId>edu.illinois.cs.cogcomp</groupId> <artifactId>illinois-pos</artifactId> @@ -52,7 +58,7 @@ <dependency> <groupId>edu.illinois.cs.cogcomp</groupId> <artifactId>LBJava</artifactId> - <version>1.2.24</version> + <version>1.3.0</version> <exclusions> <exclusion> <artifactId>weka-stable</artifactId> @@ -74,6 +80,12 @@ <groupId>edu.illinois.cs.cogcomp</groupId> <artifactId>illinois-ner</artifactId> <version>${illinois-cogcomp-nlp.version}</version> + <exclusions> + <exclusion> + <groupId>edu.stanford.nlp</groupId> + <artifactId>stanford-corenlp</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> <groupId>edu.illinois.cs.cogcomp</groupId> @@ -89,15 +101,12 @@ <artifactId>stanford-corenlp</artifactId> <groupId>edu.stanford.nlp</groupId> </exclusion> + <exclusion> + <groupId>mysql</groupId> + <artifactId>mysql-connector-java</artifactId> + </exclusion> </exclusions> </dependency> - <!-- - <dependency> - <groupId>edu.illinois.cs.cogcomp</groupId> - <artifactId>illinois-srl</artifactId> - <version>5.1.11</version> - </dependency> - --> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> @@ -126,6 +135,10 @@ <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> @@ -136,10 +149,5 @@ <artifactId>junit</artifactId> <scope>test</scope> </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - <scope>test</scope> - </dependency> </dependencies> </project> \ No newline at end of file diff --git a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisStatefulSegmenter.java b/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisStatefulSegmenter.java deleted file mode 100644 index a4d8acccf7..0000000000 --- a/dkpro-core-lbj-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lbj/IllinoisStatefulSegmenter.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.lbj; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; -import edu.illinois.cs.cogcomp.core.datastructures.IntPair; -import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer; -import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer; -import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization; - -/** - * Illinois stateful segmenter. - */ -@ResourceMetaData(name="Illinois CCG Stateful Segmenter") -@TypeCapability(outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) -public class IllinoisStatefulSegmenter - extends SegmenterBase -{ - private Tokenizer tokenizer; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - tokenizer = new StatefulTokenizer(); - } - - @Override - protected void process(JCas aJCas, String text, int zoneBegin) - throws AnalysisEngineProcessException - { - Tokenization tokens = tokenizer.tokenizeTextSpan(text); - - IntPair[] ts = tokens.getCharacterOffsets(); - for (IntPair t : ts) { - createToken(aJCas, t.getFirst() + zoneBegin, t.getSecond() + zoneBegin); - } - - int lastBegin = 0; - for (int i : tokens.getSentenceEndTokenIndexes()) { - createSentence(aJCas, ts[lastBegin].getFirst() + zoneBegin, ts[i-1].getSecond() + zoneBegin); - lastBegin = i; - } - - tokens.getSentenceEndTokenIndexes(); - - -// -// for (Paragraph paragraph : paragraphs) { -// if (writeParagraph) { -// Annotation p = new de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph( -// aJCas, paragraph.getStartIndex(), paragraph.getEndIndex()); -// p.addToIndexes(); -// } -// -// for (TextUnit tu : paragraph.getTextUnits()) { -// if (isWriteSentence()) { -// createSentence(aJCas, tu.getStartIndex(), tu.getEndIndex()); -// } -// -// for (Token t : tu.getTokens()) { -// if (isWriteToken()) { -// createToken(aJCas, t.getStartIndex(), t.getEndIndex()); -// } -// } -// } -// } - } -} diff --git a/dkpro-core-ldweb1t-asl/pom.xml b/dkpro-core-ldweb1t-asl/pom.xml index e330594390..8a69dd6705 100644 --- a/dkpro-core-ldweb1t-asl/pom.xml +++ b/dkpro-core-ldweb1t-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.ldweb1t-asl</artifactId> + <artifactId>dkpro-core-ldweb1t-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Language Detection using n-grams</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -40,24 +41,28 @@ <artifactId>commons-lang3</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.ngrams-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-ngrams-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.frequency-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-frequency-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.frequency-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-frequency-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -65,8 +70,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.tokit-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-tokit-asl</artifactId> <scope>test</scope> </dependency> </dependencies> @@ -83,11 +88,26 @@ - do require it as a compile dependency and also at runtime, so we - cannot set it to scope provided. Need to tell Maven to ignore it here. --> - <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</usedDependency> + <usedDependency>org.dkpro.core:dkpro-core-api-parameter-asl</usedDependency> </usedDependencies> </configuration> </plugin> </plugins> </pluginManagement> + <plugins> + <plugin> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-maven-plugin</artifactId> + <configuration> + <!-- + The following components must be configured via external resources which is not + possible on the OpenMinTeD platform. + --> + <uimaDescriptorExcludes> + <exclude>**/LanguageDetectorWeb1T.xml</exclude> + </uimaDescriptorExcludes> + </configuration> + </plugin> + </plugins> </build> </project> \ No newline at end of file diff --git a/dkpro-core-ldweb1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ldweb1t/LanguageDetectorWeb1T.java b/dkpro-core-ldweb1t-asl/src/main/java/org/dkpro/core/ldweb1t/LanguageDetectorWeb1T.java similarity index 87% rename from dkpro-core-ldweb1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ldweb1t/LanguageDetectorWeb1T.java rename to dkpro-core-ldweb1t-asl/src/main/java/org/dkpro/core/ldweb1t/LanguageDetectorWeb1T.java index d30da5caea..5a34643c93 100644 --- a/dkpro-core-ldweb1t-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ldweb1t/LanguageDetectorWeb1T.java +++ b/dkpro-core-ldweb1t-asl/src/main/java/org/dkpro/core/ldweb1t/LanguageDetectorWeb1T.java @@ -1,185 +1,191 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ldweb1t; - -import static de.tudarmstadt.ukp.dkpro.core.frequency.Web1TProviderBase.BOS; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ExternalResource; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringIterable; - -/** - * Language detector based on n-gram frequency counts, e.g. as provided by Web1T - */ -@ResourceMetaData(name="Web1T Language Detector") -public class LanguageDetectorWeb1T - extends JCasAnnotator_ImplBase -{ - /** - * An array of external resources of frequency providers (one for each language that should be detected). - */ - public static final String PARAM_FREQUENCY_PROVIDER_RESOURCES = "frequencyProviders"; - @ExternalResource(key = PARAM_FREQUENCY_PROVIDER_RESOURCES, mandatory = true) - private FrequencyCountProvider[] frequencyProviders; - - /** - * The minimum n-gram size that should be considered. Default is 1. - */ - public static final String PARAM_MIN_NGRAM_SIZE = "minNGramSize"; - @ConfigurationParameter(name = PARAM_MIN_NGRAM_SIZE, mandatory = true, defaultValue = "1") - private int minNGramSize; - - /** - * The maximum n-gram size that should be considered. Default is 3. - */ - public static final String PARAM_MAX_NGRAM_SIZE = "maxNGramSize"; - @ConfigurationParameter(name = PARAM_MAX_NGRAM_SIZE, mandatory = true, defaultValue = "3") - private int maxNGramSize; - - private Map<String,FrequencyCountProvider> providerMap; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - providerMap = new HashMap<String,FrequencyCountProvider>(); - - for (FrequencyCountProvider provider : frequencyProviders) { - try { - providerMap.put(provider.getLanguage(), provider); - } - catch (Exception e) { - throw new ResourceInitializationException(e); - } - } - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - - List<String> words = JCasUtil.toText(JCasUtil.select(jcas, Token.class)); - - if (words.size() < 1) { - return; - } - - List<String> ngrams = new ArrayList<String>(); - if (words.size() > 1) { - ngrams.add(getNgram(BOS, words.get(0), words.get(1))); - } - - for (String ngram : new NGramStringIterable(words, 1, 3)) { - ngrams.add(ngram); - } - - try { - Map<String,Double> langProbs = getLanguageProbabilities(ngrams); - - String maxLanguage = "x-unspecified"; - double maxLogProb = Double.NEGATIVE_INFINITY; - for (String lang : langProbs.keySet()) { - double prob = langProbs.get(lang); - if (prob > maxLogProb) { - maxLogProb = prob; - maxLanguage = lang; - } - System.out.println(lang + " - " + prob); - } - jcas.setDocumentLanguage(maxLanguage); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } - - private Map<String,Double> getLanguageProbabilities(List<String> ngrams) - throws Exception - { - Map<String,Double> langProbs = new HashMap<String,Double>(); - - for (String lang : providerMap.keySet()) { - - FrequencyCountProvider provider = providerMap.get(lang); - - long nrOfUnigrams = provider.getNrOfNgrams(1); - long nrOfBigrams = provider.getNrOfNgrams(2); - long nrOfTrigrams = provider.getNrOfNgrams(3); - - double textLogProbability = 0.0; - - for (String ngram : ngrams) { - - long frequency = provider.getFrequency(ngram); - - int ngramSize = FrequencyUtils.getPhraseLength(ngram); - - long normalization = 1; - int weighting = 1; - if (ngramSize == 1) { - normalization = nrOfUnigrams; - } - else if (ngramSize == 2) { - weighting = 2; - normalization = nrOfBigrams; - } - else if (ngramSize == 3) { - weighting = 4; - normalization = nrOfTrigrams; - } - - if (frequency > 0) { - double logProb = Math.log( weighting * ((double) frequency) / normalization ); - - textLogProbability += logProb; - } - else { - textLogProbability += Math.log( 1.0 / normalization); - } - } - - langProbs.put(lang, textLogProbability); - } - - return langProbs; - } - - private String getNgram(String ...strings) { - return StringUtils.join(strings, " "); - } +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ldweb1t; + +import static org.dkpro.core.frequency.Web1TProviderBase.BOS; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.api.frequency.util.FrequencyUtils; +import org.dkpro.core.ngrams.util.NGramStringIterable; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Language detector based on n-gram frequency counts, e.g. as provided by Web1T + */ +@Component(OperationType.LANGUAGE_IDENTIFIER) +@ResourceMetaData(name = "Web1T Language Detector") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class LanguageDetectorWeb1T + extends JCasAnnotator_ImplBase +{ + /** + * An array of external resources of frequency providers (one for each language that should be + * detected). + */ + public static final String RES_FREQUENCY_PROVIDER_RESOURCES = "frequencyProviders"; + @ExternalResource(key = RES_FREQUENCY_PROVIDER_RESOURCES, mandatory = true) + private FrequencyCountProvider[] frequencyProviders; + + /** + * The minimum n-gram size that should be considered. Default is 1. + */ + public static final String PARAM_MIN_NGRAM_SIZE = "minNGramSize"; + @ConfigurationParameter(name = PARAM_MIN_NGRAM_SIZE, mandatory = true, defaultValue = "1") + private int minNGramSize; + + /** + * The maximum n-gram size that should be considered. Default is 3. + */ + public static final String PARAM_MAX_NGRAM_SIZE = "maxNGramSize"; + @ConfigurationParameter(name = PARAM_MAX_NGRAM_SIZE, mandatory = true, defaultValue = "3") + private int maxNGramSize; + + private Map<String,FrequencyCountProvider> providerMap; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + providerMap = new HashMap<String,FrequencyCountProvider>(); + + for (FrequencyCountProvider provider : frequencyProviders) { + try { + providerMap.put(provider.getLanguage(), provider); + } + catch (Exception e) { + throw new ResourceInitializationException(e); + } + } + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + + List<String> words = JCasUtil.toText(JCasUtil.select(jcas, Token.class)); + + if (words.size() < 1) { + return; + } + + List<String> ngrams = new ArrayList<String>(); + if (words.size() > 1) { + ngrams.add(getNgram(BOS, words.get(0), words.get(1))); + } + + for (String ngram : new NGramStringIterable(words, 1, 3)) { + ngrams.add(ngram); + } + + try { + Map<String,Double> langProbs = getLanguageProbabilities(ngrams); + + String maxLanguage = "x-unspecified"; + double maxLogProb = Double.NEGATIVE_INFINITY; + for (String lang : langProbs.keySet()) { + double prob = langProbs.get(lang); + if (prob > maxLogProb) { + maxLogProb = prob; + maxLanguage = lang; + } + System.out.println(lang + " - " + prob); + } + jcas.setDocumentLanguage(maxLanguage); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + private Map<String,Double> getLanguageProbabilities(List<String> ngrams) + throws Exception + { + Map<String,Double> langProbs = new HashMap<String,Double>(); + + for (String lang : providerMap.keySet()) { + + FrequencyCountProvider provider = providerMap.get(lang); + + long nrOfUnigrams = provider.getNrOfNgrams(1); + long nrOfBigrams = provider.getNrOfNgrams(2); + long nrOfTrigrams = provider.getNrOfNgrams(3); + + double textLogProbability = 0.0; + + for (String ngram : ngrams) { + + long frequency = provider.getFrequency(ngram); + + int ngramSize = FrequencyUtils.getPhraseLength(ngram); + + long normalization = 1; + int weighting = 1; + if (ngramSize == 1) { + normalization = nrOfUnigrams; + } + else if (ngramSize == 2) { + weighting = 2; + normalization = nrOfBigrams; + } + else if (ngramSize == 3) { + weighting = 4; + normalization = nrOfTrigrams; + } + + if (frequency > 0) { + double logProb = Math.log( weighting * ((double) frequency) / normalization ); + + textLogProbability += logProb; + } + else { + textLogProbability += Math.log( 1.0 / normalization); + } + } + + langProbs.put(lang, textLogProbability); + } + + return langProbs; + } + + private String getNgram(String ...strings) { + return StringUtils.join(strings, " "); + } } diff --git a/dkpro-core-ldweb1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ldweb1t/LanguageDetectorWeb1TTest.java b/dkpro-core-ldweb1t-asl/src/test/java/org/dkpro/core/ldweb1t/LanguageDetectorWeb1TTest.java similarity index 84% rename from dkpro-core-ldweb1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ldweb1t/LanguageDetectorWeb1TTest.java rename to dkpro-core-ldweb1t-asl/src/test/java/org/dkpro/core/ldweb1t/LanguageDetectorWeb1TTest.java index 6ba45f2a21..aad20c0417 100644 --- a/dkpro-core-ldweb1t-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ldweb1t/LanguageDetectorWeb1TTest.java +++ b/dkpro-core-ldweb1t-asl/src/test/java/org/dkpro/core/ldweb1t/LanguageDetectorWeb1TTest.java @@ -15,10 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.ldweb1t; +package org.dkpro.core.ldweb1t; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; import static org.junit.Assert.assertEquals; @@ -29,25 +29,24 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ExternalResourceDescription; +import org.dkpro.core.frequency.resources.Web1TInMemoryFrequencyCountResource; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.frequency.resources.Web1TInMemoryFrequencyCountResource; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - public class LanguageDetectorWeb1TTest { @Test public void web1tLanguageDetectorTest() throws Exception { - ExternalResourceDescription en = createExternalResourceDescription( + ExternalResourceDescription en = createResourceDescription( Web1TInMemoryFrequencyCountResource.class, Web1TInMemoryFrequencyCountResource.PARAM_MODEL_LOCATION, "src/test/resources/web1t/en/", Web1TInMemoryFrequencyCountResource.PARAM_LANGUAGE, "en", Web1TInMemoryFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "2"); - ExternalResourceDescription de = createExternalResourceDescription( + ExternalResourceDescription de = createResourceDescription( Web1TInMemoryFrequencyCountResource.class, Web1TInMemoryFrequencyCountResource.PARAM_MODEL_LOCATION, "src/test/resources/web1t/de/", @@ -62,7 +61,7 @@ public void web1tLanguageDetectorTest() createEngineDescription(BreakIteratorSegmenter.class), createEngineDescription(LanguageDetectorWeb1T.class, LanguageDetectorWeb1T.PARAM_MAX_NGRAM_SIZE, 2, - LanguageDetectorWeb1T.PARAM_FREQUENCY_PROVIDER_RESOURCES, resources)); + LanguageDetectorWeb1T.RES_FREQUENCY_PROVIDER_RESOURCES, resources)); JCas jcas = JCasFactory.createJCas(); jcas.setDocumentText("This is an English example."); diff --git a/dkpro-core-lingpipe-gpl/.license-header.txt b/dkpro-core-lingpipe-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-lingpipe-gpl/.license-header.txt +++ b/dkpro-core-lingpipe-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-lingpipe-gpl/LICENSE.txt b/dkpro-core-lingpipe-gpl/LICENSE.txt index 6e22a15c3c..99ace43661 100644 --- a/dkpro-core-lingpipe-gpl/LICENSE.txt +++ b/dkpro-core-lingpipe-gpl/LICENSE.txt @@ -654,7 +654,7 @@ the "copyright" line and a pointer to where the full notice is found. GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. Also add information on how to contact you by electronic and paper mail. diff --git a/dkpro-core-lingpipe-gpl/pom.xml b/dkpro-core-lingpipe-gpl/pom.xml index ca4226358d..540ed90093 100644 --- a/dkpro-core-lingpipe-gpl/pom.xml +++ b/dkpro-core-lingpipe-gpl/pom.xml @@ -1,6 +1,6 @@ <!-- - Copyright 2007-2017 + Copyright 2007-2019 Ubiquitous Knowledge Processing (UKP) Lab Technische Universität Darmstadt @@ -15,22 +15,23 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-gpl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-gpl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-gpl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.lingpipe-gpl</artifactId> + <artifactId>dkpro-core-lingpipe-gpl</artifactId> <packaging>jar</packaging> <name>DKPro Core GPL - LingPipe (v ${lingpipe.version})</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> - <lingpipe.version>4.1.0</lingpipe.version> + <lingpipe.version>4.1.2-JL1.0</lingpipe.version> </properties> <dependencies> <dependency> @@ -55,28 +56,32 @@ <version>${lingpipe.version}</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.ner-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-ner-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -84,8 +89,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> diff --git a/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipePosTagger.java b/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipePosTagger.java deleted file mode 100644 index 28dbc4a7cb..0000000000 --- a/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipePosTagger.java +++ /dev/null @@ -1,196 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.lingpipe; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.fit.util.JCasUtil.toText; -import static org.apache.uima.util.Level.INFO; - -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.util.List; -import java.util.Locale; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import com.aliasi.hmm.HiddenMarkovModel; -import com.aliasi.hmm.HmmDecoder; -import com.aliasi.tag.Tagging; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * LingPipe part-of-speech tagger. - */ -@ResourceMetaData(name="LingPipe POS-Tagger") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) -public class LingPipePosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - /** - * Lingpipe models tend to be trained on lower-case tags, but our POS mappings use uppercase. - */ - public static final String PARAM_UPPERCASE_TAGS = "uppercaseTags"; - @ConfigurationParameter(name = PARAM_UPPERCASE_TAGS, mandatory = true, defaultValue="true") - protected boolean uppercaseTags; - - private CasConfigurableProviderBase<HmmDecoder> modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<HmmDecoder>(this, "lingpipe", "tagger") { - @Override - protected HmmDecoder produceResource(InputStream aStream) - throws Exception - { - ObjectInputStream ois = new ObjectInputStream(aStream); - HiddenMarkovModel hmm = (HiddenMarkovModel) ois.readObject(); - - SingletonTagset tags = new SingletonTagset(POS.class, getResourceMetaData() - .getProperty(("pos.tagset"))); - for (int n = 0; n < hmm.stateSymbolTable().numSymbols(); n++) { - String tag = hmm.stateSymbolTable().idToSymbol(n); - if (uppercaseTags) { - tag = tag.toUpperCase(Locale.US); - } - tags.add(tag); - } - addTagset(tags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return new HmmDecoder(hmm); - } - }; - - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - mappingProvider.configure(cas); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List<Token> tokens = selectCovered(aJCas, Token.class, sentence); - String[] tokenTexts = toText(tokens).toArray(new String[tokens.size()]); - - Tagging<String> tagging = modelProvider.getResource().tag(asList(tokenTexts)); - - for (int n = 0; n < tagging.size(); n++) { - Token t = tokens.get(n); - String tag = tagging.tag(n); - if (uppercaseTags) { - tag = tag.toUpperCase(Locale.US); - } - Type posTag = mappingProvider.getTagType(tag); - POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); - posAnno.setPosValue(internTags ? tag.intern() : tag); - POSUtils.assignCoarseValue(posAnno); - posAnno.addToIndexes(); - t.setPos(posAnno); - } - } - } -} diff --git a/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizer.java b/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizer.java similarity index 80% rename from dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizer.java rename to dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizer.java index 3d73e437be..6ea24f1c58 100644 --- a/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizer.java +++ b/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,14 +14,15 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.lingpipe; +package org.dkpro.core.lingpipe; import static java.util.Arrays.asList; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.toText; import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createNerMappingProvider; import java.io.InputStream; import java.io.ObjectInputStream; @@ -29,8 +30,6 @@ import java.util.List; import java.util.Map; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.reflect.FieldUtils; import org.apache.uima.UimaContext; @@ -43,6 +42,12 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; import com.aliasi.chunk.AbstractCharLmRescoringChunker; import com.aliasi.chunk.Chunk; @@ -53,17 +58,16 @@ import com.aliasi.hmm.HiddenMarkovModel; import com.aliasi.symbol.SymbolTable; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.constants.OperationType; /** * LingPipe named entity recognizer. */ -@ResourceMetaData(name="LingPipe Named Entity Recognizer") +@Component(OperationType.NAMED_ENTITITY_RECOGNIZER) +@ResourceMetaData(name = "LingPipe Named Entity Recognizer") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, @@ -94,6 +98,20 @@ public class LingPipeNamedEntityRecognizer @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -105,7 +123,8 @@ public class LingPipeNamedEntityRecognizer /** * Location of the mapping file for named entity tags to UIMA types. */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = + ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) protected String mappingLocation; @@ -120,6 +139,12 @@ public void initialize(UimaContext aContext) modelProvider = new ModelProviderBase<Chunker>(this, "lingpipe", "ner") { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/ner-${language}-${variant}.properties"); + } + @Override protected Chunker produceResource(InputStream aStream) throws Exception @@ -188,15 +213,8 @@ else if (chunker instanceof AbstractCharLmRescoringChunker) { } }; - mappingProvider = new MappingProvider(); - mappingProvider.setDefaultVariantsLocation( - "de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/ner-default-variants.map"); - mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/lingpipe/lib/ner-${language}-${variant}.map"); - mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName()); - mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation); - mappingProvider.setOverride(MappingProvider.LANGUAGE, language); - mappingProvider.setOverride(MappingProvider.VARIANT, variant); + mappingProvider = createNerMappingProvider(this, mappingLocation, language, variant, + modelProvider); } @Override diff --git a/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTrainer.java b/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTrainer.java similarity index 82% rename from dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTrainer.java rename to dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTrainer.java index 9662834ce2..e4a891ca1d 100644 --- a/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTrainer.java +++ b/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTrainer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,10 +14,37 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ +package org.dkpro.core.lingpipe; -package de.tudarmstadt.ukp.dkpro.core.lingpipe; +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.commons.io.IOUtils; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasConsumer_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.xml.sax.InputSource; import com.aliasi.chunk.BioTagChunkCodec; import com.aliasi.chunk.CharLmRescoringChunker; @@ -31,53 +58,39 @@ import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.aliasi.util.AbstractExternalizable; + import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import org.apache.commons.io.IOUtils; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasConsumer_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.xml.sax.InputSource; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.nio.charset.StandardCharsets; -import java.util.Collection; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.regex.Pattern; - -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.indexCovering; -import static org.apache.uima.fit.util.JCasUtil.select; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; /** * LingPipe named entity recognizer trainer. */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) @MimeTypeCapability(MimeTypes.APPLICATION_X_LINGPIPE_NER) +@Parameters( + exclude = { + LingPipeNamedEntityRecognizerTrainer.PARAM_TARGET_LOCATION }) @ResourceMetaData(name = "LingPipe Named Entity Recognizer Trainer") -public class LingPipeNamedEntityRecognizerTrainer extends JCasConsumer_ImplBase { - +public class LingPipeNamedEntityRecognizerTrainer + extends JCasConsumer_ImplBase +{ + /** + * Location to which the output is written. + */ public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) private File targetLocation; /** - * Regex to filter the {@link de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity#getValue() named entity} by - * type. + * Regex to filter the {@link de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity#getValue() + * named entity} by type. */ - public static final String PARAM_ACCEPTED_TAGS_REGEX = ComponentParameters.PARAM_ACCEPTED_TAGS_REGEX; + public static final String PARAM_ACCEPTED_TAGS_REGEX = + ComponentParameters.PARAM_ACCEPTED_TAGS_REGEX; @ConfigurationParameter(name = PARAM_ACCEPTED_TAGS_REGEX, mandatory = false) protected String acceptedTagsRegex; @@ -95,15 +108,15 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { if (tempData == null) { try { tempData = File.createTempFile("dkpro-lingpipe-ner-trainer", ".tsv"); - out = new PrintWriter( - new OutputStreamWriter(new FileOutputStream(tempData), StandardCharsets.UTF_8)); + out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(tempData), + StandardCharsets.UTF_8)); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } } - Map<Sentence, Collection<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); - Map<Token, Collection<NamedEntity>> neIndex = getNamedEntityIndex(aJCas); + Map<Sentence, List<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); + Map<Token, List<NamedEntity>> neIndex = getNamedEntityIndex(aJCas); for (Sentence sentence : select(aJCas, Sentence.class)) { Collection<Token> tokens = index.get(sentence); @@ -135,15 +148,16 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { } } - private Map<Token, Collection<NamedEntity>> getNamedEntityIndex(JCas aJCas) { - Map<Token, Collection<NamedEntity>> idx = indexCovered(aJCas, Token.class, NamedEntity.class); + private Map<Token, List<NamedEntity>> getNamedEntityIndex(JCas aJCas) { + Map<Token, List<NamedEntity>> idx = indexCovered(aJCas, Token.class, + NamedEntity.class); if (acceptedTagsRegex != null) { Pattern pattern = Pattern.compile(acceptedTagsRegex); - Map<Token, Collection<NamedEntity>> filteredIdx = new HashMap<>(); + Map<Token, List<NamedEntity>> filteredIdx = new HashMap<>(); for (Token token : idx.keySet()) { - Collection<NamedEntity> nes = new ArrayList<>(); + List<NamedEntity> nes = new ArrayList<>(); for (NamedEntity ne : idx.get(token)) { if (pattern.matcher(ne.getValue()).matches()) { diff --git a/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipePosTagger.java b/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipePosTagger.java new file mode 100644 index 0000000000..459145630a --- /dev/null +++ b/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipePosTagger.java @@ -0,0 +1,218 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.lingpipe; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.util.List; +import java.util.Locale; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; + +import com.aliasi.hmm.HiddenMarkovModel; +import com.aliasi.hmm.HmmDecoder; +import com.aliasi.tag.Tagging; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * LingPipe part-of-speech tagger. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "LingPipe POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) +public class LingPipePosTagger + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Lingpipe models tend to be trained on lower-case tags, but our POS mappings use uppercase. + */ + public static final String PARAM_UPPERCASE_TAGS = "uppercaseTags"; + @ConfigurationParameter(name = PARAM_UPPERCASE_TAGS, mandatory = true, defaultValue = "true") + protected boolean uppercaseTags; + + private CasConfigurableProviderBase<HmmDecoder> modelProvider; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<HmmDecoder>(this, "lingpipe", "tagger") { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/tagger-${language}-${variant}.properties"); + } + + @Override + protected HmmDecoder produceResource(InputStream aStream) + throws Exception + { + ObjectInputStream ois = new ObjectInputStream(aStream); + HiddenMarkovModel hmm = (HiddenMarkovModel) ois.readObject(); + + SingletonTagset tags = new SingletonTagset(POS.class, getResourceMetaData() + .getProperty(("pos.tagset"))); + for (int n = 0; n < hmm.stateSymbolTable().numSymbols(); n++) { + String tag = hmm.stateSymbolTable().idToSymbol(n); + if (uppercaseTags) { + tag = tag.toUpperCase(Locale.US); + } + tags.add(tag); + } + addTagset(tags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return new HmmDecoder(hmm); + } + }; + + mappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + modelProvider.configure(cas); + mappingProvider.configure(cas); + + for (Sentence sentence : select(aJCas, Sentence.class)) { + List<Token> tokens = selectCovered(aJCas, Token.class, sentence); + String[] tokenTexts = toText(tokens).toArray(new String[tokens.size()]); + + Tagging<String> tagging = modelProvider.getResource().tag(asList(tokenTexts)); + + for (int n = 0; n < tagging.size(); n++) { + Token t = tokens.get(n); + String tag = tagging.tag(n); + if (uppercaseTags) { + tag = tag.toUpperCase(Locale.US); + } + Type posTag = mappingProvider.getTagType(tag); + POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); + posAnno.setPosValue(tag != null ? tag.intern() : null); + POSUtils.assignCoarseValue(posAnno); + posAnno.addToIndexes(); + t.setPos(posAnno); + } + } + } +} diff --git a/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeSegmenter.java b/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeSegmenter.java similarity index 88% rename from dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeSegmenter.java rename to dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeSegmenter.java index 794cda402b..b680de4f6c 100644 --- a/dkpro-core-lingpipe-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeSegmenter.java +++ b/dkpro-core-lingpipe-gpl/src/main/java/org/dkpro/core/lingpipe/LingPipeSegmenter.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.lingpipe; +package org.dkpro.core.lingpipe; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -24,6 +24,7 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.segmentation.SegmenterBase; import com.aliasi.sentences.IndoEuropeanSentenceModel; import com.aliasi.sentences.SentenceModel; @@ -31,12 +32,13 @@ import com.aliasi.tokenizer.Tokenization; import com.aliasi.tokenizer.TokenizerFactory; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; +import eu.openminted.share.annotations.api.DocumentationResource; /** * LingPipe segmenter. */ -@ResourceMetaData(name="LingPipe Segmenter") +@ResourceMetaData(name = "LingPipe Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -73,7 +75,7 @@ protected void process(JCas aJCas, String aText, int aZoneBegin) int[] sentenceBoundaries = sentenceModel.boundaryIndices(toks.tokens(), toks.whitespaces()); if (sentenceBoundaries.length == 0) { if (toks.numTokens() > 0) { - createSentence(aJCas, toks.tokenStart(0), toks.tokenEnd(toks.numTokens()-1)); + createSentence(aJCas, toks.tokenStart(0), toks.tokenEnd(toks.numTokens() - 1)); } } else { diff --git a/dkpro-core-lingpipe-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/ner-default-variants.map b/dkpro-core-lingpipe-gpl/src/main/resources/org/dkpro/core/lingpipe/lib/ner-default-variants.map similarity index 100% rename from dkpro-core-lingpipe-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/ner-default-variants.map rename to dkpro-core-lingpipe-gpl/src/main/resources/org/dkpro/core/lingpipe/lib/ner-default-variants.map diff --git a/dkpro-core-lingpipe-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/ner-en-news-muc6.map b/dkpro-core-lingpipe-gpl/src/main/resources/org/dkpro/core/lingpipe/lib/ner-en-news-muc6.map similarity index 100% rename from dkpro-core-lingpipe-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/ner-en-news-muc6.map rename to dkpro-core-lingpipe-gpl/src/main/resources/org/dkpro/core/lingpipe/lib/ner-en-news-muc6.map diff --git a/dkpro-core-lingpipe-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/tagger-default-variants.map b/dkpro-core-lingpipe-gpl/src/main/resources/org/dkpro/core/lingpipe/lib/tagger-default-variants.map similarity index 100% rename from dkpro-core-lingpipe-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/tagger-default-variants.map rename to dkpro-core-lingpipe-gpl/src/main/resources/org/dkpro/core/lingpipe/lib/tagger-default-variants.map diff --git a/dkpro-core-lingpipe-gpl/src/scripts/build.xml b/dkpro-core-lingpipe-gpl/src/scripts/build.xml index 04fd8f4716..66f0d03749 100644 --- a/dkpro-core-lingpipe-gpl/src/scripts/build.xml +++ b/dkpro-core-lingpipe-gpl/src/scripts/build.xml @@ -1,6 +1,6 @@ <!-- - Copyright 2007-2017 + Copyright 2007-2019 Ubiquitous Knowledge Processing (UKP) Lab Technische Universität Darmstadt @@ -15,7 +15,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. --> <project basedir="../.." default="separate-jars"> diff --git a/dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTest.java b/dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTest.java similarity index 92% rename from dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTest.java rename to dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTest.java index 38e118b9eb..7355915b8c 100644 --- a/dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTest.java +++ b/dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipeNamedEntityRecognizerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,22 +14,23 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.lingpipe; +package org.dkpro.core.lingpipe; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.lingpipe.LingPipeNamedEntityRecognizer; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class LingPipeNamedEntityRecognizerTest { diff --git a/dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipePosTaggerTest.java b/dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipePosTaggerTest.java similarity index 89% rename from dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipePosTaggerTest.java rename to dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipePosTaggerTest.java index b748b6fe31..5eef0e21f3 100644 --- a/dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipePosTaggerTest.java +++ b/dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipePosTaggerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,22 +14,23 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.lingpipe; +package org.dkpro.core.lingpipe; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.lingpipe.LingPipePosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class LingPipePosTaggerTest { @@ -71,9 +72,9 @@ public void testEnglish() jcas = runTest("en", "bio-genia", "The quick brown fox jumps over the lazy dog . \n", - new String[] { "DT", "RB", "VBN", "NN", "NNS", "IN", "DT", "NN", "NN", "." }, - new String[] { "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_NOUN", - "POS_PUNCT" }); + new String[] { "DT", "RB", "VBN", "NN", "NNS", "IN", "DT", "NN", "NN", "." }, + new String[] { "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_ADP", + "POS_DET", "POS_NOUN", "POS_NOUN", "POS_PUNCT" }); String[] ptbTags = { "", "''", "(", ")", ",", "-", ".", ":", "CC", "CD", "CT", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "N", "NN", "NNP", "NNPS", "NNS", "PDT", @@ -87,9 +88,9 @@ public void testEnglish() jcas = runTest("en", "bio-medpost", "The quick brown fox jumps over the lazy dog . \n", - new String[] { "DD", "NN", "JJ", "NN", "NNS", "II", "DD", "NN", "NN", "." }, - new String[] { "POS_DET", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_NOUN", - "POS_PUNCT" }); + new String[] { "DD", "NN", "JJ", "NN", "NNS", "II", "DD", "NN", "NN", "." }, + new String[] { "POS_DET", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_ADP", + "POS_DET", "POS_NOUN", "POS_NOUN", "POS_PUNCT" }); String[] medpostTags = { "''", "(", ")", ",", ".", ":", "CC", "CC+", "CS", "CS+", "CSN", "CST", "DB", "DD", "EX", "GE", "II", "II+", "JJ", "JJ+", "JJR", "JJT", "MC", "NN", diff --git a/dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeSegmenterTest.java b/dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipeSegmenterTest.java similarity index 81% rename from dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeSegmenterTest.java rename to dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipeSegmenterTest.java index db3a2ef0ed..b825c4e3a5 100644 --- a/dkpro-core-lingpipe-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/lingpipe/LingPipeSegmenterTest.java +++ b/dkpro-core-lingpipe-gpl/src/test/java/org/dkpro/core/lingpipe/LingPipeSegmenterTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,19 +14,19 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.lingpipe; +package org.dkpro.core.lingpipe; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.lingpipe.LingPipeSegmenter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; - public class LingPipeSegmenterTest { @Test diff --git a/dkpro-core-lingpipe-gpl/src/test/resources/log4j.properties b/dkpro-core-lingpipe-gpl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-lingpipe-gpl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-lingpipe-gpl/src/test/resources/log4j2.xml b/dkpro-core-lingpipe-gpl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-lingpipe-gpl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-mallet-asl/pom.xml b/dkpro-core-mallet-asl/pom.xml index ee9d3a79fd..94f3fc42a6 100644 --- a/dkpro-core-mallet-asl/pom.xml +++ b/dkpro-core-mallet-asl/pom.xml @@ -18,15 +18,16 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.mallet-asl</artifactId> + <artifactId>dkpro-core-mallet-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Mallet (v${mallet.version}) (CPL)</name> <description>A DKPro Core component interfacing the Mallet LDA and Word Embeddings implementations.</description> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> <mallet.version>2.0.8</mallet.version> <maven.surefire.heap>1g</maven.surefire.heap> @@ -68,32 +69,36 @@ <artifactId>dkpro-core-api-embeddings-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-featurepath-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.io-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-io-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.tokit-asl</artifactId> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-tokit-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -102,8 +107,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.text-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-text-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -112,8 +117,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> </dependencies> @@ -142,7 +147,7 @@ <configuration> <!-- ignore annotations for "unused but declared" warnings --> <ignoredUnusedDeclaredDependencies combine.self="append"> - <!-- Dependency on older Trove version required by Mallet 2.0.8--> + <!-- Dependency on older Trove version required by Mallet 2.0.8 --> <ignoredUnusedDeclaredDependency>net.sf.trove4j:trove4j</ignoredUnusedDeclaredDependency> </ignoredUnusedDeclaredDependencies> </configuration> diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/MalletModelTrainer.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/MalletModelTrainer.java similarity index 82% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/MalletModelTrainer.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/MalletModelTrainer.java index 8b8d456897..0c40a276bd 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/MalletModelTrainer.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/MalletModelTrainer.java @@ -15,35 +15,36 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet; +package org.dkpro.core.mallet; + +import java.io.IOException; +import java.util.Locale; -import cc.mallet.pipe.TokenSequence2FeatureSequence; -import cc.mallet.types.Instance; -import cc.mallet.types.InstanceList; -import cc.mallet.types.TokenSequence; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelTrainer; -import de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.MalletEmbeddingsTrainer; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; +import org.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelTrainer; +import org.dkpro.core.mallet.wordembeddings.MalletEmbeddingsTrainer; -import java.io.IOException; -import java.util.Locale; +import cc.mallet.pipe.TokenSequence2FeatureSequence; +import cc.mallet.types.Instance; +import cc.mallet.types.InstanceList; +import cc.mallet.types.TokenSequence; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; /** * This abstract class defines parameters and methods that are common for Mallet model estimators. * <p> * It creates a Mallet {@link InstanceList} from the input documents so that inheriting estimators - * can create a model, typically implemented by overriding the {@link JCasFileWriter_ImplBase#collectionProcessComplete()} - * method. + * can create a model, typically implemented by overriding the + * {@link JCasFileWriter_ImplBase#collectionProcessComplete()} method. * * @see MalletEmbeddingsTrainer * @see MalletLdaTopicModelTrainer @@ -56,28 +57,29 @@ public abstract class MalletModelTrainer private static final Locale LOCALE = Locale.US; /** - * The annotation type to use as input tokens for the model estimation. - * Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}. - * For lemmas, for instance, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} + * The annotation type to use as input tokens for the model estimation. For lemmas, + * use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} */ public static final String PARAM_TOKEN_FEATURE_PATH = "tokenFeaturePath"; - @ConfigurationParameter(name = PARAM_TOKEN_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") + @ConfigurationParameter(name = PARAM_TOKEN_FEATURE_PATH, mandatory = true, + defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") private String tokenFeaturePath; /** - * The number of threads to use during model estimation. - * If not set, the number of threads is automatically set by {@link ComponentParameters#computeNumThreads(int)}. + * The number of threads to use during model estimation. If not set, the number of threads is + * automatically set by {@link ComponentParameters#computeNumThreads(int)}. * <p> - * Warning: do not set this to more than 1 when using very small (test) data sets on {@link MalletEmbeddingsTrainer}! - * This might prevent the process from terminating. + * Warning: do not set this to more than 1 when using very small (test) data sets on + * {@link MalletEmbeddingsTrainer}! This might prevent the process from terminating. */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; - @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS) + @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, + defaultValue = ComponentParameters.AUTO_NUM_THREADS) private int numThreads; /** - * Ignore tokens (or any other annotation type, as specified by {@link #PARAM_TOKEN_FEATURE_PATH}) - * that are shorter than the given value. Default: 3. + * Ignore tokens (or any other annotation type, as specified by + * {@link #PARAM_TOKEN_FEATURE_PATH}) that are shorter than the given value. */ public static final String PARAM_MIN_TOKEN_LENGTH = "minTokenLength"; @ConfigurationParameter(name = PARAM_MIN_TOKEN_LENGTH, mandatory = true, defaultValue = "3") @@ -126,12 +128,15 @@ public abstract class MalletModelTrainer private String stopwordsReplacement; /** - * Filter out all tokens matching that regular expression. + * Regular expression of tokens to be filtered. */ public static final String PARAM_FILTER_REGEX = "filterRegex"; @ConfigurationParameter(name = PARAM_FILTER_REGEX, mandatory = true, defaultValue = "") private String filterRegex; + /** + * Value with which tokens matching the regular expression are replaced. + */ public static final String PARAM_FILTER_REGEX_REPLACEMENT = "filterRegexReplacement"; @ConfigurationParameter(name = PARAM_FILTER_REGEX_REPLACEMENT, mandatory = true, defaultValue = "") private String filterRegexReplacement; @@ -149,7 +154,8 @@ public void initialize(UimaContext context) new IllegalArgumentException("No target location set!")); } - // locale should be set to US to define the output format of the Mallet models (especially decimal numbers). + // locale should be set to US to define the output format of the Mallet models (especially + // decimal numbers). Locale.setDefault(LOCALE); numThreads = ComponentParameters.computeNumThreads(numThreads); diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelInferencer.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelInferencer.java similarity index 85% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelInferencer.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelInferencer.java index 6587e60704..3043b90e70 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelInferencer.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelInferencer.java @@ -15,21 +15,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda; +package org.dkpro.core.mallet.lda; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; -import cc.mallet.pipe.Pipe; -import cc.mallet.pipe.TokenSequence2FeatureSequence; -import cc.mallet.topics.ParallelTopicModel; -import cc.mallet.topics.TopicInferencer; -import cc.mallet.types.Instance; -import cc.mallet.types.TokenSequence; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.mallet.MalletModelTrainer; -import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; import org.apache.commons.lang3.ArrayUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -41,54 +35,71 @@ import org.apache.uima.jcas.cas.DoubleArray; import org.apache.uima.jcas.cas.IntegerArray; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; +import org.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.mallet.MalletModelTrainer; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import cc.mallet.pipe.Pipe; +import cc.mallet.pipe.TokenSequence2FeatureSequence; +import cc.mallet.topics.ParallelTopicModel; +import cc.mallet.topics.TopicInferencer; +import cc.mallet.types.Instance; +import cc.mallet.types.TokenSequence; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Infers the topic distribution over documents using a Mallet {@link ParallelTopicModel}. */ -@ResourceMetaData(name="Mallet LDA Topic Model Inferencer") +@Component(OperationType.TOPIC_EXTRACTOR) +@ResourceMetaData(name = "Mallet LDA Topic Model Inferencer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution" } ) - public class MalletLdaTopicModelInferencer extends JCasAnnotator_ImplBase { private static final String NONE_LABEL = "X"; + /** + * Location from which the model is read. This is either a local path or a classpath location. + * In the latter case, the model artifact (if any) is searched as well. + */ public final static String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) private File modelLocation; /** - * The annotation type to use as tokens. Default: - * {@link de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token} + * The annotation type to use as tokens. */ public final static String PARAM_TYPE_NAME = "typeName"; @ConfigurationParameter(name = PARAM_TYPE_NAME, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") private String typeName; /** - * The number of iterations during inference. Default: 100. + * The number of iterations during inference. */ public final static String PARAM_N_ITERATIONS = "nIterations"; @ConfigurationParameter(name = PARAM_N_ITERATIONS, mandatory = true, defaultValue = "100") private int nIterations; /** - * The number of iterations before hyperparameter optimization begins. Default: 1 + * The number of iterations before hyper-parameter optimization begins. */ public final static String PARAM_BURN_IN = "burnIn"; @ConfigurationParameter(name = PARAM_BURN_IN, mandatory = true, defaultValue = "1") private int burnIn; + /** + * The number of iterations between saved samples. + */ public final static String PARAM_THINNING = "thinning"; @ConfigurationParameter(name = PARAM_THINNING, mandatory = true, defaultValue = "5") private int thinning; @@ -109,14 +120,16 @@ public class MalletLdaTopicModelInferencer private int maxTopicAssignments; /** - * The annotation type to use for the model. Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}. - * For lemmas, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} + * The annotation type to use for the model. For lemmas, use + * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} */ - public static final String PARAM_TOKEN_FEATURE_PATH = MalletModelTrainer.PARAM_TOKEN_FEATURE_PATH; - @ConfigurationParameter(name = PARAM_TOKEN_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") + public static final String PARAM_TOKEN_FEATURE_PATH = + MalletModelTrainer.PARAM_TOKEN_FEATURE_PATH; + @ConfigurationParameter(name = PARAM_TOKEN_FEATURE_PATH, mandatory = true, + defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") private String tokenFeaturePath; /** - * Ignore tokens (or lemmas, respectively) that are shorter than the given value. Default: 3. + * Ignore tokens (or lemmas, respectively) that are shorter than the given value. */ public static final String PARAM_MIN_TOKEN_LENGTH = "minTokenLength"; @ConfigurationParameter(name = PARAM_MIN_TOKEN_LENGTH, mandatory = true, defaultValue = "3") @@ -254,8 +267,8 @@ private int[] assignTopics(final double[] topicDistribution) if (indexes.size() > maxTopicAssignments) { /* sort index list by corresponding values */ - Collections.sort(indexes, - (aO1, aO2) -> Double.compare(topicDistribution[aO1], topicDistribution[aO2])); + Collections.sort(indexes, (aO1, aO2) -> + Double.compare(topicDistribution[aO1], topicDistribution[aO2])); while (indexes.size() > maxTopicAssignments) { indexes.remove(0); diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelTrainer.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelTrainer.java similarity index 82% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelTrainer.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelTrainer.java index 24da64e9b3..14d330a8f3 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelTrainer.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelTrainer.java @@ -15,27 +15,35 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda; +package org.dkpro.core.mallet.lda; + +import java.io.File; +import java.io.IOException; -import cc.mallet.topics.ParallelTopicModel; -import cc.mallet.types.Instance; -import de.tudarmstadt.ukp.dkpro.core.mallet.MalletModelTrainer; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.dkpro.core.mallet.MalletModelTrainer; -import java.io.File; -import java.io.IOException; +import cc.mallet.topics.ParallelTopicModel; +import cc.mallet.types.Instance; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Estimate an LDA topic model using Mallet and write it to a file. It stores all incoming CAS' to * Mallet {@link Instance}s before estimating the model, using a {@link ParallelTopicModel}. * <p> - * Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, etc.). + * Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, + * etc.). * <p> - * Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, paragraphs, etc.). + * Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, + * paragraphs, etc.). */ -@ResourceMetaData(name="Mallet LDA Topic Model Trainer") +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) +@ResourceMetaData(name = "Mallet LDA Topic Model Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class MalletLdaTopicModelTrainer extends MalletModelTrainer { @@ -47,21 +55,21 @@ public class MalletLdaTopicModelTrainer private int nTopics; /** - * The number of iterations during model estimation. Default: 1000. + * The number of iterations during model estimation. */ public static final String PARAM_N_ITERATIONS = "nIterations"; @ConfigurationParameter(name = PARAM_N_ITERATIONS, mandatory = true, defaultValue = "1000") private int nIterations; /** - * The number of iterations before hyper-parameter optimization begins. Default: 100 + * The number of iterations before hyper-parameter optimization begins. */ public static final String PARAM_BURNIN_PERIOD = "burninPeriod"; @ConfigurationParameter(name = PARAM_BURNIN_PERIOD, mandatory = true, defaultValue = "100") private int burninPeriod; /** - * Interval for optimizing Dirichlet hyper-parameters. Default: 50 + * Interval for optimizing Dirichlet hyper-parameters. */ public static final String PARAM_OPTIMIZE_INTERVAL = "optimizeInterval"; @ConfigurationParameter(name = PARAM_OPTIMIZE_INTERVAL, mandatory = true, defaultValue = "50") @@ -75,36 +83,35 @@ public class MalletLdaTopicModelTrainer private int randomSeed; /** - * Define how frequently a serialized model is saved to disk during estimation. Default: 0 (only save when - * estimation is done). + * Define how frequently an intermediate serialized model is saved to disk during estimation. */ public static final String PARAM_SAVE_INTERVAL = "saveInterval"; @ConfigurationParameter(name = PARAM_SAVE_INTERVAL, mandatory = true, defaultValue = "0") private int saveInterval; /** - * Use a symmetric alpha value during model estimation? Default: false. + * Use a symmetric alpha value during model estimation? */ public static final String PARAM_USE_SYMMETRIC_ALPHA = "useSymmetricAlpha"; @ConfigurationParameter(name = PARAM_USE_SYMMETRIC_ALPHA, mandatory = true, defaultValue = "false") private boolean useSymmetricAlpha; /** - * The interval in which to display the estimated topics. Default: 50. + * The interval in which to display the estimated topics. */ public static final String PARAM_DISPLAY_INTERVAL = "displayInterval"; @ConfigurationParameter(name = PARAM_DISPLAY_INTERVAL, mandatory = true, defaultValue = "50") private int displayInterval; /** - * The number of top words to display during estimation. Default: 7. + * The number of top words to display during estimation. */ public static final String PARAM_DISPLAY_N_TOPIC_WORDS = "displayNTopicWords"; @ConfigurationParameter(name = PARAM_DISPLAY_N_TOPIC_WORDS, mandatory = true, defaultValue = "7") private int displayNTopicWords; /** - * The sum of alphas over all topics. Default: 1.0. + * The sum of alphas over all topics. * <p> * Another recommended value is 50 / T (number of topics). */ @@ -113,7 +120,7 @@ public class MalletLdaTopicModelTrainer private float alphaSum; /** - * Beta for a single dimension of the Dirichlet prior. Default: 0.01. + * Beta for a single dimension of the Dirichlet prior. */ public static final String PARAM_BETA = "beta"; @ConfigurationParameter(name = PARAM_BETA, mandatory = true, defaultValue = "0.01f") diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelUtils.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelUtils.java similarity index 91% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelUtils.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelUtils.java index bb2be0d3ef..aef4e03717 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelUtils.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelUtils.java @@ -16,17 +16,22 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda; +package org.dkpro.core.mallet.lda; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; -import cc.mallet.topics.ParallelTopicModel; -import cc.mallet.types.Alphabet; -import cc.mallet.types.IDSorter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import java.io.File; -import java.io.IOException; -import java.util.*; +import cc.mallet.topics.ParallelTopicModel; +import cc.mallet.types.Alphabet; +import cc.mallet.types.IDSorter; public class MalletLdaTopicModelUtils { @@ -62,13 +67,14 @@ public static List<Map<String, Double>> getTopWords(File modelFile, int nWords, List<Map<String, Double>> topics = new ArrayList<>(model.getNumTopics()); - /* iterate over topics */ + // iterate over topics for (TreeSet<IDSorter> topic : model.getSortedWords()) { Map<String, Double> topicWords = new HashMap<>(nWords); - /* iterate over word IDs in topic (sorted by weight) */ + // iterate over word IDs in topic (sorted by weight) for (IDSorter id : topic) { - double weight = normalize ? id.getWeight() / alphabet.size() : id.getWeight(); // normalize + // normalize + double weight = normalize ? id.getWeight() / alphabet.size() : id.getWeight(); String word = (String) alphabet.lookupObject(id.getID()); topicWords.put(word, weight); diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriter.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriter.java similarity index 83% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriter.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriter.java index 73533950fd..74e397bfdc 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriter.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriter.java @@ -15,32 +15,34 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda.io; +package org.dkpro.core.mallet.lda.io; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; -import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; +import static org.apache.uima.fit.util.JCasUtil.select; import java.io.IOException; import java.io.OutputStream; import java.util.Locale; -import static org.apache.uima.fit.util.JCasUtil.select; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; /** - * Write topic proportions to a file in the shape {@code [<docId>\t]<topic_1>\t<topic_2>\t...<topic_n>} + * Write topic proportions to a file in the shape + * {@code [<docId>\t]<topic_1>\t<topic_2>\t...<topic_n>} * <p> * This writer depends on the {@link TopicDistribution} annotation which needs to be created by * {@link MalletLdaTopicModelInferencer} before. * </p> */ -@ResourceMetaData(name="Mallet LDA Topic Proportions Writer") +@ResourceMetaData(name = "Mallet LDA Topic Proportions Writer") public class MalletLdaTopicProportionsWriter extends JCasFileWriter_ImplBase { @@ -55,10 +57,11 @@ public class MalletLdaTopicProportionsWriter private boolean writeDocid; /** - * If {@link #PARAM_SINGULAR_TARGET} is set to false (default), this extension will be appended to the output - * files. Default: {@code .topics}. + * If {@link #PARAM_SINGULAR_TARGET} is set to false (default), this extension will be appended + * to the output files. */ - public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; + public static final String PARAM_FILENAME_EXTENSION = + ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".topics") private String filenameExtension; diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriter.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriter.java similarity index 86% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriter.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriter.java index 6917075a7b..a3c28f34bc 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriter.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriter.java @@ -15,19 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda.io; +package org.dkpro.core.mallet.lda.io; -import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; -import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; +import static org.apache.uima.fit.util.JCasUtil.selectSingle; import java.io.BufferedWriter; import java.io.File; @@ -38,20 +28,29 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; /** * Write the topic proportions according to an LDA topic model to an output file. The proportions * need to be inferred in a previous step using {@link MalletLdaTopicModelInferencer}. */ -@ResourceMetaData(name="Mallet LDA Sorted Topic Proportions Writer") +@ResourceMetaData(name = "Mallet LDA Sorted Topic Proportions Writer") public class MalletLdaTopicsProportionsSortedWriter extends JCasFileWriter_ImplBase { - public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; - @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) - private File targetLocation; - + /** + * Number of topics to generate. + */ public static final String PARAM_N_TOPICS = "nTopics"; @ConfigurationParameter(name = PARAM_N_TOPICS, mandatory = true, defaultValue = "3") private int nTopics; @@ -64,6 +63,7 @@ public void initialize(UimaContext context) { super.initialize(context); + File targetLocation = new File(getTargetLocation()); targetLocation.getParentFile().mkdirs(); try { writer = new BufferedWriter(new FileWriter(targetLocation)); @@ -125,6 +125,6 @@ public void collectionProcessComplete() catch (IOException e) { throw new AnalysisEngineProcessException(e); } - getLogger().info("Output written to " + targetLocation); + getLogger().info("Output written to " + getTargetLocation()); } } diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/util/PrintTopWords.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/util/PrintTopWords.java similarity index 98% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/util/PrintTopWords.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/util/PrintTopWords.java index 071f575fb0..78b04341ef 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/util/PrintTopWords.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/util/PrintTopWords.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda.util; +package org.dkpro.core.mallet.lda.util; import java.io.File; import java.io.IOException; diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/util/PrintTopicWordWeights.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/util/PrintTopicWordWeights.java similarity index 98% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/util/PrintTopicWordWeights.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/util/PrintTopicWordWeights.java index d72c41d9fa..9f44dadcfe 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/util/PrintTopicWordWeights.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/lda/util/PrintTopicWordWeights.java @@ -15,11 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda.util; - -import cc.mallet.topics.ParallelTopicModel; -import cc.mallet.types.Alphabet; -import cc.mallet.types.IDSorter; +package org.dkpro.core.mallet.lda.util; import java.io.BufferedWriter; import java.io.File; @@ -31,6 +27,10 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import cc.mallet.topics.ParallelTopicModel; +import cc.mallet.types.Alphabet; +import cc.mallet.types.IDSorter; + /** * Extract the n most important words for each topic in the given {@link ParallelTopicModel} files * and print them with normalized proportion to a new file. diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java similarity index 83% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java index e07f4c0708..899906966c 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java @@ -15,11 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings; +package org.dkpro.core.mallet.wordembeddings; + +import java.io.File; +import java.io.IOException; +import java.util.Optional; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.mallet.MalletModelTrainer; -import de.tudarmstadt.ukp.dkpro.core.mallet.type.WordEmbedding; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.Type; @@ -35,17 +36,19 @@ import org.dkpro.core.api.embeddings.Vectorizer; import org.dkpro.core.api.embeddings.binary.BinaryVectorizer; import org.dkpro.core.api.embeddings.text.TextFormatVectorizer; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.mallet.MalletModelTrainer; -import java.io.File; -import java.io.IOException; -import java.util.Optional; +import de.tudarmstadt.ukp.dkpro.core.mallet.type.WordEmbedding; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Reads word embeddings from a file and adds {@link WordEmbedding} annotations to tokens/lemmas. * * @since 1.9.0 */ -@ResourceMetaData(name="Mallet Embeddings Annotator") +@ResourceMetaData(name = "Mallet Embeddings Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.mallet.type.WordEmbedding" } @@ -63,6 +66,9 @@ public class MalletEmbeddingsAnnotator @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) private File modelLocation; + /** + * Whether the model is in binary format instead of text format. + */ public static final String PARAM_MODEL_IS_BINARY = "modelIsBinary"; @ConfigurationParameter(name = PARAM_MODEL_IS_BINARY, mandatory = true, defaultValue = "false") private boolean modelIsBinary; @@ -72,8 +78,10 @@ public class MalletEmbeddingsAnnotator * Specify how to handle unknown tokens: * <ol> * <li>If this parameter is not specified, unknown tokens are not annotated.</li> - * <li>If an empty float[] is passed, a random vector is generated that is used for each unknown token.</li> - * <li>If a float[] is passed, each unknown token is annotated with that vector. The float must have the same length as the vectors in the model file.</li> + * <li>If an empty float[] is passed, a random vector is generated that is used for each unknown + * token.</li> + * <li>If a float[] is passed, each unknown token is annotated with that vector. The float must + * have the same length as the vectors in the model file.</li> * </ol> */ public static final String PARAM_ANNOTATE_UNKNOWN_TOKENS = "annotateUnknownTokens"; @@ -81,18 +89,20 @@ public class MalletEmbeddingsAnnotator private boolean annotateUnknownTokens; /** - * If set to true (default: false), the first line is interpreted as header line containing the number of entries and the dimensionality. - * This should be set to true for models generated with Word2Vec. + * If set to true (default: false), the first line is interpreted as header line containing the + * number of entries and the dimensionality. This should be set to true for models generated + * with Word2Vec. */ public static final String PARAM_MODEL_HAS_HEADER = "modelHasHeader"; @ConfigurationParameter(name = PARAM_MODEL_HAS_HEADER, mandatory = true, defaultValue = "false") private boolean modelHasHeader; /** - * The annotation type to use for the model. Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}. - * For lemmas, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} + * The annotation type to use for the model. For lemmas, use + * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} */ - public static final String PARAM_TOKEN_FEATURE_PATH = MalletModelTrainer.PARAM_TOKEN_FEATURE_PATH; + public static final String PARAM_TOKEN_FEATURE_PATH = + MalletModelTrainer.PARAM_TOKEN_FEATURE_PATH; @ConfigurationParameter(name = PARAM_TOKEN_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") private String tokenFeaturePath; @@ -168,12 +178,15 @@ private void addAnnotation(JCas aJCas, String text, int begin, int end) /** * If {@link #PARAM_ANNOTATE_UNKNOWN_TOKENS} is set to true, always return a vector retrieved - * from the vectorizer, which should hold a stable random vector for unknown tokens. - * Otherwise, return a vector for known tokens, or none if the token is unknown. + * from the vectorizer, which should hold a stable random vector for unknown tokens. Otherwise, + * return a vector for known tokens, or none if the token is unknown. * - * @param token a token for which to look up an embedding - * @return an {@code Optional<float[]>} that holds the token embedding or is empty if no embeddings is available for the token - * @throws IOException if an I/O error occurs + * @param token + * a token for which to look up an embedding + * @return an {@code Optional<float[]>} that holds the token embedding or is empty if no + * embeddings is available for the token + * @throws IOException + * if an I/O error occurs */ private Optional<float[]> getVector(String token) throws IOException diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainer.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainer.java similarity index 89% rename from dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainer.java rename to dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainer.java index 01affbf8d4..fae2a7f1bc 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainer.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainer.java @@ -15,32 +15,37 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings; - -import cc.mallet.topics.WordEmbeddings; -import cc.mallet.types.Alphabet; -import cc.mallet.types.InstanceList; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.mallet.MalletModelTrainer; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; +package org.dkpro.core.mallet.wordembeddings; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.PrintWriter; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.mallet.MalletModelTrainer; + +import cc.mallet.topics.WordEmbeddings; +import cc.mallet.types.Alphabet; +import cc.mallet.types.InstanceList; +import eu.openminted.share.annotations.api.DocumentationResource; + /** * Compute word embeddings from the given collection using skip-grams. * <p> - * Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, etc.). + * Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, + * etc.). * <p> - * Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, paragraphs, etc.). + * Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, + * paragraphs, etc.). * * @since 1.9.0 */ -@ResourceMetaData(name="Mallet Embeddings Trainer") +@ResourceMetaData(name = "Mallet Embeddings Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class MalletEmbeddingsTrainer extends MalletModelTrainer { @@ -66,7 +71,8 @@ public class MalletEmbeddingsTrainer private int windowSize; /** - * An example word that is output with its nearest neighbours once in a while (default: null, i.e. none). + * An example word that is output with its nearest neighbours once in a while (default: null, + * i.e. none). */ public static final String PARAM_EXAMPLE_WORD = "exampleWord"; @ConfigurationParameter(name = PARAM_EXAMPLE_WORD, mandatory = false) @@ -102,7 +108,7 @@ public void collectionProcessComplete() matrix.countWords(instanceList); matrix.train(instanceList, getNumThreads(), numNegativeSamples); - assert(getTargetLocation() != null); + assert getTargetLocation() != null; getLogger().info("Writing output to " + getTargetLocation()); File targetFile = new File(getTargetLocation()); if (targetFile.getParentFile() != null) { diff --git a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotatorTest.java b/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotatorTest.java index dc2496890a..73dd8cbeff 100644 --- a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotatorTest.java +++ b/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotatorTest.java @@ -17,10 +17,18 @@ */ package de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.mallet.type.WordEmbedding; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.Arrays; + import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; @@ -28,20 +36,14 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.dkpro.core.api.embeddings.VectorizerUtils; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mallet.wordembeddings.MalletEmbeddingsAnnotator; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Before; import org.junit.Test; -import java.io.File; -import java.io.IOException; -import java.net.URISyntaxException; -import java.util.Arrays; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.mallet.type.WordEmbedding; public class MalletEmbeddingsAnnotatorTest { @@ -201,4 +203,4 @@ private static void testEmbeddingAnnotations(CollectionReaderDescription reader, } } } -} \ No newline at end of file +} diff --git a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainerTest.java b/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainerTest.java index 4d393c318b..a012497aa3 100644 --- a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainerTest.java +++ b/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/MalletEmbeddingsTrainerTest.java @@ -17,20 +17,10 @@ */ package de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.resource.ResourceInitializationException; -import org.junit.Rule; -import org.junit.Test; +import static junit.framework.TestCase.assertEquals; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertTrue; import java.io.BufferedReader; import java.io.File; @@ -40,10 +30,22 @@ import java.util.Arrays; import java.util.List; -import static junit.framework.TestCase.assertEquals; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertTrue; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.resources.CompressionMethod; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mallet.wordembeddings.MalletEmbeddingsTrainer; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; public class MalletEmbeddingsTrainerTest { @@ -250,5 +252,4 @@ public void testCharacterEmbeddingsTokens() .map(line -> Arrays.copyOfRange(line, 1, dimensions)) .forEach(array -> Arrays.stream(array).forEach(Double::parseDouble)); } - -} \ No newline at end of file +} diff --git a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelInferencerTest.java b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelInferencerTest.java similarity index 90% rename from dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelInferencerTest.java rename to dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelInferencerTest.java index 0847d68009..e64b336ac3 100644 --- a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelInferencerTest.java +++ b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelInferencerTest.java @@ -16,28 +16,30 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda; +package org.dkpro.core.mallet.lda; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.mallet.lda.MalletLdaUtil.trainModel; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Rule; import org.junit.Test; -import java.io.File; -import java.io.IOException; - -import static de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaUtil.trainModel; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertTrue; +import de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution; public class MalletLdaTopicModelInferencerTest { diff --git a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelTrainerTest.java b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelTrainerTest.java similarity index 95% rename from dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelTrainerTest.java rename to dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelTrainerTest.java index a899f64f0e..c07e97e119 100644 --- a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelTrainerTest.java +++ b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelTrainerTest.java @@ -15,25 +15,27 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda; +package org.dkpro.core.mallet.lda; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; -import cc.mallet.topics.ParallelTopicModel; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelTrainer; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Rule; import org.junit.Test; -import java.io.File; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import cc.mallet.topics.ParallelTopicModel; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; public class MalletLdaTopicModelTrainerTest { diff --git a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelUtilsTest.java b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelUtilsTest.java similarity index 92% rename from dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelUtilsTest.java rename to dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelUtilsTest.java index 998dfa8789..2617f3c852 100644 --- a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaTopicModelUtilsTest.java +++ b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaTopicModelUtilsTest.java @@ -16,17 +16,18 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda; +package org.dkpro.core.mallet.lda; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import org.junit.Rule; -import org.junit.Test; +import static org.junit.Assert.assertEquals; import java.io.File; import java.util.List; import java.util.Map; -import static org.junit.Assert.assertEquals; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelUtils; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; public class MalletLdaTopicModelUtilsTest { diff --git a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaUtil.java b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaUtil.java similarity index 92% rename from dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaUtil.java rename to dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaUtil.java index 421202fea3..4887223535 100644 --- a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/MalletLdaUtil.java +++ b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/MalletLdaUtil.java @@ -15,20 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda; +package org.dkpro.core.mallet.lda; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.SimplePipeline; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import java.io.File; import java.io.IOException; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelTrainer; +import org.dkpro.core.tokit.BreakIteratorSegmenter; public class MalletLdaUtil { diff --git a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriterTest.java b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriterTest.java similarity index 95% rename from dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriterTest.java rename to dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriterTest.java index 88e8ba8b52..e9e9cbc11c 100644 --- a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriterTest.java +++ b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicProportionsWriterTest.java @@ -15,31 +15,32 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda.io; +package org.dkpro.core.mallet.lda.io; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; -import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaUtil; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; import org.apache.commons.io.FileUtils; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; +import org.dkpro.core.mallet.lda.MalletLdaUtil; +import org.dkpro.core.mallet.lda.io.MalletLdaTopicProportionsWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Rule; import org.junit.Test; -import java.io.File; -import java.io.IOException; -import java.net.URISyntaxException; -import java.util.List; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - /** * */ diff --git a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriterTest.java b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriterTest.java similarity index 90% rename from dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriterTest.java rename to dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriterTest.java index 4fe2407024..6fd19f7362 100644 --- a/dkpro-core-mallet-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriterTest.java +++ b/dkpro-core-mallet-asl/src/test/java/org/dkpro/core/mallet/lda/io/MalletLdaTopicsProportionsSortedWriterTest.java @@ -15,30 +15,31 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mallet.lda.io; +package org.dkpro.core.mallet.lda.io; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.dkpro.core.mallet.lda.MalletLdaUtil.trainModel; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; import org.apache.commons.io.FileUtils; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer; +import org.dkpro.core.mallet.lda.io.MalletLdaTopicsProportionsSortedWriter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Rule; import org.junit.Test; -import java.io.File; -import java.io.IOException; -import java.util.List; - -import static de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaUtil.trainModel; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - /** * */ diff --git a/dkpro-core-mallet-asl/suppressions.xml b/dkpro-core-mallet-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-mallet-asl/suppressions.xml @@ -0,0 +1,9 @@ +<?xml version="1.0"?> + +<!DOCTYPE suppressions PUBLIC +"-//Puppy Crawl//DTD Suppressions 1.1//EN" +"http://www.puppycrawl.com/dtds/suppressions_1_1.dtd"> + +<suppressions> + <suppress files=".*[/\\]target[/\\].*" checks=".*"/> +</suppressions> diff --git a/dkpro-core-maltparser-asl/pom.xml b/dkpro-core-maltparser-asl/pom.xml index b7feeacbfe..bb3f67b7fd 100644 --- a/dkpro-core-maltparser-asl/pom.xml +++ b/dkpro-core-maltparser-asl/pom.xml @@ -18,16 +18,17 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.maltparser-asl</artifactId> + <artifactId>dkpro-core-maltparser-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - MaltParser (v ${maltparser.version})</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> - <maltparser.version>1.9.1</maltparser.version> + <maltparser.version>1.9.2</maltparser.version> </properties> <dependencies> <dependency> @@ -52,42 +53,46 @@ <version>${maltparser.version}</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.hunpos-asl</artifactId> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-hunpos-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -159,16 +164,16 @@ <dependencyManagement> <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <scope>import</scope> <type>pom</type> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.hunpos-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-hunpos-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <scope>import</scope> <type>pom</type> </dependency> diff --git a/dkpro-core-maltparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/maltparser/MaltParser.java b/dkpro-core-maltparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/maltparser/MaltParser.java deleted file mode 100644 index f16474e2c8..0000000000 --- a/dkpro-core-maltparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/maltparser/MaltParser.java +++ /dev/null @@ -1,525 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.maltparser; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.URL; -import java.util.HashSet; -import java.util.List; -import java.util.Properties; -import java.util.Set; -import java.util.jar.JarEntry; -import java.util.jar.JarInputStream; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_component.AnalysisComponent; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.maltparser.MaltParserService; -import org.maltparser.core.exception.MaltChainedException; -import org.maltparser.core.options.OptionManager; -import org.maltparser.core.symbol.SymbolTable; -import org.maltparser.core.symbol.parse.ParseSymbolTable; -import org.maltparser.core.syntaxgraph.DependencyStructure; -import org.maltparser.core.syntaxgraph.edge.Edge; -import org.maltparser.core.syntaxgraph.node.TokenNode; -import org.maltparser.parser.SingleMalt; -import org.springframework.beans.PropertyAccessor; -import org.springframework.beans.PropertyAccessorFactory; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; - -/** - * Dependency parsing using MaltPaser. - * <p> - * Required annotations: - * </p> - * <ul> - * <li>Token</li> - * <li>Sentence</li> - * <li>POS</li> - * </ul> - * - * Generated annotations: - * <ul> - * <li>Dependency (annotated over sentence-span)</li> - * </ul> - */ -@ResourceMetaData(name="MaltParser Dependency Parser") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) - -public class MaltParser - extends JCasAnnotator_ImplBase -{ - private static final String UNUSED = "_"; - - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - /** - * Process anyway, even if the model relies on features that are not supported by this - * component. - * - * Default: {@code false} - */ - public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; - @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") - protected boolean ignoreMissingFeatures; - - // Not sure if we'll ever have to use different symbol tables - // public static final String SYMBOL_TABLE = "symbolTableName"; - // @ConfigurationParameter(name = SYMBOL_TABLE, mandatory = true, defaultValue = "DEPREL") - private final String symbolTableName = "DEPREL"; - - private Logger logger; - private SymbolTable symbolTable; - private File workingDir; - - private CasConfigurableProviderBase<MaltParserService> modelProvider; - private Set<String> features; - - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - logger = getContext().getLogger(); - - try { - workingDir = File.createTempFile("maltparser", ".tmp"); - workingDir.delete(); - workingDir.mkdirs(); - workingDir.deleteOnExit(); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - - modelProvider = new ModelProviderBase<MaltParserService>(this, "maltparser", "parser") { - private MaltParserService parser; - - { - setDefault(VARIANT, "linear"); - } - - @Override - protected MaltParserService produceResource(URL aUrl) throws IOException - { - if (parser != null) { - // Terminates the parser model - try { - parser.terminateParserModel(); - parser = null; - } - catch (MaltChainedException e) { - logger.log(Level.SEVERE, - "MaltParser exception while terminating parser model: " + e.getMessage()); - } - } - - try { - // Warn if the model uses features that we currently do not support - features = getFeatures(aUrl); - Set<String> unsupportedFeatures = new HashSet<String>(features); - getLogger().info("Model uses these features: " + features); - unsupportedFeatures.remove("FORM"); // we know covered text - unsupportedFeatures.remove("LEMMA"); // we know lemma if lemmatizer ran before - unsupportedFeatures.remove("POSTAG"); // we know POS tag if POS tagger ran before - // CPOSTAG - only supported if we know a mapping from POSTAG to CPOSTAG (FIXME) - // FEATS - not properly supported in DKPro Core yet! (FIXME) - if (!unsupportedFeatures.isEmpty()) { - String message = "Model these uses unsupported features: " + unsupportedFeatures; - if (ignoreMissingFeatures) { - getLogger().warn(message); - } - else { - throw new IOException(message); - } - } - - // However, Maltparser is not happy at all if the model file does not have the right - // name, so we are forced to create a temporary directory and place the file there. - File modelFile = new File(workingDir, getRealName(aUrl)); - if (!modelFile.exists()) { - InputStream is = null; - OutputStream os = null; - try { - is = aUrl.openStream(); - os = new FileOutputStream(modelFile); - IOUtils.copy(is, os); - modelFile.deleteOnExit(); - } - finally { - IOUtils.closeQuietly(is); - IOUtils.closeQuietly(os); - } - } - - // Maltparser has a very odd way of finding out which command line options it supports. - // By manually initializing the OptionManager before Maltparser tries it, we can work - // around Maltparsers' own broken code. - if (OptionManager.instance().getOptionContainerIndices().size() == 0) { - OptionManager.instance().loadOptionDescriptionFile( - MaltParserService.class.getResource("/appdata/options.xml")); - OptionManager.instance().generateMaps(); - } - - // Ok, now we can finally initialize the parser - parser = new MaltParserService(); - parser.initializeParserModel("-w " + workingDir + " -c " + modelFile.getName() - + " -m parse"); - // parser.initializeParserModel("-u " + modelUrl.toString() + " -m parse"); - - - Properties metadata = getResourceMetaData(); - - PropertyAccessor paDirect = PropertyAccessorFactory.forDirectFieldAccess(parser); - SingleMalt singleMalt = (SingleMalt) paDirect.getPropertyValue("singleMalt"); - - SingletonTagset posTags = new SingletonTagset( - POS.class, metadata.getProperty("pos.tagset")); - ParseSymbolTable posTagTable = (ParseSymbolTable) singleMalt.getSymbolTables() - .getSymbolTable("POSTAG"); - for (int i = 0; i < posTagTable.getValueCounter(); i++) { - posTags.add(posTagTable.getSymbolCodeToString(i)); - } - posTags.remove("#null#"); // Technical symbol introduced in MaltParser 1.8 - addTagset(posTags, false); - - SingletonTagset depTags = new SingletonTagset( - Dependency.class, metadata.getProperty("dependency.tagset")); - ParseSymbolTable depRelTable = (ParseSymbolTable) singleMalt.getSymbolTables() - .getSymbolTable("DEPREL"); - for (int i = 0; i < depRelTable.getValueCounter(); i++) { - depTags.add(depRelTable.getSymbolCodeToString(i)); - } - depTags.remove("#null#"); // Technical symbol introduced in MaltParser 1.8 - addTagset(depTags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return parser; - } - catch (MaltChainedException e) { - logger.log(Level.SEVERE, - "MaltParser exception while initializing parser model: " + e.getMessage()); - throw new IOException(e); - } - } - }; - } - - /** - * @see AnalysisComponent#collectionProcessComplete() - */ - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - if (workingDir != null && workingDir.isDirectory()) { - FileUtils.deleteQuietly(workingDir); - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - modelProvider.configure(aJCas.getCas()); - - // Iterate over all sentences - for (Sentence curSentence : select(aJCas, Sentence.class)) { - - // Generate list of tokens for current sentence - List<Token> tokens = selectCovered(Token.class, curSentence); - - // Generate input format required by parser - String[] parserInput = new String[tokens.size()]; - for (int i = 0; i < parserInput.length; i++) { - Token t = tokens.get(i); - - int id = i + 1; - String form = t.getText(); - String lemma = UNUSED; - String cpostag = UNUSED; - String postag = UNUSED; - String feats = UNUSED; - - if (features.contains("LEMMA")) { - if (t.getLemma() != null) { - lemma = t.getLemma().getValue(); - } - else if (!ignoreMissingFeatures) { - throw new IllegalStateException( - "Model uses feature LEMMA but there is no lemma information in CAS"); - } - } - - // Actually, this cannot work, because we only know about the DKPro Core coarse - // grained categories, which are most likely different from the coarse-grained - // categories required by the model. We would need to include a mapping with the - // model to recover the required coarse grained categories from the fine-grained - // categories in POSTAG. - if (features.contains("CPOSTAG")) { -// if (t.getPos() != null) { -// cpostag = t.getPos().getPosValue(); -// } -// else - if (!ignoreMissingFeatures) { - throw new IllegalStateException( - "Model uses feature CPOSTAG but there is no part-of-speech information in CAS"); - } - } - - if (features.contains("POSTAG")) { - if (t.getPos() != null) { - postag = t.getPos().getPosValue(); - } - else if (!ignoreMissingFeatures) { - throw new IllegalStateException( - "Model uses feature POSTAG but there is no part-of-speech information in CAS"); - } - } - - if (features.contains("FEATS")) { - if (t.getMorph() != null) { - feats = t.getMorph().getValue(); - } - else - if (!ignoreMissingFeatures) { - throw new IllegalStateException( - "Model uses feature FEATS but there is no morphology information in CAS"); - } - } - - // This only works for the English model. Other models have different input - // formats. See http://www.maltparser.org/mco/mco.html - parserInput[i] = String.format("%d\t%s\t%s\t%s\t%s\t%s", id, form, lemma, cpostag, - postag, feats); - } - - // Parse sentence - DependencyStructure graph = null; - try { - // Parses the sentence - graph = modelProvider.getResource().parse(parserInput); - symbolTable = graph.getSymbolTables().getSymbolTable(symbolTableName); - } - catch (MaltChainedException e) { - logger.log(Level.WARNING, - "MaltParser exception while parsing sentence: " + e.getMessage(), e); - // don't pass on exception - go on with next sentence - continue; - } - - /* - * Generate annotations: NOTE: Index of token in tokenList corresponds to node in - * DependencyGraph with NodeIndex+1 - */ - try { - // iterate over all tokens in current sentence - for (int i = 0; i < tokens.size(); i++) { - // Start with Node 1 - we omit ROOT-dependencies, - // because we don't have a ROOT-token. - TokenNode curNode = graph.getTokenNode(i + 1); - - // iterate over all dependencies for current token - for (Edge edge : curNode.getHeadEdges()) { - int sourceIdx = edge.getSource().getIndex(); - int targetIdx = edge.getTarget().getIndex(); - - // get corresponding token for node in DependencyGraph - Token sourceToken = sourceIdx > 0 ? tokens.get(sourceIdx - 1) : null; - Token targetToken = targetIdx > 0 ? tokens.get(targetIdx - 1) : null; - - // create dep-annotation for current edge - if (sourceToken != null && targetToken != null) { - Dependency dep = new Dependency(aJCas); - dep.setDependencyType(edge.getLabelSymbol(symbolTable)); - dep.setFlavor(DependencyFlavor.BASIC); - dep.setGovernor(sourceToken); // TODO check if source=Governor - dep.setDependent(targetToken); // TODO check if target=Dependent - dep.setBegin(dep.getDependent().getBegin()); - dep.setEnd(dep.getDependent().getEnd()); - dep.addToIndexes(); - } - else if (targetToken != null && sourceToken == null) { - Dependency dep = new ROOT(aJCas); - // Trying to get the label triggers Exception - dep.setDependencyType("ROOT"); - dep.setFlavor(DependencyFlavor.BASIC); - dep.setGovernor(targetToken); - dep.setDependent(targetToken); - dep.setBegin(dep.getDependent().getBegin()); - dep.setEnd(dep.getDependent().getEnd()); - dep.addToIndexes(); - } - else { - throw new IllegalStateException("Source token must exist."); - } - } - } - } - catch (MaltChainedException e) { - logger.log(Level.WARNING, "MaltParser exception creating dependency annotations: " - + e.getMessage(), e); - // don't pass on exception - go on with next sentence - continue; - } - } - } - - private String getRealName(URL aUrl) throws IOException - { - JarEntry je = null; - JarInputStream jis = null; - - try { - jis = new JarInputStream(aUrl.openConnection().getInputStream()); - while ((je = jis.getNextJarEntry()) != null) { - String entryName = je.getName(); - if (entryName.endsWith(".info")) { - int indexUnderScore = entryName.lastIndexOf('_'); - int indexSeparator = entryName.lastIndexOf(File.separator); - if (indexSeparator == -1) { - indexSeparator = entryName.lastIndexOf('/'); - } - if (indexSeparator == -1) { - indexSeparator = entryName.lastIndexOf('\\'); - } - int indexDot = entryName.lastIndexOf('.'); - if (indexUnderScore == -1 || indexDot == -1) { - throw new IllegalStateException( - "Could not find the configuration name and type from the URL '" - + aUrl.toString() + "'. "); - } - - return entryName.substring(indexSeparator+1, indexUnderScore) + ".mco"; - } - } - - throw new IllegalStateException( - "Could not find the configuration name and type from the URL '" - + aUrl.toString() + "'. "); - } - finally { - IOUtils.closeQuietly(jis); - } - } - - private Set<String> getFeatures(URL aUrl) throws IOException - { - JarEntry je = null; - JarInputStream jis = null; - - try { - jis = new JarInputStream(aUrl.openConnection().getInputStream()); - while ((je = jis.getNextJarEntry()) != null) { - String entryName = je.getName(); - - if (entryName.endsWith(".info")) { - Set<String> features = new HashSet<String>(); - - for (String line : IOUtils.readLines(jis, "UTF-8")) { - if (line.contains("InputColumn(")) { - int offset = line.indexOf("InputColumn("); - while (offset >= 0) { - int comma = line.indexOf(',', offset+1); - features.add(line.substring(offset+12,comma).trim()); - offset = line.indexOf("InputColumn(", comma); - } - } - } - - return features; - } - } - - throw new IllegalStateException( - "Could not find the configuration name and type from the URL '" - + aUrl.toString() + "'. "); - } - finally { - IOUtils.closeQuietly(jis); - } - } -} diff --git a/dkpro-core-maltparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/maltparser/package-info.java b/dkpro-core-maltparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/maltparser/package-info.java deleted file mode 100644 index c782e7e875..0000000000 --- a/dkpro-core-maltparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/maltparser/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Integration of the <a href="http://www.maltparser.org/">MaltParser</a> dependency parser. - * - * @since 1.4.0 - */ -package de.tudarmstadt.ukp.dkpro.core.maltparser; diff --git a/dkpro-core-maltparser-asl/src/main/java/org/dkpro/core/maltparser/MaltParser.java b/dkpro-core-maltparser-asl/src/main/java/org/dkpro/core/maltparser/MaltParser.java new file mode 100644 index 0000000000..a8e3aadb65 --- /dev/null +++ b/dkpro-core-maltparser-asl/src/main/java/org/dkpro/core/maltparser/MaltParser.java @@ -0,0 +1,550 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.maltparser; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URL; +import java.util.HashSet; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.jar.JarEntry; +import java.util.jar.JarInputStream; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.AnalysisComponent; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Level; +import org.apache.uima.util.Logger; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.maltparser.MaltParserService; +import org.maltparser.core.exception.MaltChainedException; +import org.maltparser.core.options.OptionManager; +import org.maltparser.core.symbol.SymbolTable; +import org.maltparser.core.symbol.parse.ParseSymbolTable; +import org.maltparser.core.syntaxgraph.DependencyStructure; +import org.maltparser.core.syntaxgraph.edge.Edge; +import org.maltparser.core.syntaxgraph.node.TokenNode; +import org.maltparser.parser.SingleMalt; +import org.springframework.beans.PropertyAccessor; +import org.springframework.beans.PropertyAccessorFactory; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Dependency parsing using MaltPaser. + * <p> + * Required annotations: + * </p> + * <ul> + * <li>Token</li> + * <li>Sentence</li> + * <li>POS</li> + * </ul> + * + * Generated annotations: + * <ul> + * <li>Dependency (annotated over sentence-span)</li> + * </ul> + */ +@Component(OperationType.DEPENDENCY_PARSER) +@ResourceMetaData(name = "MaltParser Dependency Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) + +public class MaltParser + extends JCasAnnotator_ImplBase +{ + private static final String UNUSED = "_"; + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Process anyway, even if the model relies on features that are not supported by this + * component. + */ + public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; + @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") + protected boolean ignoreMissingFeatures; + + // Not sure if we'll ever have to use different symbol tables + // public static final String SYMBOL_TABLE = "symbolTableName"; + // @ConfigurationParameter(name = SYMBOL_TABLE, mandatory = true, defaultValue = "DEPREL") + private final String symbolTableName = "DEPREL"; + + private Logger logger; + private SymbolTable symbolTable; + private File workingDir; + + private CasConfigurableProviderBase<MaltParserService> modelProvider; + private Set<String> features; + + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + logger = getContext().getLogger(); + + try { + workingDir = File.createTempFile("maltparser", ".tmp"); + workingDir.delete(); + workingDir.mkdirs(); + workingDir.deleteOnExit(); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + + modelProvider = new ModelProviderBase<MaltParserService>(this, "maltparser", "parser") { + private MaltParserService parser; + + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/maltparser/lib/parser-${language}-${variant}.properties"); + setDefault(VARIANT, "linear"); + } + + @Override + protected MaltParserService produceResource(URL aUrl) throws IOException + { + if (parser != null) { + // Terminates the parser model + try { + parser.terminateParserModel(); + parser = null; + } + catch (MaltChainedException e) { + logger.log(Level.SEVERE, + "MaltParser exception while terminating parser model: " + + e.getMessage()); + } + } + + try { + // Warn if the model uses features that we currently do not support + features = getFeatures(aUrl); + Set<String> unsupportedFeatures = new HashSet<String>(features); + getLogger().info("Model uses these features: " + features); + // we know covered text + unsupportedFeatures.remove("FORM"); + // we know lemma if lemmatizer ran before + unsupportedFeatures.remove("LEMMA"); + // we know POS tag if POS tagger ran before + unsupportedFeatures.remove("POSTAG"); + // CPOSTAG - only supported if we know a mapping from POSTAG to CPOSTAG (FIXME) + // FEATS - not properly supported in DKPro Core yet! (FIXME) + if (!unsupportedFeatures.isEmpty()) { + String message = "Model these uses unsupported features: " + + unsupportedFeatures; + if (ignoreMissingFeatures) { + getLogger().warn(message); + } + else { + throw new IOException(message); + } + } + + // However, Maltparser is not happy at all if the model file does not have the + // right name, so we are forced to create a temporary directory and place the + // file there. + File modelFile = new File(workingDir, getRealName(aUrl)); + if (!modelFile.exists()) { + InputStream is = null; + OutputStream os = null; + try { + is = aUrl.openStream(); + os = new FileOutputStream(modelFile); + IOUtils.copy(is, os); + modelFile.deleteOnExit(); + } + finally { + IOUtils.closeQuietly(is); + IOUtils.closeQuietly(os); + } + } + + // Maltparser has a very odd way of finding out which command line options it + // supports. By manually initializing the OptionManager before Maltparser + // tries it, we can work around Maltparsers' own broken code. + if (OptionManager.instance().getOptionContainerIndices().size() == 0) { + OptionManager.instance().loadOptionDescriptionFile( + MaltParserService.class.getResource("/appdata/options.xml")); + OptionManager.instance().generateMaps(); + } + + // Ok, now we can finally initialize the parser + parser = new MaltParserService(); + parser.initializeParserModel("-w " + workingDir + " -c " + modelFile.getName() + + " -m parse"); + // parser.initializeParserModel("-u " + modelUrl.toString() + " -m parse"); + + + Properties metadata = getResourceMetaData(); + + PropertyAccessor paDirect = PropertyAccessorFactory + .forDirectFieldAccess(parser); + SingleMalt singleMalt = (SingleMalt) paDirect.getPropertyValue("singleMalt"); + + SingletonTagset posTags = new SingletonTagset( + POS.class, metadata.getProperty("pos.tagset")); + ParseSymbolTable posTagTable = (ParseSymbolTable) singleMalt.getSymbolTables() + .getSymbolTable("POSTAG"); + for (int i = 0; i < posTagTable.getValueCounter(); i++) { + posTags.add(posTagTable.getSymbolCodeToString(i)); + } + posTags.remove("#null#"); // Technical symbol introduced in MaltParser 1.8 + addTagset(posTags, false); + + SingletonTagset depTags = new SingletonTagset( + Dependency.class, metadata.getProperty("dependency.tagset")); + ParseSymbolTable depRelTable = (ParseSymbolTable) singleMalt.getSymbolTables() + .getSymbolTable("DEPREL"); + for (int i = 0; i < depRelTable.getValueCounter(); i++) { + depTags.add(depRelTable.getSymbolCodeToString(i)); + } + depTags.remove("#null#"); // Technical symbol introduced in MaltParser 1.8 + addTagset(depTags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return parser; + } + catch (MaltChainedException e) { + logger.log(Level.SEVERE, + "MaltParser exception while initializing parser model: " + + e.getMessage()); + throw new IOException(e); + } + } + }; + } + + /** + * @see AnalysisComponent#collectionProcessComplete() + */ + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + if (workingDir != null && workingDir.isDirectory()) { + FileUtils.deleteQuietly(workingDir); + } + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + modelProvider.configure(aJCas.getCas()); + + // Iterate over all sentences + for (Sentence curSentence : select(aJCas, Sentence.class)) { + + // Generate list of tokens for current sentence + List<Token> tokens = selectCovered(Token.class, curSentence); + + // Generate input format required by parser + String[] parserInput = new String[tokens.size()]; + for (int i = 0; i < parserInput.length; i++) { + Token t = tokens.get(i); + + int id = i + 1; + String form = t.getText(); + String lemma = UNUSED; + String cpostag = UNUSED; + String postag = UNUSED; + String feats = UNUSED; + + if (features.contains("LEMMA")) { + if (t.getLemma() != null) { + lemma = t.getLemma().getValue(); + } + else if (!ignoreMissingFeatures) { + throw new IllegalStateException( + "Model uses feature LEMMA but there is no lemma information in CAS"); + } + } + + // Actually, this cannot work, because we only know about the DKPro Core coarse + // grained categories, which are most likely different from the coarse-grained + // categories required by the model. We would need to include a mapping with the + // model to recover the required coarse grained categories from the fine-grained + // categories in POSTAG. + if (features.contains("CPOSTAG")) { +// if (t.getPos() != null) { +// cpostag = t.getPos().getPosValue(); +// } +// else + if (!ignoreMissingFeatures) { + throw new IllegalStateException( + "Model uses feature CPOSTAG but there is no part-of-speech information in CAS"); + } + } + + if (features.contains("POSTAG")) { + if (t.getPos() != null) { + postag = t.getPos().getPosValue(); + } + else if (!ignoreMissingFeatures) { + throw new IllegalStateException( + "Model uses feature POSTAG but there is no part-of-speech information in CAS"); + } + } + + if (features.contains("FEATS")) { + if (t.getMorph() != null) { + feats = t.getMorph().getValue(); + } + else if (!ignoreMissingFeatures) { + throw new IllegalStateException( + "Model uses feature FEATS but there is no morphology information in CAS"); + } + } + + // This only works for the English model. Other models have different input + // formats. See http://www.maltparser.org/mco/mco.html + parserInput[i] = String.format("%d\t%s\t%s\t%s\t%s\t%s", id, form, lemma, cpostag, + postag, feats); + } + + // Parse sentence + DependencyStructure graph = null; + try { + // Parses the sentence + graph = modelProvider.getResource().parse(parserInput); + symbolTable = graph.getSymbolTables().getSymbolTable(symbolTableName); + } + catch (MaltChainedException e) { + logger.log(Level.WARNING, + "MaltParser exception while parsing sentence: " + e.getMessage(), e); + // don't pass on exception - go on with next sentence + continue; + } + + /* + * Generate annotations: NOTE: Index of token in tokenList corresponds to node in + * DependencyGraph with NodeIndex+1 + */ + try { + // iterate over all tokens in current sentence + for (int i = 0; i < tokens.size(); i++) { + // Start with Node 1 - we omit ROOT-dependencies, + // because we don't have a ROOT-token. + TokenNode curNode = graph.getTokenNode(i + 1); + + // iterate over all dependencies for current token + for (Edge edge : curNode.getHeadEdges()) { + int sourceIdx = edge.getSource().getIndex(); + int targetIdx = edge.getTarget().getIndex(); + + // get corresponding token for node in DependencyGraph + Token sourceToken = sourceIdx > 0 ? tokens.get(sourceIdx - 1) : null; + Token targetToken = targetIdx > 0 ? tokens.get(targetIdx - 1) : null; + + // create dep-annotation for current edge + if (sourceToken != null && targetToken != null) { + Dependency dep = new Dependency(aJCas); + dep.setDependencyType(edge.getLabelSymbol(symbolTable)); + dep.setFlavor(DependencyFlavor.BASIC); + dep.setGovernor(sourceToken); // TODO check if source=Governor + dep.setDependent(targetToken); // TODO check if target=Dependent + dep.setBegin(dep.getDependent().getBegin()); + dep.setEnd(dep.getDependent().getEnd()); + dep.addToIndexes(); + } + else if (targetToken != null && sourceToken == null) { + Dependency dep = new ROOT(aJCas); + // Trying to get the label triggers Exception + dep.setDependencyType("ROOT"); + dep.setFlavor(DependencyFlavor.BASIC); + dep.setGovernor(targetToken); + dep.setDependent(targetToken); + dep.setBegin(dep.getDependent().getBegin()); + dep.setEnd(dep.getDependent().getEnd()); + dep.addToIndexes(); + } + else { + throw new IllegalStateException("Source token must exist."); + } + } + } + } + catch (MaltChainedException e) { + logger.log(Level.WARNING, "MaltParser exception creating dependency annotations: " + + e.getMessage(), e); + // don't pass on exception - go on with next sentence + continue; + } + } + } + + private String getRealName(URL aUrl) throws IOException + { + JarEntry je = null; + JarInputStream jis = null; + + try { + jis = new JarInputStream(aUrl.openConnection().getInputStream()); + while ((je = jis.getNextJarEntry()) != null) { + String entryName = je.getName(); + if (entryName.endsWith(".info")) { + int indexUnderScore = entryName.lastIndexOf('_'); + int indexSeparator = entryName.lastIndexOf(File.separator); + if (indexSeparator == -1) { + indexSeparator = entryName.lastIndexOf('/'); + } + if (indexSeparator == -1) { + indexSeparator = entryName.lastIndexOf('\\'); + } + int indexDot = entryName.lastIndexOf('.'); + if (indexUnderScore == -1 || indexDot == -1) { + throw new IllegalStateException( + "Could not find the configuration name and type from the URL '" + + aUrl.toString() + "'. "); + } + + return entryName.substring(indexSeparator + 1, indexUnderScore) + ".mco"; + } + } + + throw new IllegalStateException( + "Could not find the configuration name and type from the URL '" + + aUrl.toString() + "'. "); + } + finally { + IOUtils.closeQuietly(jis); + } + } + + private Set<String> getFeatures(URL aUrl) throws IOException + { + JarEntry je = null; + JarInputStream jis = null; + + try { + jis = new JarInputStream(aUrl.openConnection().getInputStream()); + while ((je = jis.getNextJarEntry()) != null) { + String entryName = je.getName(); + + if (entryName.endsWith(".info")) { + Set<String> features = new HashSet<String>(); + + for (String line : IOUtils.readLines(jis, "UTF-8")) { + if (line.contains("InputColumn(")) { + int offset = line.indexOf("InputColumn("); + while (offset >= 0) { + int comma = line.indexOf(',', offset + 1); + features.add(line.substring(offset + 12, comma).trim()); + offset = line.indexOf("InputColumn(", comma); + } + } + } + + return features; + } + } + + throw new IllegalStateException( + "Could not find the configuration name and type from the URL '" + + aUrl.toString() + "'. "); + } + finally { + IOUtils.closeQuietly(jis); + } + } +} diff --git a/dkpro-core-maltparser-asl/src/main/java/org/dkpro/core/maltparser/package-info.java b/dkpro-core-maltparser-asl/src/main/java/org/dkpro/core/maltparser/package-info.java new file mode 100644 index 0000000000..b61bb0481f --- /dev/null +++ b/dkpro-core-maltparser-asl/src/main/java/org/dkpro/core/maltparser/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Integration of the <a href="http://www.maltparser.org/">MaltParser</a> dependency parser. + * + * @since 1.4.0 + */ +package org.dkpro.core.maltparser; diff --git a/dkpro-core-maltparser-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/maltparser/MaltParserTest.java b/dkpro-core-maltparser-asl/src/test/java/org/dkpro/core/maltparser/MaltParserTest.java similarity index 97% rename from dkpro-core-maltparser-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/maltparser/MaltParserTest.java rename to dkpro-core-maltparser-asl/src/test/java/org/dkpro/core/maltparser/MaltParserTest.java index 643c6c572f..c80d304f30 100644 --- a/dkpro-core-maltparser-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/maltparser/MaltParserTest.java +++ b/dkpro-core-maltparser-asl/src/test/java/org/dkpro/core/maltparser/MaltParserTest.java @@ -15,10 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.maltparser; +package org.dkpro.core.maltparser; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertDependencies; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; import java.util.ArrayList; import java.util.List; @@ -28,6 +30,10 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.hunpos.HunPosTagger; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Assume; import org.junit.Ignore; import org.junit.Rule; @@ -35,13 +41,7 @@ import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.hunpos.HunPosTagger; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; -/** - */ public class MaltParserTest { // /** @@ -288,8 +288,8 @@ public void testEnglishPoly() assertDependencies(dependencies, JCasUtil.select(jcas, Dependency.class)); // There are some minor differences between the tags produced by the POS tagger and the - // tags expected by the parser model. We need a better test here that makes these differences - // more visible and at the same time doesn't fail. + // tags expected by the parser model. We need a better test here that makes these + // differences more visible and at the same time doesn't fail. //assertTagset(OpenNlpPosTagger.class, POS.class, "ptb", posTags, jcas); assertTagset(MaltParser.class, POS.class, "ptb", posTags, jcas); assertTagsetMapping(MaltParser.class, POS.class, "ptb", unmappedPos, jcas); @@ -730,14 +730,20 @@ public static AnalysisEngineDescription getEngines(String aLanguage, String aVar else if ("fa".equals(aLanguage) || "sv".equals(aLanguage)) { Assume.assumeFalse("HunPos currently hangs indefinitely on Windows: Issue #1099", System.getProperty("os.name").toLowerCase(Locale.US).contains("win")); + Assume.assumeTrue("HunPos does not run on OS X Catalina or higher", + System.getProperty("os.name").toLowerCase(Locale.US).contains("mac") && + !System.getProperty("os.version").matches("10\\.([0-9]|1[0-4]).*")); engines.add(createEngineDescription(HunPosTagger.class)); } else { engines.add(createEngineDescription(OpenNlpPosTagger.class)); } - engines.add(createEngineDescription(MaltParser.class, MaltParser.PARAM_VARIANT, aVariant, - MaltParser.PARAM_PRINT_TAGSET, true, MaltParser.PARAM_IGNORE_MISSING_FEATURES, true)); + engines.add(createEngineDescription( + MaltParser.class, + MaltParser.PARAM_VARIANT, aVariant, + MaltParser.PARAM_PRINT_TAGSET, true, + MaltParser.PARAM_IGNORE_MISSING_FEATURES, true)); return createEngineDescription(engines .toArray(new AnalysisEngineDescription[engines.size()])); diff --git a/dkpro-core-maltparser-asl/src/test/resources/log4j.properties b/dkpro-core-maltparser-asl/src/test/resources/log4j.properties deleted file mode 100644 index c7a889722d..0000000000 --- a/dkpro-core-maltparser-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-maltparser-asl/src/test/resources/log4j2.xml b/dkpro-core-maltparser-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-maltparser-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-matetools-gpl/.license-header.txt b/dkpro-core-matetools-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-matetools-gpl/.license-header.txt +++ b/dkpro-core-matetools-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-matetools-gpl/pom.xml b/dkpro-core-matetools-gpl/pom.xml index 584e88a053..c7407d130d 100644 --- a/dkpro-core-matetools-gpl/pom.xml +++ b/dkpro-core-matetools-gpl/pom.xml @@ -1,6 +1,6 @@ <!-- - Copyright 2007-2017 + Copyright 2007-2019 Ubiquitous Knowledge Processing (UKP) Lab Technische Universität Darmstadt @@ -15,20 +15,21 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-gpl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-gpl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-gpl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.matetools-gpl</artifactId> + <artifactId>dkpro-core-matetools-gpl</artifactId> <packaging>jar</packaging> <name>DKPro Core GPL - Mate-Tools</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -47,7 +48,7 @@ <groupId>com.googlecode.mate-tools</groupId> <artifactId>srl</artifactId> <version>4.31</version> - <!-- + <!-- Excluding this non-essential dependency as it conflicts with the CoreNLP 3.7.0+ when building the aggregate Javadoc --> @@ -63,32 +64,36 @@ </exclusions> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.semantics-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-semantics-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -96,13 +101,13 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.hunpos-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-hunpos-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -134,9 +139,9 @@ <dependencyManagement> <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.hunpos-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-hunpos-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <scope>import</scope> <type>pom</type> </dependency> @@ -377,5 +382,5 @@ </pluginManagement> </build> </profile> - </profiles> + </profiles> </project> \ No newline at end of file diff --git a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateLemmatizer.java b/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateLemmatizer.java deleted file mode 100644 index 1f326c2a82..0000000000 --- a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateLemmatizer.java +++ /dev/null @@ -1,144 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.matetools; - -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.lemmatizer.Lemmatizer; - -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.util.LinkedList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * DKPro Annotator for the MateToolsLemmatizer. - */ -@ResourceMetaData(name="Mate Tools Lemmatizer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" - }, - outputs = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}) -public class MateLemmatizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - private String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = "variant"; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - private String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - private String modelLocation; - - /** - * Try reconstructing proper casing for lemmata. This is useful for German, but e.g. for - * English creates odd results. - */ - public static final String PARAM_UPPERCASE = "uppercase"; - @ConfigurationParameter(name = PARAM_UPPERCASE, mandatory = true, defaultValue="false") - private boolean uppercase; - - private CasConfigurableProviderBase<Lemmatizer> modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<Lemmatizer>(this, "matetools", "lemmatizer") - { - @Override - protected Lemmatizer produceResource(URL aUrl) - throws IOException - { - File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); - - return new Lemmatizer(modelFile.getPath(), uppercase); // create a lemmatizer - } - }; - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - CAS cas = jcas.getCas(); - - modelProvider.configure(cas); - - for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { - List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); - - List<String> forms = new LinkedList<String>(); - forms.add(CONLLReader09.ROOT); - forms.addAll(JCasUtil.toText(tokens)); - - SentenceData09 sd = new SentenceData09(); - sd.init(forms.toArray(new String[0])); - String[] lemmas = modelProvider.getResource().apply(sd).plemmas; - - for (int i = 0; i < lemmas.length; i++) { - Token token = tokens.get(i); - if (lemmas[i] == null) { - lemmas[i] = token.getText(); - } - Lemma lemma = new Lemma(jcas, token.getBegin(), token.getEnd()); - lemma.setValue(lemmas[i]); - lemma.addToIndexes(); - token.setLemma(lemma); - } - } - } -} diff --git a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateMorphTagger.java b/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateMorphTagger.java deleted file mode 100644 index 855f284c4b..0000000000 --- a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateMorphTagger.java +++ /dev/null @@ -1,154 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.matetools; - -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.mtag.Tagger; - -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.util.LinkedList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * DKPro Annotator for the MateToolsMorphTagger. - */ -@ResourceMetaData(name="Mate Tools Morphological Analyzer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" - }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures" - } -) -public class MateMorphTagger - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - private CasConfigurableProviderBase<Tagger> modelProvider; - private MorphologicalFeaturesParser featuresParser; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<Tagger>(this, "matetools", "morphtagger") - { - @Override - protected Tagger produceResource(URL aUrl) - throws IOException - { - File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); - - return new Tagger(modelFile.getPath()); // create a MorphTagger - } - }; - - featuresParser = new MorphologicalFeaturesParser(this, modelProvider); - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - CAS cas = jcas.getCas(); - - modelProvider.configure(cas); - featuresParser.configure(cas); - - try { - for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { - List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); - - List<String> forms = new LinkedList<String>(); - forms.add(CONLLReader09.ROOT); - forms.addAll(JCasUtil.toText(tokens)); - - List<String> lemmas = new LinkedList<String>(); - lemmas.add(CONLLReader09.ROOT_LEMMA); - for (Token token : tokens) { - lemmas.add(token.getLemma().getValue()); - } - - SentenceData09 sd = new SentenceData09(); - sd.init(forms.toArray(new String[0])); - sd.setLemmas(lemmas.toArray(new String[0])); - String[] morphTags = modelProvider.getResource().apply(sd).pfeats; - - for (int i = 1; i < morphTags.length; i++) { - Token token = tokens.get(i-1); - MorphologicalFeatures analysis = featuresParser - .parse(jcas, token, morphTags[i]); - token.setMorph(analysis); - } - } - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } -} diff --git a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateParser.java b/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateParser.java deleted file mode 100644 index b7b020b552..0000000000 --- a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateParser.java +++ /dev/null @@ -1,238 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.matetools; - -import static java.util.Arrays.asList; -import static org.apache.uima.util.Level.INFO; -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.parser.MFO; -import is2.parser.Options; -import is2.parser.Parser; - -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; - -/** - * DKPro Annotator for the MateToolsParser. - * - * <p> - * Please cite the following paper, if you use the parser: Bernd Bohnet. 2010. Top Accuracy and Fast - * Dependency Parsing is not a Contradiction. The 23rd International Conference on Computational - * Linguistics (COLING 2010), Beijing, China. - * </p> - */ -@ResourceMetaData(name="Mate Tools Dependency Parser") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class MateParser - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - /** - * Load the dependency to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) - protected String dependencyMappingLocation; - - - private CasConfigurableProviderBase<Parser> modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<Parser>(this, "matetools", "parser") - { - @Override - protected Parser produceResource(URL aUrl) - throws IOException - { - File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); - - String[] args = { "-model", modelFile.getPath() }; - Options option = new Options(args); - Parser parser = new Parser(option); // create a parser - - Properties metadata = getResourceMetaData(); - - HashMap<String, HashMap<String, Integer>> featureSet = MFO.getFeatureSet(); - SingletonTagset posTags = new SingletonTagset( - POS.class, metadata.getProperty("pos.tagset")); - HashMap<String, Integer> posTagFeatures = featureSet.get("POS"); - posTags.addAll(posTagFeatures.keySet()); - posTags.removeAll(asList("<None>", "<root-POS>")); - addTagset(posTags); - - SingletonTagset depTags = new SingletonTagset( - Dependency.class, metadata.getProperty("dependency.tagset")); - HashMap<String, Integer> depTagFeatures = featureSet.get("REL"); - depTags.addAll(depTagFeatures.keySet()); - depTags.removeAll(asList("<None>", "<no-type>", "<root-type>")); - addTagset(depTags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return parser; - } - }; - - mappingProvider = MappingProviderFactory.createDependencyMappingProvider( - dependencyMappingLocation, language, modelProvider); - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - CAS cas = jcas.getCas(); - - modelProvider.configure(cas); - mappingProvider.configure(cas); - - for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { - List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); - - List<String> forms = new LinkedList<String>(); - forms.add(CONLLReader09.ROOT); - forms.addAll(JCasUtil.toText(tokens)); - - List<String> lemmas = new LinkedList<String>(); - List<String> posTags = new LinkedList<String>(); - lemmas.add(CONLLReader09.ROOT_LEMMA); - posTags.add(CONLLReader09.ROOT_POS); - for (Token token : tokens) { - if (token.getLemma() != null) { - lemmas.add(token.getLemma().getValue()); - } - else { - lemmas.add("_"); - } - posTags.add(token.getPos().getPosValue()); - } - - SentenceData09 sd = new SentenceData09(); - sd.init(forms.toArray(new String[forms.size()])); - sd.setLemmas(lemmas.toArray(new String[lemmas.size()])); - sd.setPPos(posTags.toArray(new String[posTags.size()])); - SentenceData09 parsed = modelProvider.getResource().apply(sd); - - for (int i = 0; i < parsed.labels.length; i++) { - if (parsed.pheads[i] != 0) { - Token sourceToken = tokens.get(parsed.pheads[i] - 1); - Token targetToken = tokens.get(i); - - Type depRel = mappingProvider.getTagType(parsed.plabels[i]); - Dependency dep = (Dependency) cas.createFS(depRel); - dep.setGovernor(sourceToken); - dep.setDependent(targetToken); - dep.setDependencyType(parsed.plabels[i]); - dep.setFlavor(DependencyFlavor.BASIC); - dep.setBegin(dep.getDependent().getBegin()); - dep.setEnd(dep.getDependent().getEnd()); - dep.addToIndexes(); - } - else { - Token rootToken = tokens.get(i); - - Dependency dep = new ROOT(jcas); - dep.setGovernor(rootToken); - dep.setDependent(rootToken); - dep.setDependencyType(parsed.plabels[i]); - dep.setFlavor(DependencyFlavor.BASIC); - dep.setBegin(dep.getDependent().getBegin()); - dep.setEnd(dep.getDependent().getEnd()); - dep.addToIndexes(); - } - } - } - } -} diff --git a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MatePosTagger.java b/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MatePosTagger.java deleted file mode 100644 index 2253837ae3..0000000000 --- a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MatePosTagger.java +++ /dev/null @@ -1,193 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.matetools; - -import static java.util.Arrays.asList; -import static org.apache.uima.util.Level.INFO; -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.tag.Options; -import is2.tag.Tagger; - -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * DKPro Annotator for the MateToolsPosTagger - */ -@ResourceMetaData(name="Mate Tools POS-Tagger") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" - }, - outputs = {"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) -public class MatePosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating the - * mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - private CasConfigurableProviderBase<Tagger> modelProvider; - private MappingProvider posMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<Tagger>(this, "matetools", "tagger") - { - @Override - protected Tagger produceResource(URL aUrl) - throws IOException - { - File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); - - String[] args = { "-model", modelFile.getPath() }; - Options option = new Options(args); - Tagger tagger = new Tagger(option); // create a POSTagger - - HashMap<String, HashMap<String, Integer>> featureSet = tagger.mf.getFeatureSet(); - SingletonTagset posTags = new SingletonTagset(POS.class, getResourceMetaData() - .getProperty("pos.tagset")); - HashMap<String, Integer> posTagFeatures = featureSet.get("POS"); - posTags.addAll(posTagFeatures.keySet()); - posTags.removeAll(asList("<None>", "<root-POS>")); - addTagset(posTags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return tagger; - } - }; - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - CAS cas = jcas.getCas(); - - modelProvider.configure(cas); - posMappingProvider.configure(cas); - - for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { - List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); - - List<String> forms = new LinkedList<String>(); - forms.add(CONLLReader09.ROOT); - forms.addAll(JCasUtil.toText(tokens)); - - List<String> lemmas = new LinkedList<String>(); - lemmas.add(CONLLReader09.ROOT_LEMMA); - for (Token token : tokens) { - if (token.getLemma() != null) { - lemmas.add(token.getLemma().getValue()); - } else { - lemmas.add("_"); - } - } - - SentenceData09 sd = new SentenceData09(); - sd.init(forms.toArray(new String[0])); - sd.setLemmas(lemmas.toArray(new String[0])); - String[] posTags = modelProvider.getResource().apply(sd).ppos; - - for (int i = 1; i < posTags.length; i++) { - Token token = tokens.get(i-1); - Type posType = posMappingProvider.getTagType(posTags[i]); - POS posTag = (POS) cas.createAnnotation(posType, token.getBegin(), token.getEnd()); - posTag.setPosValue(posTags[i].intern()); - POSUtils.assignCoarseValue(posTag); - posTag.addToIndexes(); - token.setPos(posTag); - } - } - } -} diff --git a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateSemanticRoleLabeler.java b/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateSemanticRoleLabeler.java deleted file mode 100644 index c6b3458ad4..0000000000 --- a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateSemanticRoleLabeler.java +++ /dev/null @@ -1,323 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.matetools; - -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.util.Collection; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.regex.Pattern; -import java.util.zip.ZipFile; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.FSCollectionFactory; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.FSArray; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; -import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; -import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import se.lth.cs.srl.SemanticRoleLabeler; -import se.lth.cs.srl.corpus.Predicate; -import se.lth.cs.srl.corpus.Word; -import se.lth.cs.srl.languages.Language; -import se.lth.cs.srl.languages.Language.L; -import se.lth.cs.srl.pipeline.Pipeline; - -/** - * DKPro Annotator for the MateTools Semantic Role Labeler. - *<p> - * Please cite the following paper, if you use the semantic role labeler - * Anders Björkelund, Love Hafdell, and Pierre Nugues. Multilingual semantic role labeling. - * In Proceedings of The Thirteenth Conference on Computational Natural Language Learning (CoNLL-2009), - * pages 43--48, Boulder, June 4--5 2009. - * </p> - */ -@ResourceMetaData(name="Mate Tools Semantic Role Labeler") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", - "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg" }) -public class MateSemanticRoleLabeler - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - private CasConfigurableProviderBase<SemanticRoleLabeler> modelProvider; - - private static final String UNUSED = "_"; - private static final int UNUSED_INT = -1; - private static final Pattern NEWLINE_PATTERN=Pattern.compile("\n"); - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - modelProvider = new ModelProviderBase<SemanticRoleLabeler>(this, "matetools", "srl") - { - @Override - protected SemanticRoleLabeler produceResource(URL aUrl) - throws IOException - { - File modelFile = ResourceUtils.getUrlAsFile(aUrl, false); - try { - ZipFile zipFile = new ZipFile(modelFile); - SemanticRoleLabeler srl = Pipeline.fromZipFile(zipFile); - zipFile.close(); - return srl; - } - catch (Exception e) { - throw new IOException(e); - } - } - }; - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - modelProvider.configure(jcas.getCas()); - SemanticRoleLabeler srl = modelProvider.getResource(); - - //Set the language information for SRL - switch(jcas.getDocumentLanguage()){ - case "de": Language.setLanguage(L.ger); break; - case "en": Language.setLanguage(L.eng); break; - case "zh": Language.setLanguage(L.chi); break; - case "es": Language.setLanguage(L.spa); break; - default: throw new AnalysisEngineProcessException("Language not supported", null); - } - - for(Sentence s : JCasUtil.select(jcas, Sentence.class)) { - String conll2009String = convert(jcas, s); - se.lth.cs.srl.corpus.Sentence sen = se.lth.cs.srl.corpus.Sentence.newDepsOnlySentence(NEWLINE_PATTERN.split(conll2009String)); - - srl.parseSentence(sen); - - List<Predicate> preds = sen.getPredicates(); - List<Token> tokens = JCasUtil.selectCovered(Token.class, s); - - - for(Predicate pred : preds) { - //Add the predicates - Token predToken = tokens.get(pred.getIdx()-1); - SemPred semanticPredicate = new SemPred(jcas, predToken.getBegin(), predToken.getEnd()); - semanticPredicate.setCategory(pred.getSense()); - semanticPredicate.addToIndexes(); - - //Add the arguments - Map<Word, String> argmap = pred.getArgMap(); - List<SemArgLink> arguments = new LinkedList<>(); - for(Map.Entry<Word, String> entry : argmap.entrySet()) { - Token argumentToken = tokens.get(entry.getKey().getIdx()-1); - - SemArg arg = new SemArg(jcas, argumentToken.getBegin(), argumentToken.getEnd()); - arg.addToIndexes(); - - SemArgLink link = new SemArgLink(jcas); - link.setRole(pred.getArgumentTag(entry.getKey())); - link.setTarget(arg); - - arguments.add(link); - } - - //Add the arguments to the predicate - semanticPredicate.setArguments( - FSCollectionFactory.createFSArray(jcas, arguments)); - } - } - } - - private String convert(JCas aJCas, Sentence sentence) - { - Map<Token, Collection<SemPred>> predIdx = indexCovered(aJCas, Token.class, SemPred.class); - Map<SemArg, Collection<Token>> argIdx = indexCovered(aJCas, SemArg.class, Token.class); - HashMap<Token, Row> ctokens = new LinkedHashMap<Token, Row>(); - - StringBuilder conll2009String = new StringBuilder(); - - // Tokens - List<Token> tokens = selectCovered(Token.class, sentence); - - // Check if we should try to include the FEATS in output - List<Morpheme> morphology = selectCovered(Morpheme.class, sentence); - boolean useFeats = tokens.size() == morphology.size(); - - int tokenSize = tokens.size(); - int morhSize = morphology.size(); - - List<SemPred> preds = selectCovered(SemPred.class, sentence); - - for (int i = 0; i < tokens.size(); i++) { - Row row = new Row(); - row.id = i+1; - row.token = tokens.get(i); - row.args = new SemArgLink[preds.size()]; - if (useFeats) { - row.feats = morphology.get(i); - } - - // If there are multiple semantic predicates for the current token, then - // we keep only the first - Collection<SemPred> predsForToken = predIdx.get(row.token); - if (predsForToken != null && !predsForToken.isEmpty()) { - row.pred = predsForToken.iterator().next(); - } - ctokens.put(row.token, row); - } - - // Dependencies - for (Dependency rel : selectCovered(Dependency.class, sentence)) { - ctokens.get(rel.getDependent()).deprel = rel; - } - - // Semantic arguments - for (int p = 0; p < preds.size(); p++) { - FSArray args = preds.get(p).getArguments(); - for (SemArgLink link : select(args, SemArgLink.class)) { - for (Token t : argIdx.get(link.getTarget())) { - Row row = ctokens.get(t); - row.args[p] = link; - } - } - } - - // Write sentence in CONLL 2009 format - for (Row row : ctokens.values()) { - int id = row.id; - - String form = row.token.getText(); - - String lemma = UNUSED; - if (row.token.getLemma() != null) { - lemma = row.token.getLemma().getValue(); - } - String plemma = lemma; - - String pos = UNUSED; - if (row.token.getPos() != null) { - POS posAnno = row.token.getPos(); - pos = posAnno.getPosValue(); - } - String ppos = pos; - - String feat = UNUSED; - if (row.feats != null) { - feat = row.feats.getMorphTag(); - } - String pfeat = feat; - - int headId = UNUSED_INT; - String deprel = UNUSED; - if (row.deprel != null) { - deprel = row.deprel.getDependencyType(); - headId = ctokens.get(row.deprel.getGovernor()).id; - if (headId == row.id) { - // ROOT dependencies may be modeled as a loop, ignore these. - headId = 0; - } - } else { - headId = 0; //Mate SRL expects the head to have id = 0 - } - - String head = UNUSED; - if (headId != UNUSED_INT) { - head = Integer.toString(headId); - } - - String phead = head; - String pdeprel = deprel; - - String fillpred = UNUSED; - String pred = UNUSED; - StringBuilder apreds = new StringBuilder(); - - conll2009String.append( - String.format("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id, form, - lemma, plemma, pos, ppos, feat, pfeat, head, phead, deprel, pdeprel, fillpred, pred, apreds) - ); - } - - return conll2009String.toString(); - } - - private static final class Row { - int id; - Token token; - Morpheme feats; - Dependency deprel; - SemPred pred; - SemArgLink[] args; // These are the arguments roles for the current token! - } - -} diff --git a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/package-info.java b/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/package-info.java deleted file mode 100644 index 77a33732e2..0000000000 --- a/dkpro-core-matetools-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/matetools/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -/** - * Integration of NLP components from the <a href="http://code.google.com/p/mate-tools/"> - * mate-tools suite</a>. - * - * @since 1.5.0 - */ -package de.tudarmstadt.ukp.dkpro.core.matetools; diff --git a/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateLemmatizer.java b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateLemmatizer.java new file mode 100644 index 0000000000..1eb7a19d3d --- /dev/null +++ b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateLemmatizer.java @@ -0,0 +1,168 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.matetools; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.LinkedList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.lemmatizer.Lemmatizer; + +/** + * DKPro Core Annotator for the MateToolsLemmatizer. + */ +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "Mate Tools Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + }, + outputs = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}) +public class MateLemmatizer + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = "variant"; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + private String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + private String modelLocation; + + /** + * Try reconstructing proper casing for lemmata. This is useful for German, but e.g. for + * English creates odd results. + */ + public static final String PARAM_UPPERCASE = "uppercase"; + @ConfigurationParameter(name = PARAM_UPPERCASE, mandatory = true, defaultValue = "false") + private boolean uppercase; + + private CasConfigurableProviderBase<Lemmatizer> modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<Lemmatizer>(this, "matetools", "lemmatizer") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/matetools/lib/lemmatizer-${language}-${variant}.properties"); + } + + @Override + protected Lemmatizer produceResource(URL aUrl) + throws IOException + { + File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); + + return new Lemmatizer(modelFile.getPath(), uppercase); // create a lemmatizer + } + }; + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + CAS cas = jcas.getCas(); + + modelProvider.configure(cas); + + for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { + List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); + + List<String> forms = new LinkedList<String>(); + forms.add(CONLLReader09.ROOT); + forms.addAll(JCasUtil.toText(tokens)); + + SentenceData09 sd = new SentenceData09(); + sd.init(forms.toArray(new String[0])); + String[] lemmas = modelProvider.getResource().apply(sd).plemmas; + + for (int i = 0; i < lemmas.length; i++) { + Token token = tokens.get(i); + if (lemmas[i] == null) { + lemmas[i] = token.getText(); + } + Lemma lemma = new Lemma(jcas, token.getBegin(), token.getEnd()); + lemma.setValue(lemmas[i]); + lemma.addToIndexes(); + token.setLemma(lemma); + } + } + } +} diff --git a/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateMorphTagger.java b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateMorphTagger.java new file mode 100644 index 0000000000..1bd745e2e1 --- /dev/null +++ b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateMorphTagger.java @@ -0,0 +1,178 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.matetools; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.LinkedList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.mtag.Tagger; + +/** + * DKPro Core Annotator for the MateToolsMorphTagger. + */ +@Component(OperationType.MORPHOLOGICAL_TAGGER) +@ResourceMetaData(name = "Mate Tools Morphological Analyzer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" + }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures" + } +) +public class MateMorphTagger + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + private CasConfigurableProviderBase<Tagger> modelProvider; + private MorphologicalFeaturesParser featuresParser; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<Tagger>(this, "matetools", "morphtagger") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/matetools/lib/morphtagger-${language}-${variant}.properties"); + } + + @Override + protected Tagger produceResource(URL aUrl) + throws IOException + { + File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); + + return new Tagger(modelFile.getPath()); // create a MorphTagger + } + }; + + featuresParser = new MorphologicalFeaturesParser(this, modelProvider); + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + CAS cas = jcas.getCas(); + + modelProvider.configure(cas); + featuresParser.configure(cas); + + try { + for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { + List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); + + List<String> forms = new LinkedList<String>(); + forms.add(CONLLReader09.ROOT); + forms.addAll(JCasUtil.toText(tokens)); + + List<String> lemmas = new LinkedList<String>(); + lemmas.add(CONLLReader09.ROOT_LEMMA); + for (Token token : tokens) { + lemmas.add(token.getLemma().getValue()); + } + + SentenceData09 sd = new SentenceData09(); + sd.init(forms.toArray(new String[0])); + sd.setLemmas(lemmas.toArray(new String[0])); + String[] morphTags = modelProvider.getResource().apply(sd).pfeats; + + for (int i = 1; i < morphTags.length; i++) { + Token token = tokens.get(i - 1); + MorphologicalFeatures analysis = featuresParser + .parse(jcas, token, morphTags[i]); + token.setMorph(analysis); + } + } + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateParser.java b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateParser.java new file mode 100644 index 0000000000..42b1564740 --- /dev/null +++ b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateParser.java @@ -0,0 +1,270 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.matetools; + +import static java.util.Arrays.asList; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createDependencyMappingProvider; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.parser.MFO; +import is2.parser.Options; +import is2.parser.Parser; + +/** + * DKPro Annotator for the MateToolsParser. + * + * <p> + * Please cite the following paper, if you use the parser: Bernd Bohnet. 2010. Top Accuracy and Fast + * Dependency Parsing is not a Contradiction. The 23rd International Conference on Computational + * Linguistics (COLING 2010), Beijing, China. + * </p> + */ +@Component(OperationType.DEPENDENCY_PARSER) +@ResourceMetaData(name = "Mate Tools Dependency Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) +public class MateParser + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the dependency to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = + ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) + protected String dependencyMappingLocation; + + + private CasConfigurableProviderBase<Parser> modelProvider; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<Parser>(this, "matetools", "parser") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/matetools/lib/parser-${language}-${variant}.properties"); + } + + @Override + protected Parser produceResource(URL aUrl) + throws IOException + { + File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); + + String[] args = { "-model", modelFile.getPath() }; + Options option = new Options(args); + Parser parser = new Parser(option); // create a parser + + Properties metadata = getResourceMetaData(); + + HashMap<String, HashMap<String, Integer>> featureSet = MFO.getFeatureSet(); + SingletonTagset posTags = new SingletonTagset( + POS.class, metadata.getProperty("pos.tagset")); + HashMap<String, Integer> posTagFeatures = featureSet.get("POS"); + posTags.addAll(posTagFeatures.keySet()); + posTags.removeAll(asList("<None>", "<root-POS>")); + addTagset(posTags); + + SingletonTagset depTags = new SingletonTagset( + Dependency.class, metadata.getProperty("dependency.tagset")); + HashMap<String, Integer> depTagFeatures = featureSet.get("REL"); + depTags.addAll(depTagFeatures.keySet()); + depTags.removeAll(asList("<None>", "<no-type>", "<root-type>")); + addTagset(depTags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return parser; + } + }; + + mappingProvider = createDependencyMappingProvider(this, dependencyMappingLocation, language, + modelProvider); + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + CAS cas = jcas.getCas(); + + modelProvider.configure(cas); + mappingProvider.configure(cas); + + for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { + List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); + + List<String> forms = new LinkedList<String>(); + forms.add(CONLLReader09.ROOT); + forms.addAll(JCasUtil.toText(tokens)); + + List<String> lemmas = new LinkedList<String>(); + List<String> posTags = new LinkedList<String>(); + lemmas.add(CONLLReader09.ROOT_LEMMA); + posTags.add(CONLLReader09.ROOT_POS); + for (Token token : tokens) { + if (token.getLemma() != null) { + lemmas.add(token.getLemma().getValue()); + } + else { + lemmas.add("_"); + } + posTags.add(token.getPos().getPosValue()); + } + + SentenceData09 sd = new SentenceData09(); + sd.init(forms.toArray(new String[forms.size()])); + sd.setLemmas(lemmas.toArray(new String[lemmas.size()])); + sd.setPPos(posTags.toArray(new String[posTags.size()])); + SentenceData09 parsed = modelProvider.getResource().apply(sd); + + for (int i = 0; i < parsed.labels.length; i++) { + if (parsed.pheads[i] != 0) { + Token sourceToken = tokens.get(parsed.pheads[i] - 1); + Token targetToken = tokens.get(i); + + Type depRel = mappingProvider.getTagType(parsed.plabels[i]); + Dependency dep = (Dependency) cas.createFS(depRel); + dep.setGovernor(sourceToken); + dep.setDependent(targetToken); + dep.setDependencyType(parsed.plabels[i]); + dep.setFlavor(DependencyFlavor.BASIC); + dep.setBegin(dep.getDependent().getBegin()); + dep.setEnd(dep.getDependent().getEnd()); + dep.addToIndexes(); + } + else { + Token rootToken = tokens.get(i); + + Dependency dep = new ROOT(jcas); + dep.setGovernor(rootToken); + dep.setDependent(rootToken); + dep.setDependencyType(parsed.plabels[i]); + dep.setFlavor(DependencyFlavor.BASIC); + dep.setBegin(dep.getDependent().getBegin()); + dep.setEnd(dep.getDependent().getEnd()); + dep.addToIndexes(); + } + } + } + } +} diff --git a/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MatePosTagger.java b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MatePosTagger.java new file mode 100644 index 0000000000..a9bc5b40b3 --- /dev/null +++ b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MatePosTagger.java @@ -0,0 +1,224 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.matetools; + +import static java.util.Arrays.asList; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.tag.Options; +import is2.tag.Tagger; + +/** + * DKPro Annotator for the MateToolsPosTagger + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "Mate Tools POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" + }, + outputs = {"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) +public class MatePosTagger + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating the + * mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + private CasConfigurableProviderBase<Tagger> modelProvider; + private MappingProvider posMappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<Tagger>(this, "matetools", "tagger") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/matetools/lib/tagger-${language}-${variant}.properties"); + } + + @Override + protected Tagger produceResource(URL aUrl) + throws IOException + { + File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); + + String[] args = { "-model", modelFile.getPath() }; + Options option = new Options(args); + Tagger tagger = new Tagger(option); // create a POSTagger + + HashMap<String, HashMap<String, Integer>> featureSet = tagger.mf.getFeatureSet(); + SingletonTagset posTags = new SingletonTagset(POS.class, getResourceMetaData() + .getProperty("pos.tagset")); + HashMap<String, Integer> posTagFeatures = featureSet.get("POS"); + posTags.addAll(posTagFeatures.keySet()); + posTags.removeAll(asList("<None>", "<root-POS>")); + addTagset(posTags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return tagger; + } + }; + + posMappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + CAS cas = jcas.getCas(); + + modelProvider.configure(cas); + posMappingProvider.configure(cas); + + for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { + List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); + + List<String> forms = new LinkedList<String>(); + forms.add(CONLLReader09.ROOT); + forms.addAll(JCasUtil.toText(tokens)); + + List<String> lemmas = new LinkedList<String>(); + lemmas.add(CONLLReader09.ROOT_LEMMA); + for (Token token : tokens) { + if (token.getLemma() != null) { + lemmas.add(token.getLemma().getValue()); + } else { + lemmas.add("_"); + } + } + + SentenceData09 sd = new SentenceData09(); + sd.init(forms.toArray(new String[0])); + sd.setLemmas(lemmas.toArray(new String[0])); + String[] posTags = modelProvider.getResource().apply(sd).ppos; + + for (int i = 1; i < posTags.length; i++) { + Token token = tokens.get(i - 1); + Type posType = posMappingProvider.getTagType(posTags[i]); + POS posTag = (POS) cas.createAnnotation(posType, token.getBegin(), token.getEnd()); + posTag.setPosValue(posTags[i] != null ? posTags[i].intern() : null); + POSUtils.assignCoarseValue(posTag); + posTag.addToIndexes(); + token.setPos(posTag); + } + } + } +} diff --git a/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateSemanticRoleLabeler.java b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateSemanticRoleLabeler.java new file mode 100644 index 0000000000..81f77b61f8 --- /dev/null +++ b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/MateSemanticRoleLabeler.java @@ -0,0 +1,358 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.matetools; + +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.Collection; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.zip.ZipFile; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.FSCollectionFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; +import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; +import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import se.lth.cs.srl.SemanticRoleLabeler; +import se.lth.cs.srl.corpus.Predicate; +import se.lth.cs.srl.corpus.Word; +import se.lth.cs.srl.languages.Language; +import se.lth.cs.srl.languages.Language.L; +import se.lth.cs.srl.pipeline.Pipeline; + +/** + * Annotator for the MateTools Semantic Role Labeler. + * <p> + * Please cite the following paper, if you use the semantic role labeler Anders Björkelund, Love + * Hafdell, and Pierre Nugues. Multilingual semantic role labeling. In Proceedings of The Thirteenth + * Conference on Computational Natural Language Learning (CoNLL-2009), pages 43--48, Boulder, June + * 4--5 2009. + * </p> + */ +@Component(OperationType.ANNOTATOR_OF_SEMANTIC_ROLE_LABELS) +@ResourceMetaData(name = "Mate Tools Semantic Role Labeler") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", + "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg" }) +public class MateSemanticRoleLabeler + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + private CasConfigurableProviderBase<SemanticRoleLabeler> modelProvider; + + private static final String UNUSED = "_"; + private static final int UNUSED_INT = -1; + private static final Pattern NEWLINE_PATTERN = Pattern.compile("\n"); + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + modelProvider = new ModelProviderBase<SemanticRoleLabeler>(this, "matetools", "srl") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/matetools/lib/srl-${language}-${variant}.properties"); + } + + @Override + protected SemanticRoleLabeler produceResource(URL aUrl) + throws IOException + { + File modelFile = ResourceUtils.getUrlAsFile(aUrl, false); + try { + ZipFile zipFile = new ZipFile(modelFile); + SemanticRoleLabeler srl = Pipeline.fromZipFile(zipFile); + zipFile.close(); + return srl; + } + catch (Exception e) { + throw new IOException(e); + } + } + }; + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + modelProvider.configure(jcas.getCas()); + SemanticRoleLabeler srl = modelProvider.getResource(); + + // Set the language information for SRL + switch (jcas.getDocumentLanguage()) { + case "de": + Language.setLanguage(L.ger); + break; + case "en": + Language.setLanguage(L.eng); + break; + case "zh": + Language.setLanguage(L.chi); + break; + case "es": + Language.setLanguage(L.spa); + break; + default: + throw new AnalysisEngineProcessException("Language not supported", null); + } + + for (Sentence s : JCasUtil.select(jcas, Sentence.class)) { + String conll2009String = convert(jcas, s); + se.lth.cs.srl.corpus.Sentence sen = se.lth.cs.srl.corpus.Sentence + .newDepsOnlySentence(NEWLINE_PATTERN.split(conll2009String)); + + srl.parseSentence(sen); + + List<Predicate> preds = sen.getPredicates(); + List<Token> tokens = JCasUtil.selectCovered(Token.class, s); + + + for (Predicate pred : preds) { + //Add the predicates + Token predToken = tokens.get(pred.getIdx() - 1); + SemPred semanticPredicate = new SemPred(jcas, predToken.getBegin(), + predToken.getEnd()); + semanticPredicate.setCategory(pred.getSense()); + semanticPredicate.addToIndexes(); + + //Add the arguments + Map<Word, String> argmap = pred.getArgMap(); + List<SemArgLink> arguments = new LinkedList<>(); + for (Map.Entry<Word, String> entry : argmap.entrySet()) { + Token argumentToken = tokens.get(entry.getKey().getIdx() - 1); + + SemArg arg = new SemArg(jcas, argumentToken.getBegin(), argumentToken.getEnd()); + arg.addToIndexes(); + + SemArgLink link = new SemArgLink(jcas); + link.setRole(pred.getArgumentTag(entry.getKey())); + link.setTarget(arg); + + arguments.add(link); + } + + //Add the arguments to the predicate + semanticPredicate.setArguments( + FSCollectionFactory.createFSArray(jcas, arguments)); + } + } + } + + private String convert(JCas aJCas, Sentence sentence) + { + Map<Token, List<SemPred>> predIdx = indexCovered(aJCas, Token.class, SemPred.class); + Map<SemArg, List<Token>> argIdx = indexCovered(aJCas, SemArg.class, Token.class); + HashMap<Token, Row> ctokens = new LinkedHashMap<Token, Row>(); + + StringBuilder conll2009String = new StringBuilder(); + + // Tokens + List<Token> tokens = selectCovered(Token.class, sentence); + + // Check if we should try to include the FEATS in output + List<Morpheme> morphology = selectCovered(Morpheme.class, sentence); + boolean useFeats = tokens.size() == morphology.size(); + + int tokenSize = tokens.size(); + int morhSize = morphology.size(); + + List<SemPred> preds = selectCovered(SemPred.class, sentence); + + for (int i = 0; i < tokens.size(); i++) { + Row row = new Row(); + row.id = i + 1; + row.token = tokens.get(i); + row.args = new SemArgLink[preds.size()]; + if (useFeats) { + row.feats = morphology.get(i); + } + + // If there are multiple semantic predicates for the current token, then + // we keep only the first + Collection<SemPred> predsForToken = predIdx.get(row.token); + if (predsForToken != null && !predsForToken.isEmpty()) { + row.pred = predsForToken.iterator().next(); + } + ctokens.put(row.token, row); + } + + // Dependencies + for (Dependency rel : selectCovered(Dependency.class, sentence)) { + ctokens.get(rel.getDependent()).deprel = rel; + } + + // Semantic arguments + for (int p = 0; p < preds.size(); p++) { + FSArray args = preds.get(p).getArguments(); + for (SemArgLink link : select(args, SemArgLink.class)) { + for (Token t : argIdx.get(link.getTarget())) { + Row row = ctokens.get(t); + row.args[p] = link; + } + } + } + + // Write sentence in CONLL 2009 format + for (Row row : ctokens.values()) { + int id = row.id; + + String form = row.token.getText(); + + String lemma = UNUSED; + if (row.token.getLemma() != null) { + lemma = row.token.getLemma().getValue(); + } + String plemma = lemma; + + String pos = UNUSED; + if (row.token.getPos() != null) { + POS posAnno = row.token.getPos(); + pos = posAnno.getPosValue(); + } + String ppos = pos; + + String feat = UNUSED; + if (row.feats != null) { + feat = row.feats.getMorphTag(); + } + String pfeat = feat; + + int headId = UNUSED_INT; + String deprel = UNUSED; + if (row.deprel != null) { + deprel = row.deprel.getDependencyType(); + headId = ctokens.get(row.deprel.getGovernor()).id; + if (headId == row.id) { + // ROOT dependencies may be modeled as a loop, ignore these. + headId = 0; + } + } else { + headId = 0; //Mate SRL expects the head to have id = 0 + } + + String head = UNUSED; + if (headId != UNUSED_INT) { + head = Integer.toString(headId); + } + + String phead = head; + String pdeprel = deprel; + + String fillpred = UNUSED; + String pred = UNUSED; + StringBuilder apreds = new StringBuilder(); + + conll2009String.append( + String.format("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + id, form, lemma, plemma, pos, ppos, feat, pfeat, head, phead, deprel, + pdeprel, fillpred, pred, apreds)); + } + + return conll2009String.toString(); + } + + private static final class Row { + int id; + Token token; + Morpheme feats; + Dependency deprel; + SemPred pred; + SemArgLink[] args; // These are the arguments roles for the current token! + } + +} diff --git a/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/package-info.java b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/package-info.java new file mode 100644 index 0000000000..04aa1352f0 --- /dev/null +++ b/dkpro-core-matetools-gpl/src/main/java/org/dkpro/core/matetools/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +/** + * Integration of NLP components from the <a href="http://code.google.com/p/mate-tools/"> + * mate-tools suite</a>. + * + * @since 1.5.0 + */ +package org.dkpro.core.matetools; diff --git a/dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/lemmatizer-default-variants.map b/dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/lemmatizer-default-variants.map similarity index 100% rename from dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/lemmatizer-default-variants.map rename to dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/lemmatizer-default-variants.map diff --git a/dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/morphtagger-default-variants.map b/dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/morphtagger-default-variants.map similarity index 100% rename from dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/morphtagger-default-variants.map rename to dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/morphtagger-default-variants.map diff --git a/dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/parser-default-variants.map b/dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/parser-default-variants.map similarity index 100% rename from dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/parser-default-variants.map rename to dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/parser-default-variants.map diff --git a/dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/srl-default-variants.map b/dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/srl-default-variants.map similarity index 100% rename from dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/srl-default-variants.map rename to dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/srl-default-variants.map diff --git a/dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/tagger-default-variants.map b/dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/tagger-default-variants.map similarity index 100% rename from dkpro-core-matetools-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/matetools/lib/tagger-default-variants.map rename to dkpro-core-matetools-gpl/src/main/resources/org/dkpro/core/matetools/lib/tagger-default-variants.map diff --git a/dkpro-core-matetools-gpl/src/scripts/build.xml b/dkpro-core-matetools-gpl/src/scripts/build.xml index 246e470cc1..92b560699b 100644 --- a/dkpro-core-matetools-gpl/src/scripts/build.xml +++ b/dkpro-core-matetools-gpl/src/scripts/build.xml @@ -1,6 +1,6 @@ <!-- - Copyright 2007-2017 + Copyright 2007-2019 Ubiquitous Knowledge Processing (UKP) Lab Technische Universität Darmstadt @@ -15,7 +15,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. --> <project basedir="../.." default="separate-jars"> diff --git a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateLemmatizerTest.java b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateLemmatizerTest.java similarity index 89% rename from dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateLemmatizerTest.java rename to dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateLemmatizerTest.java index 426c58cbbf..754d5a6716 100644 --- a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateLemmatizerTest.java +++ b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateLemmatizerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,23 +14,25 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.matetools; +package org.dkpro.core.matetools; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.matetools.MateLemmatizer; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class MateLemmatizerTest { diff --git a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateMorphTaggerTest.java b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateMorphTaggerTest.java similarity index 94% rename from dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateMorphTaggerTest.java rename to dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateMorphTaggerTest.java index 2c06f68e93..81b1e1eea6 100644 --- a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateMorphTaggerTest.java +++ b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateMorphTaggerTest.java @@ -1,155 +1,160 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.matetools; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class MateMorphTaggerTest -{ - @Test - public void testGerman() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 1000000000); - - JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches " - + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); - - String[] morphTagsExpected = { - "[ 0, 3] - - - - - - - - - - - - - - - - - Wir (case=nom|number=pl|gender=*|person=1)", - "[ 4, 12] - - - - - - - - - - - - - - - - - brauchen (number=pl|person=1|tense=pres|mood=ind)", - "[ 13, 16] - - - - - - - - - - - - - - - - - ein (case=acc|number=sg|gender=neut)", - "[ 17, 21] - - - - - - - - - - - - - - - - - sehr (_)", - "[ 22, 35] - - - - - - - - - - - - - - - - - kompliziertes (case=acc|number=sg|gender=neut|degree=pos)", - "[ 36, 44] - - - - - - - - - - - - - - - - - Beispiel (case=acc|number=sg|gender=neut)", - "[ 45, 46] - - - - - - - - - - - - - - - - - , (_)", - "[ 47, 54] - - - - - - - - - - - - - - - - - welches (case=acc|number=sg|gender=neut)", - "[ 55, 64] - - - - - - - - - - - - - - - - - möglichst (_)", - "[ 65, 70] - - - - - - - - - - - - - - - - - viele (case=acc|number=pl|gender=*)", - "[ 71, 84] - - - - - - - - - - - - - - - - - Konstituenten (case=acc|number=pl|gender=*)", - "[ 85, 88] - - - - - - - - - - - - - - - - - und (_)", - "[ 89,100] - - - - - - - - - - - - - - - - - Dependenzen (case=acc|number=pl|gender=fem)", - "[101,111] - - - - - - - - - - - - - - - - - beinhaltet (number=sg|person=3|tense=pres|mood=ind)", - "[112,113] - - - - - - - - - - - - - - - - - . (_)" }; - - AssertAnnotations.assertMorph(morphTagsExpected, select(jcas, MorphologicalFeatures.class)); - } - - @Test - public void testFrench() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 1000000000); - - JCas jcas = runTest("fr", "Nous avons besoin d'une phrase par exemple très " - + "compliqué, qui contient des constituants que de nombreuses dépendances et que " - + "possible ."); - - String[] morphTagsExpected = { - "[ 0, 4] - - - - - - - - - - - - - - - - - Nous (g=m|n=p|p=1|s=suj)", - "[ 5, 10] - - - - - - - - - - - - - - - - - avons (m=ind|n=p|p=1|t=pst)", - "[ 11, 17] - - - - - - - - - - - - - - - - - besoin (g=m|n=s|s=c)", - "[ 18, 23] - - - - - - - - - - - - - - - - - d'une (_)", - "[ 24, 30] - - - - - - - - - - - - - - - - - phrase (g=f|n=s|s=c)", - "[ 31, 34] - - - - - - - - - - - - - - - - - par (_)", - "[ 35, 42] - - - - - - - - - - - - - - - - - exemple (g=m|n=s|s=c)", - "[ 43, 47] - - - - - - - - - - - - - - - - - très (_)", - "[ 48, 58] - - - - - - - - - - - - - - - - - compliqué, (g=m|n=s|s=qual)", - "[ 59, 62] - - - - - - - - - - - - - - - - - qui (g=m|n=p|p=3|s=rel)", - "[ 63, 71] - - - - - - - - - - - - - - - - - contient (m=ind|n=s|p=3|t=pst)", - "[ 72, 75] - - - - - - - - - - - - - - - - - des (g=m|n=p|s=ind)", - "[ 76, 88] - - - - - - - - - - - - - - - - - constituants (g=m|n=p|s=c)", - "[ 89, 92] - - - - - - - - - - - - - - - - - que (g=m|n=p|p=3|s=rel)", - "[ 93, 95] - - - - - - - - - - - - - - - - - de (g=f|n=p|s=ind)", - "[ 96,106] - - - - - - - - - - - - - - - - - nombreuses (g=f|n=p|s=qual)", - "[107,118] - - - - - - - - - - - - - - - - - dépendances (g=f|n=p|s=c)", - "[119,121] - - - - - - - - - - - - - - - - - et (s=c)", - "[122,125] - - - - - - - - - - - - - - - - - que (s=s)", - "[126,134] - - - - - - - - - - - - - - - - - possible (g=m|n=s|s=qual)", - "[135,136] - - - - - - - - - - - - - - - - - . (s=s)" }; - - AssertAnnotations.assertMorph(morphTagsExpected, select(jcas, MorphologicalFeatures.class)); - } - - @Test - public void testSpanish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 1000000000); - - JCas jcas = runTest("es", "Necesitamos una oración de ejemplo muy complicado , que " - + "contiene la mayor cantidad de componentes y dependencias como sea posible ."); - - String[] morphTagsExpected = { - "[ 0, 11] - - - - - - - - - - - - - - - - - Necesitamos (postype=main|gen=c|num=p|person=1|mood=indicative|tense=present)", - "[ 12, 15] - - - - - - - - - - - - - - - - - una (postype=indefinite|gen=f|num=s)", - "[ 16, 23] - - - - - - - - - - - - - - - - - oración (postype=common|gen=f|num=s)", - "[ 24, 26] - - - - - - - - - - - - - - - - - de (postype=preposition|gen=c|num=c)", - "[ 27, 34] - - - - - - - - - - - - - - - - - ejemplo (postype=common|gen=m|num=s)", - "[ 35, 38] - - - - - - - - - - - - - - - - - muy (_)", - "[ 39, 49] - - - - - - - - - - - - - - - - - complicado (postype=qualificative|gen=m|num=s|posfunction=participle)", - "[ 50, 51] - - - - - - - - - - - - - - - - - , (punct=comma)", - "[ 52, 55] - - - - - - - - - - - - - - - - - que (postype=relative|gen=c|num=c)", - "[ 56, 64] - - - - - - - - - - - - - - - - - contiene (postype=main|gen=c|num=s|person=3|mood=indicative|tense=present)", - "[ 65, 67] - - - - - - - - - - - - - - - - - la (postype=article|gen=f|num=s)", - "[ 68, 73] - - - - - - - - - - - - - - - - - mayor (postype=qualificative|gen=c|num=s)", - "[ 74, 82] - - - - - - - - - - - - - - - - - cantidad (postype=common|gen=f|num=s)", - "[ 83, 85] - - - - - - - - - - - - - - - - - de (postype=preposition|gen=c|num=c)", - "[ 86, 97] - - - - - - - - - - - - - - - - - componentes (postype=common|gen=m|num=p)", - "[ 98, 99] - - - - - - - - - - - - - - - - - y (postype=coordinating)", - "[100,112] - - - - - - - - - - - - - - - - - dependencias (postype=common|gen=f|num=p)", - "[113,117] - - - - - - - - - - - - - - - - - como (postype=subordinating)", - "[118,121] - - - - - - - - - - - - - - - - - sea (postype=semiauxiliary|gen=c|num=s|person=3|mood=subjunctive|tense=present)", - "[122,129] - - - - - - - - - - - - - - - - - posible (postype=qualificative|gen=c|num=s)", - "[130,131] - - - - - - - - - - - - - - - - - . (punct=period)"}; - - AssertAnnotations.assertMorph(morphTagsExpected, select(jcas, MorphologicalFeatures.class)); - } - - private JCas runTest(String aLanguage, String aText) - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 2000000000); - - AssumeResource.assumeResource(MateMorphTagger.class, "morphtagger", aLanguage, null); - - AnalysisEngineDescription lemma = createEngineDescription(MateLemmatizer.class); - AnalysisEngineDescription morphTag = createEngineDescription(MateMorphTagger.class); - - AnalysisEngineDescription aggregate = createEngineDescription(lemma, morphTag); - - return TestRunner.runTest(aggregate, aLanguage, aText); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.matetools; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.matetools.MateLemmatizer; +import org.dkpro.core.matetools.MateMorphTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Assume; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; + +public class MateMorphTaggerTest +{ + @Test + public void testGerman() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 1000000000); + + JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches " + + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); + + String[] morphTagsExpected = { + "[ 0, 3] - - - - - - - - - - - - - - - - - Wir (case=nom|number=pl|gender=*|person=1)", + "[ 4, 12] - - - - - - - - - - - - - - - - - brauchen (number=pl|person=1|tense=pres|mood=ind)", + "[ 13, 16] - - - - - - - - - - - - - - - - - ein (case=acc|number=sg|gender=neut)", + "[ 17, 21] - - - - - - - - - - - - - - - - - sehr (_)", + "[ 22, 35] - - - - - - - - - - - - - - - - - kompliziertes (case=acc|number=sg|gender=neut|degree=pos)", + "[ 36, 44] - - - - - - - - - - - - - - - - - Beispiel (case=acc|number=sg|gender=neut)", + "[ 45, 46] - - - - - - - - - - - - - - - - - , (_)", + "[ 47, 54] - - - - - - - - - - - - - - - - - welches (case=acc|number=sg|gender=neut)", + "[ 55, 64] - - - - - - - - - - - - - - - - - möglichst (_)", + "[ 65, 70] - - - - - - - - - - - - - - - - - viele (case=acc|number=pl|gender=*)", + "[ 71, 84] - - - - - - - - - - - - - - - - - Konstituenten (case=acc|number=pl|gender=*)", + "[ 85, 88] - - - - - - - - - - - - - - - - - und (_)", + "[ 89,100] - - - - - - - - - - - - - - - - - Dependenzen (case=acc|number=pl|gender=fem)", + "[101,111] - - - - - - - - - - - - - - - - - beinhaltet (number=sg|person=3|tense=pres|mood=ind)", + "[112,113] - - - - - - - - - - - - - - - - - . (_)" + }; + + AssertAnnotations.assertMorph(morphTagsExpected, select(jcas, MorphologicalFeatures.class)); + } + + @Test + public void testFrench() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 1000000000); + + JCas jcas = runTest("fr", "Nous avons besoin d'une phrase par exemple très " + + "compliqué, qui contient des constituants que de nombreuses dépendances et que " + + "possible ."); + + String[] morphTagsExpected = { + "[ 0, 4] - - - - - - - - - - - - - - - - - Nous (g=m|n=p|p=1|s=suj)", + "[ 5, 10] - - - - - - - - - - - - - - - - - avons (m=ind|n=p|p=1|t=pst)", + "[ 11, 17] - - - - - - - - - - - - - - - - - besoin (g=m|n=s|s=c)", + "[ 18, 23] - - - - - - - - - - - - - - - - - d'une (_)", + "[ 24, 30] - - - - - - - - - - - - - - - - - phrase (g=f|n=s|s=c)", + "[ 31, 34] - - - - - - - - - - - - - - - - - par (_)", + "[ 35, 42] - - - - - - - - - - - - - - - - - exemple (g=m|n=s|s=c)", + "[ 43, 47] - - - - - - - - - - - - - - - - - très (_)", + "[ 48, 58] - - - - - - - - - - - - - - - - - compliqué, (g=m|n=s|s=qual)", + "[ 59, 62] - - - - - - - - - - - - - - - - - qui (g=m|n=p|p=3|s=rel)", + "[ 63, 71] - - - - - - - - - - - - - - - - - contient (m=ind|n=s|p=3|t=pst)", + "[ 72, 75] - - - - - - - - - - - - - - - - - des (g=m|n=p|s=ind)", + "[ 76, 88] - - - - - - - - - - - - - - - - - constituants (g=m|n=p|s=c)", + "[ 89, 92] - - - - - - - - - - - - - - - - - que (g=m|n=p|p=3|s=rel)", + "[ 93, 95] - - - - - - - - - - - - - - - - - de (g=f|n=p|s=ind)", + "[ 96,106] - - - - - - - - - - - - - - - - - nombreuses (g=f|n=p|s=qual)", + "[107,118] - - - - - - - - - - - - - - - - - dépendances (g=f|n=p|s=c)", + "[119,121] - - - - - - - - - - - - - - - - - et (s=c)", + "[122,125] - - - - - - - - - - - - - - - - - que (s=s)", + "[126,134] - - - - - - - - - - - - - - - - - possible (g=m|n=s|s=qual)", + "[135,136] - - - - - - - - - - - - - - - - - . (s=s)" + }; + + AssertAnnotations.assertMorph(morphTagsExpected, select(jcas, MorphologicalFeatures.class)); + } + + @Test + public void testSpanish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 1000000000); + + JCas jcas = runTest("es", "Necesitamos una oración de ejemplo muy complicado , que " + + "contiene la mayor cantidad de componentes y dependencias como sea posible ."); + + String[] morphTagsExpected = { + "[ 0, 11] - - - - - - - - - - - - - - - - - Necesitamos (postype=main|gen=c|num=p|person=1|mood=indicative|tense=present)", + "[ 12, 15] - - - - - - - - - - - - - - - - - una (postype=indefinite|gen=f|num=s)", + "[ 16, 23] - - - - - - - - - - - - - - - - - oración (postype=common|gen=f|num=s)", + "[ 24, 26] - - - - - - - - - - - - - - - - - de (postype=preposition|gen=c|num=c)", + "[ 27, 34] - - - - - - - - - - - - - - - - - ejemplo (postype=common|gen=m|num=s)", + "[ 35, 38] - - - - - - - - - - - - - - - - - muy (_)", + "[ 39, 49] - - - - - - - - - - - - - - - - - complicado (postype=qualificative|gen=m|num=s|posfunction=participle)", + "[ 50, 51] - - - - - - - - - - - - - - - - - , (punct=comma)", + "[ 52, 55] - - - - - - - - - - - - - - - - - que (postype=relative|gen=c|num=c)", + "[ 56, 64] - - - - - - - - - - - - - - - - - contiene (postype=main|gen=c|num=s|person=3|mood=indicative|tense=present)", + "[ 65, 67] - - - - - - - - - - - - - - - - - la (postype=article|gen=f|num=s)", + "[ 68, 73] - - - - - - - - - - - - - - - - - mayor (postype=qualificative|gen=c|num=s)", + "[ 74, 82] - - - - - - - - - - - - - - - - - cantidad (postype=common|gen=f|num=s)", + "[ 83, 85] - - - - - - - - - - - - - - - - - de (postype=preposition|gen=c|num=c)", + "[ 86, 97] - - - - - - - - - - - - - - - - - componentes (postype=common|gen=m|num=p)", + "[ 98, 99] - - - - - - - - - - - - - - - - - y (postype=coordinating)", + "[100,112] - - - - - - - - - - - - - - - - - dependencias (postype=common|gen=f|num=p)", + "[113,117] - - - - - - - - - - - - - - - - - como (postype=subordinating)", + "[118,121] - - - - - - - - - - - - - - - - - sea (postype=semiauxiliary|gen=c|num=s|person=3|mood=subjunctive|tense=present)", + "[122,129] - - - - - - - - - - - - - - - - - posible (postype=qualificative|gen=c|num=s)", + "[130,131] - - - - - - - - - - - - - - - - - . (punct=period)" + }; + + AssertAnnotations.assertMorph(morphTagsExpected, select(jcas, MorphologicalFeatures.class)); + } + + private JCas runTest(String aLanguage, String aText) + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 2000000000); + + AssumeResource.assumeResource(MateMorphTagger.class, "morphtagger", aLanguage, null); + + AnalysisEngineDescription lemma = createEngineDescription(MateLemmatizer.class); + AnalysisEngineDescription morphTag = createEngineDescription(MateMorphTagger.class); + + AnalysisEngineDescription aggregate = createEngineDescription(lemma, morphTag); + + return TestRunner.runTest(aggregate, aLanguage, aText); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateParserTest.java b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateParserTest.java similarity index 93% rename from dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateParserTest.java rename to dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateParserTest.java index a21c36e3fd..c48a8b06f9 100644 --- a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateParserTest.java +++ b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateParserTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.matetools; +package org.dkpro.core.matetools; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; @@ -29,27 +29,27 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.hunpos.HunPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TagsetDescriptionStripper; +import org.dkpro.core.testing.TestRunner; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.hunpos.HunPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TagsetDescriptionStripper; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class MateParserTest { - @Test - public void testGerman() - throws Exception - { - JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches " - + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); + @Test + public void testGerman() + throws Exception + { + JCas jcas = runTest("de", "Wir brauchen ein sehr kompliziertes Beispiel , welches " + + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); String[] dependencies = { "[ 0, 3]Dependency(SB,basic) D[0,3](Wir) G[4,12](brauchen)", @@ -80,10 +80,10 @@ public void testGerman() "OA", "OA2", "OC", "OG", "OP", "PAR", "PD", "PG", "PH", "PM", "PNC", "RC", "RE", "RS", "SB", "SBP", "SP", "SVP", "UC", "VO" }; - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); + AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); AssertAnnotations.assertTagset(POS.class, "stts", posTags, jcas); AssertAnnotations.assertTagset(Dependency.class, "negra", dependencyTags, jcas); - } + } @Test public void testEnglish() @@ -288,6 +288,9 @@ public static AnalysisEngineDescription getEngines(String aLanguage, String aVar if ("fa".equals(aLanguage) || "sv".equals(aLanguage)) { Assume.assumeFalse("HunPos currently hangs indefinitely on Windows: Issue #1099", System.getProperty("os.name").toLowerCase(Locale.US).contains("win")); + Assume.assumeTrue("HunPos does not run on OS X Catalina or higher", + System.getProperty("os.name").toLowerCase(Locale.US).contains("mac") && + !System.getProperty("os.version").matches("10\\.([0-9]|1[0-4]).*")); engines.add(createEngineDescription(HunPosTagger.class)); } else { @@ -304,19 +307,19 @@ public static AnalysisEngineDescription getEngines(String aLanguage, String aVar .toArray(new AnalysisEngineDescription[engines.size()])); } - private JCas runTest(String aLanguage, String aText) - throws Exception - { + private JCas runTest(String aLanguage, String aText) + throws Exception + { Assume.assumeTrue(Runtime.getRuntime().maxMemory() >= 2000000000); AssumeResource.assumeResource(MateSemanticRoleLabeler.class, "parser", aLanguage, null); - AnalysisEngineDescription aggregate = createEngineDescription( - createEngineDescription(MatePosTagger.class), - createEngineDescription(MateParser.class)); + AnalysisEngineDescription aggregate = createEngineDescription( + createEngineDescription(MatePosTagger.class), + createEngineDescription(MateParser.class)); return TestRunner.runTest(aggregate, aLanguage, aText); - } + } @Rule public DkproTestContext testContext = new DkproTestContext(); diff --git a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MatePosTaggerTest.java b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MatePosTaggerTest.java similarity index 90% rename from dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MatePosTaggerTest.java rename to dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MatePosTaggerTest.java index a9c5bfe10d..4fce34dbec 100644 --- a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MatePosTaggerTest.java +++ b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MatePosTaggerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,23 +14,25 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.matetools; +package org.dkpro.core.matetools; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.matetools.MatePosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class MatePosTaggerTest { @@ -70,8 +72,9 @@ public void testEnglish() String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", "IN", "DT", "NNS", "CC", "NNS", "IN", "JJ", "." }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_DET", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_DET", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; String[] posTags = { "#", "$", "''", "(", ")", ",", ".", ":", "CC", "CD", "DT", "END", "EX", "FW", "HYPH", "IN", "JJ", "JJR", "JJS", "LS", "MD", "MID", "NIL", "NN", diff --git a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateSemanticRoleLabelerTest.java b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateSemanticRoleLabelerTest.java similarity index 85% rename from dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateSemanticRoleLabelerTest.java rename to dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateSemanticRoleLabelerTest.java index 76c57f4961..b4da7cd4d1 100644 --- a/dkpro-core-matetools-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/matetools/MateSemanticRoleLabelerTest.java +++ b/dkpro-core-matetools-gpl/src/test/java/org/dkpro/core/matetools/MateSemanticRoleLabelerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,24 +14,29 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.matetools; +package org.dkpro.core.matetools; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSemPred; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertSemPred; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.matetools.MateLemmatizer; +import org.dkpro.core.matetools.MateMorphTagger; +import org.dkpro.core.matetools.MateParser; +import org.dkpro.core.matetools.MatePosTagger; +import org.dkpro.core.matetools.MateSemanticRoleLabeler; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class MateSemanticRoleLabelerTest { diff --git a/dkpro-core-matetools-gpl/src/test/resources/log4j.properties b/dkpro-core-matetools-gpl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-matetools-gpl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-matetools-gpl/src/test/resources/log4j2.xml b/dkpro-core-matetools-gpl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-matetools-gpl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-maui-gpl/.license-header.txt b/dkpro-core-maui-gpl/.license-header.txt new file mode 100644 index 0000000000..bbaf6e0e56 --- /dev/null +++ b/dkpro-core-maui-gpl/.license-header.txt @@ -0,0 +1,16 @@ +Copyright ${inceptionYear}-${currentYear} +Ubiquitous Knowledge Processing (UKP) Lab +Technische Universität Darmstadt + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-maui-gpl/LICENSE.txt b/dkpro-core-maui-gpl/LICENSE.txt new file mode 100644 index 0000000000..712f55c971 --- /dev/null +++ b/dkpro-core-maui-gpl/LICENSE.txt @@ -0,0 +1,874 @@ +The code in this repository is licensed under the Apache License 2.0. Some parts may depend on +libraries licensed under the GPL/AGPL. These parts should be considered under GPL/AGPL for the +purpose of use. For the purpose of refactoring, the respective code may be used under the ASL +2.0 as long as all references to the GPLed libraries are removed. Mind license headers in +individual files and information in the NOTICE.txt file, as sometimes we had to copy specific +files under other licenses into our codebase! + +---- + + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. + + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<http://www.gnu.org/licenses/>. + +---- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dkpro-core-maui-gpl/pom.xml b/dkpro-core-maui-gpl/pom.xml new file mode 100644 index 0000000000..9a076e916b --- /dev/null +++ b/dkpro-core-maui-gpl/pom.xml @@ -0,0 +1,115 @@ +<!-- + + Copyright 2007-2019 + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/. + +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-gpl</artifactId> + <version>2.3.0-SNAPSHOT</version> + <relativePath>../dkpro-core-gpl</relativePath> + </parent> + <modelVersion>4.0.0</modelVersion> + <artifactId>dkpro-core-maui-gpl</artifactId> + <packaging>jar</packaging> + <name>DKPro Core GPL - Maui (v ${maui.version})</name> + <properties> + <maui.version>1.3.0</maui.version> + </properties> + <dependencies> +<!-- + <dependency> + <groupId>fi.nationallibrary</groupId> + <artifactId>maui</artifactId> + <version>1.4.5</version> + </dependency> +--> + + <dependency> + <groupId>com.entopix</groupId> + <artifactId>maui</artifactId> + <version>${maui.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimaj-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-core</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.jena</groupId> + <artifactId>jena-core</artifactId> + <version>2.11.2</version> + <exclusions> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> + </dependency> + + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.assertj</groupId> + <artifactId>assertj-core</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-text-asl</artifactId> + <scope>test</scope> + </dependency> + </dependencies> +</project> \ No newline at end of file diff --git a/dkpro-core-maui-gpl/src/main/java/org/dkpro/core/maui/MauiKeywordAnnotator.java b/dkpro-core-maui-gpl/src/main/java/org/dkpro/core/maui/MauiKeywordAnnotator.java new file mode 100644 index 0000000000..7d73cf8d3b --- /dev/null +++ b/dkpro-core-maui-gpl/src/main/java/org/dkpro/core/maui/MauiKeywordAnnotator.java @@ -0,0 +1,252 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.maui; + +import static org.apache.commons.lang3.StringUtils.defaultString; +import static org.apache.commons.lang3.StringUtils.substringBefore; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.ObjectInputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.Properties; +import java.util.zip.GZIPInputStream; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; + +import com.entopix.maui.filters.MauiFilter; +import com.entopix.maui.filters.MauiFilter.MauiFilterException; +import com.entopix.maui.main.MauiWrapper; +import com.entopix.maui.stemmers.SremovalStemmer; +import com.entopix.maui.stopwords.StopwordsFactory; +import com.entopix.maui.util.Topic; +import com.entopix.maui.vocab.Vocabulary; +import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.ModelFactory; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.ResourceInput; +import eu.openminted.share.annotations.api.ResourceOutput; +import eu.openminted.share.annotations.api.constants.AnnotationType; +import eu.openminted.share.annotations.api.constants.OperationType; +import eu.openminted.share.annotations.api.constants.ProcessingResourceType; + +/** + * The Maui tool assigns keywords to documents. The keywords can optionally come from controlled + * vocabulary. The keywords are stored in DKPro Core {@link MetaDataStringField} + * annotations with the key {@code http://purl.org/dc/terms/subject}. + * + * @see <a href="https://github.com/zelandiya/maui-standalone">Maui</a> + */ +@Component(OperationType.DOCUMENT_CLASSIFIER) +@ResourceMetaData(name = "Maui Keyword Annotator") +@ResourceInput( + type = ProcessingResourceType.DOCUMENT) +@ResourceOutput( + type = ProcessingResourceType.DOCUMENT, + annotationLevel = AnnotationType.KEYWORD) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField" }) +public class MauiKeywordAnnotator + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + private String variant; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + private String modelLocation; + + /** + * Location of the vocabulary file. Normally, this location is derived from the model + * location by replacing the model extension {@code .ser} with {@code .rdf.gz.} + */ + public static final String PARAM_VOCABULARY_LOCATION = "vocabularyLocation"; + @ConfigurationParameter(name = PARAM_VOCABULARY_LOCATION, mandatory = false) + private String vocabularyLocation; + + /** + * Format of the vocabulary file. Normally, this information is obtained from the key + * {@code vocabulary.format} in the model metadata. Only {@code skos} and leaving the + * parameter unset (i.e. no vocabulary) are currently supported. + */ + public static final String PARAM_VOCABULARY_FORMAT = "vocabularyFormat"; + @ConfigurationParameter(name = PARAM_VOCABULARY_FORMAT, mandatory = false) + private String vocabularyFormat; + + /** + * Encoding of the vocabulary file. Normally, this information is obtained from the key + * {@code vocabulary.encoding} in the model metadata. + */ + public static final String PARAM_VOCABULARY_ENCODING = "vocabularyEncoding"; + @ConfigurationParameter(name = PARAM_VOCABULARY_ENCODING, mandatory = false, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String vocabularyEncoding; + + /** + * Minimum similarity score to a variable require to count as a match (0-1). + */ + public static final String PARAM_SCORE_THRESHOLD = "scoreThreshold"; + @ConfigurationParameter(name = PARAM_SCORE_THRESHOLD, defaultValue = "0.5") + private double scoreThreshold; + + /** + * Maximum number of keywords to assign to a document. + */ + public static final String PARAM_MAX_TOPICS = "maxTopics"; + @ConfigurationParameter(name = PARAM_MAX_TOPICS, defaultValue = "10") + private int maxTopics; + + private ModelProviderBase<MauiWrapper> modelProvider; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<MauiWrapper>(this, "keywords") + { + @Override + protected MauiWrapper produceResource(URL aUrl) + throws IOException + { + Properties props = getAggregatedProperties(); + Properties metadata = getResourceMetaData(); + + MauiFilter filter; + + try (InputStream is = aUrl.openStream()) { + InputStream iis = is; + if (aUrl.toString().endsWith(".gz")) { + iis = new GZIPInputStream(is); + } + filter = (MauiFilter) new ObjectInputStream(iis).readObject(); + } + catch (ClassNotFoundException e) { + throw new IOException(e); + } + + + Vocabulary vocabulary = null; + String vocabFormat = defaultString(vocabularyFormat, + metadata.getProperty("vocabulary.format")); + String vocabEncoding = defaultString(vocabularyEncoding, + metadata.getProperty("vocabulary.encoding", "UTF-8")); + + if (vocabFormat != null) { + String vocabLocation = defaultString(vocabularyLocation, + substringBefore(aUrl.toString(), ".ser") + ".rdf.gz"); + + vocabulary = new Vocabulary(); + vocabulary.setVocabularyName(vocabLocation); + vocabulary.setStemmer(new SremovalStemmer()); + vocabulary.setLanguage(props.getProperty(LANGUAGE)); + vocabulary.setStopwords( + StopwordsFactory.makeStopwords(props.getProperty(LANGUAGE))); + + try (InputStream is = ResourceUtils.resolveLocation(vocabLocation) + .openStream()) { + InputStream iis = is; + if (vocabLocation.endsWith(".gz")) { + iis = new GZIPInputStream(is); + } + + switch (vocabFormat) { + case "skos": { + Model model = ModelFactory.createDefaultModel(); + model.read(new InputStreamReader(iis, vocabEncoding), ""); + vocabulary.initializeFromModel(model); + break; + } + case "text": + // Maui supports this, but we presently do not. + throw new IllegalArgumentException( + "Unknown format: [" + vocabFormat + "]"); + default: + throw new IllegalArgumentException( + "Unknown format: [" + vocabFormat + "]"); + } + } + + filter.setVocabulary(vocabulary); + filter.setVocabularyFormat(vocabFormat); + filter.setVocabularyName(vocabLocation); + } + + return new MauiWrapper(vocabulary, filter); + } + }; + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + modelProvider.configure(aJCas.getCas()); + + ArrayList<Topic> topics; + try { + MauiWrapper wrapper = modelProvider.getResource(); + topics = wrapper.extractTopicsFromText(aJCas.getDocumentText(), maxTopics); + } + catch (MauiFilterException e) { + throw new AnalysisEngineProcessException(e); + } + + for (Topic t : topics) { + if (t.getProbability() < scoreThreshold) { + continue; + } + + MetaDataStringField mdf = new MetaDataStringField(aJCas); + mdf.setKey("http://purl.org/dc/terms/subject"); + mdf.setValue(t.getTitle()); + mdf.addToIndexes(); + //getLogger().info(String.format("[%s]\t[%s]\t[%f]%n", t.getId(), t.getTitle(), + // t.getProbability())); + } + } +} diff --git a/dkpro-core-maui-gpl/src/test/java/org/dkpro/core/maui/MauiKeywordAnnotatorTest.java b/dkpro-core-maui-gpl/src/test/java/org/dkpro/core/maui/MauiKeywordAnnotatorTest.java new file mode 100644 index 0000000000..1e1ed7db9e --- /dev/null +++ b/dkpro-core-maui-gpl/src/test/java/org/dkpro/core/maui/MauiKeywordAnnotatorTest.java @@ -0,0 +1,70 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.maui; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField; + +public class MauiKeywordAnnotatorTest +{ + @Test + public void test() throws Exception + { + CollectionReader reader = createReader( + TextReader.class, + TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts/input.txt", + TextReader.PARAM_LANGUAGE, "en"); + + AnalysisEngine annotator = createEngine( + MauiKeywordAnnotator.class, + MauiKeywordAnnotator.PARAM_MODEL_LOCATION, "src/test/resources/fao30.model.gz"); + + JCas jcas = JCasFactory.createJCas(); + + reader.getNext(jcas.getCas()); + annotator.process(jcas); + + List<String> keywords = select(jcas, MetaDataStringField.class).stream() + .filter(m -> "http://purl.org/dc/terms/subject".equals(m.getKey())) + .map(MetaDataStringField::getValue) + .sorted() + .collect(Collectors.toList()); + + assertThat(keywords).containsExactly("standards"); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-maui-gpl/src/test/resources/README.txt b/dkpro-core-maui-gpl/src/test/resources/README.txt new file mode 100644 index 0000000000..b18a1d2f54 --- /dev/null +++ b/dkpro-core-maui-gpl/src/test/resources/README.txt @@ -0,0 +1,13 @@ +The fao30.model.gz was trained from the FAO30 dataaset: + + https://code.google.com/archive/p/maui-indexer/downloads#makechanges + +using + + java -jar maui-standalone-1.1-SNAPSHOT.jar train -l fao30/documents/ -m fao30.model -v none -o 2 + +With the ".key" files from indexer "iic1" copied into the "documents" folder prior to training. + +The `input.txt` file contains a few (modified) sentences from the file `a0011e00.txt`. + +This data is used for testing purposes only. \ No newline at end of file diff --git a/dkpro-core-maui-gpl/src/test/resources/fao30.model.gz b/dkpro-core-maui-gpl/src/test/resources/fao30.model.gz new file mode 100644 index 0000000000..1a7b921167 Binary files /dev/null and b/dkpro-core-maui-gpl/src/test/resources/fao30.model.gz differ diff --git a/dkpro-core-maui-gpl/src/test/resources/log4j2.xml b/dkpro-core-maui-gpl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-maui-gpl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-maui-gpl/src/test/resources/texts/input.txt b/dkpro-core-maui-gpl/src/test/resources/texts/input.txt new file mode 100644 index 0000000000..45519f79e5 --- /dev/null +++ b/dkpro-core-maui-gpl/src/test/resources/texts/input.txt @@ -0,0 +1,4 @@ +How do existing and anticipated standards work? +Standards can be valuable in promoting food safety, but the process by which they are set, and the speed at which they are implemented, may result in exclusion of vulnerable groups. +Standards and regulations for animal health, food safety and food quality affect and are affected by the structure of livestock food chains. +It is possible for participants in livestock value chains to comply with changes in standards. diff --git a/dkpro-core-mecab-asl/pom.xml b/dkpro-core-mecab-asl/pom.xml index 7ab9f08aa5..8a0373dd73 100644 --- a/dkpro-core-mecab-asl/pom.xml +++ b/dkpro-core-mecab-asl/pom.xml @@ -18,15 +18,16 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.mecab-asl</artifactId> + <artifactId>dkpro-core-mecab-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Mecab</name> <description>interface to mecab (via JNI for Japanese POS Tagger C++ binary)</description> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -42,33 +43,37 @@ <version>0.993</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.mecab-model-tagger-jp-ipadic</artifactId> </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.text-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-text-asl</artifactId> <scope>test</scope> </dependency> <dependency> diff --git a/dkpro-core-mecab-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTagger.java b/dkpro-core-mecab-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTagger.java deleted file mode 100755 index 523551111c..0000000000 --- a/dkpro-core-mecab-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTagger.java +++ /dev/null @@ -1,606 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.mecab; - -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.chasen.mecab.Tagger; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.resources.PlatformDetector; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken; - -/** - * Annotator for the MeCab Japanese POS Tagger. - */ -@ResourceMetaData(name="MeCab POS-Tagger") -@LanguageCapability("ja") -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"} - ) -public class MeCabTagger - extends SegmenterBase -{ - private Logger logger; - private Tagger tagger; - - /** - * Loads MeCab library from system default paths. Throws and UnsatisfiedLinkError in case the - * native code cannot be read. - */ - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - logger = getContext().getLogger(); - try { - tagger = getMeCabJNI(); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - if (tagger == null) { - throw new ResourceInitializationException(); - } - } - - private Tagger getMeCabJNI() - throws ResourceInitializationException, IOException - { - PlatformDetector pd = new PlatformDetector(); - Tagger tagger = null; - try { - String platform = pd.getPlatformId(); - this.getLogger().log(Level.INFO, "Your platform is " + platform); - - if (platform.equalsIgnoreCase("linux-x86_64")) { - tagger = initTagger(platform, "libmecab.so.2.0.0", "libMeCab.so"); - } - else if (platform.equalsIgnoreCase("linux-x86_32")) { - tagger = initTagger(platform, "libmecab.so.2.0.0", "libMeCab.so"); - } - else if (platform.equalsIgnoreCase("osx-x86_64")) { - tagger = initTagger(platform, "libmecab.2.dylib", "libMeCab.so"); - } - else { - throw new ResourceInitializationException(new Throwable("MeCab native code for " - + platform + " is not supported")); - } - } - catch (UnsatisfiedLinkError e) { - this.getLogger() - .log(Level.SEVERE, - "Cannot load the MeCab native code.\nMake sure that the system path (i.e. LD_LIBRARY_PATH) contains the library (i.e. libMeCab.so)\n"); - throw new ResourceInitializationException(e); - - } - return tagger; - } - - private Tagger initTagger(String platform, String sysLib, String javaWrapper) - throws IOException - { - String prefix = "lib/tagger/jp/bin-" + platform; - String packagePrefix = getClass().getPackage().getName().replaceAll("\\.", "/"); - - File binFolder = ResourceUtils.getClasspathAsFolder("classpath*:" + packagePrefix + "/" - + prefix, true); - - System.load(new File(binFolder, sysLib).getAbsolutePath()); - System.load(new File(binFolder, javaWrapper).getAbsolutePath()); - - // Generate a dummy config file. Mecab does not really need any settings form it, but it - // requires that the file is present. - File dummyConfigFile = File.createTempFile("mecab", "rc"); - dummyConfigFile.deleteOnExit(); - String configFile = dummyConfigFile.getAbsolutePath(); - - // We force a temporary location because Mecab cannot deal with paths containing spaces - // and it is quite unlikely that the temp folder has spaces in its path. (See comment - // below as well). -- REC 2012-06-03 - File dictFolder = ResourceUtils.getClasspathAsFolder("classpath*:" + packagePrefix + - "/lib/tagger/jp/ipadic", true); - - getLogger().log(Level.INFO, "Native library folder: " + binFolder); - getLogger().log(Level.INFO, "Dictionary folder: " + dictFolder); - - // FIXME Mecab tagger cannot deal with folders containing spaces because it uses spaces - // to split the parameter string and there is no way implemented to quote parameters. - // See param.cpp. There is a static create() method in C++ that acceptsma parameter - // count and an array of parameter strings, but this is unusable as it is realized in JNI - // at the moment. -- REC 2012-06-02 - return new Tagger("-d " + dictFolder.getAbsolutePath() + " -r " + configFile); - } - - @Override - public void destroy() - { - super.destroy(); - tagger.delete(); - } - - @Override - protected void process(JCas aJCas, String text, int zoneBegin) - throws AnalysisEngineProcessException - { - tag(aJCas, text, zoneBegin); - } - - protected void tag(JCas aJCas, String text, int begin) // , int end - { - DocumentMetaData docMeta = DocumentMetaData.get(aJCas); - String documentId = docMeta.getDocumentId(); - this.getLogger().log(Level.INFO, "Start tagging document with id: " + documentId); - - /* - * First, read all morphemes and POS tags. - * - * The native library seems to have a problem with parseToNode(), parseToString() functions - * For now, we have to parse the test from parse() function. - */ - // Node node = tagger.parseToNode(docText); - // for (; node != null; node = node.getNext()) { - // System.out.println(node.getSurface() + "\t" + node.getFeature()); - // } - // System.out.println("EOS\n"); - - List<String> morphList = new ArrayList<String>(); - List<String> posList = new ArrayList<String>(); - List<String> baseFormList = new ArrayList<String>(); - List<String> readingFormList = new ArrayList<String>(); - List<String> iboList = new ArrayList<String>(); - List<String> keiList = new ArrayList<String>(); - List<String> danList = new ArrayList<String>(); - - String taggedResult = tagger.parse(text.replaceAll("[\\s]+", " ")); - BufferedReader taggedResultReader = new BufferedReader(new StringReader(taggedResult)); - try { - String line; - while ((line = taggedResultReader.readLine()) != null) { - String morph = null, pos = null, baseForm = null, readingForm = null, ibo = null, dan = null, kei = null; - String[] tokens = line.split("[\\s]+"); - morph = tokens[0]; - if (tokens.length >= 2) { - String[] features = tokens[1].split(","); - pos = getPartOfSpeech(features); - dan = getDan(features); - kei = getKei(features); - baseForm = getBaseForm(features, morph); - readingForm = getReading(features, morph); - ibo = getIBO(morph, features, iboList); - } - if ((morph == null) && (pos == null) && (baseForm == null)) { - logger.log(Level.WARNING, "Morph and pos not found: " + line); - continue; - } - morphList.add(morph); - posList.add(pos); - baseFormList.add(baseForm); - readingFormList.add(readingForm); - iboList.add(ibo); - danList.add(dan); - keiList.add(kei); - } - } - catch (IOException e) { - logger.log(Level.WARNING, - "Reading results from tagger caused an exception: " + e.getMessage()); - } - - /* - * Using the list of morphemes and POS tags, we mark sentence boundaries, as well as - * morpheme and POS boundaries. Japanese sentences end with full stop mark (。), exclamation - * mark (!), or a question mark (?). Note that these are full-width characters. - */ - { - int curSenBegin = 0; - List<String> curMorphList = new ArrayList<String>(); - List<String> curPOSList = new ArrayList<String>(); - List<String> curBaseFormList = new ArrayList<String>(); - List<String> curReadingFormList = new ArrayList<String>(); - List<String> curIBOList = new ArrayList<String>(); - List<String> curDanList = new ArrayList<String>(); - List<String> curKeiList = new ArrayList<String>(); - - for (int i = 0; i < morphList.size(); i++) { - String morph = morphList.get(i); - String pos = posList.get(i); - String baseForm = baseFormList.get(i); - String readingForm = readingFormList.get(i); - String ibo = iboList.get(i); - String dan = danList.get(i); - String kei = keiList.get(i); - - curMorphList.add(morph); - curPOSList.add(pos); - curBaseFormList.add(baseForm); - curReadingFormList.add(readingForm); - curIBOList.add(ibo); - curDanList.add(dan); - curKeiList.add(kei); - - if (morph.matches("[。!?]")) { - curSenBegin = createSentence(aJCas, text, begin, curSenBegin, curMorphList, - curPOSList, curBaseFormList, curReadingFormList, curIBOList, - curDanList, curKeiList, begin, curSenBegin); - } - } - - // cut off mecab's 'EOS' and its entries in the various lists - int morphs = curMorphList.size(); - if (curMorphList.get(morphs - 1).equals("EOS")) { - curMorphList.remove(morphs - 1); - curPOSList.remove(morphs - 1); - curBaseFormList.remove(morphs - 1); - curReadingFormList.remove(morphs - 1); - curIBOList.remove(morphs - 1); - curDanList.remove(morphs - 1); - curKeiList.remove(morphs - 1); - } - - // process the remaining text - if (curMorphList.size() > 0) { - curSenBegin = createSentence(aJCas, text, begin, curSenBegin, curMorphList, - curPOSList, curBaseFormList, curReadingFormList, curIBOList, curDanList, - curKeiList, begin, curSenBegin); - } - } - this.getLogger().log(Level.INFO, "Finished tagging document with id: " + documentId); - } - - private String getReading(String[] features, String fallback) - { - String readingForm = (features.length > 7) ? features[7] : "*"; - if (readingForm.equals("*")) { - readingForm = fallback; - } - return readingForm; - } - - private String getBaseForm(String[] features, String fallback) - { - String baseForm = features[6]; - if (baseForm.equals("*")) { - baseForm = fallback; - } - return baseForm; - } - - private String getKei(String[] features) - { - String kei = features[5]; - if (kei.equals("*")) { - kei = ""; - } - return kei; - } - - private String getDan(String[] features) - { - String dan = features[4]; - if (dan.equals("*")) { - dan = ""; - } - return dan; - } - - private String getPartOfSpeech(String[] features) - { - StringBuffer posBuf = new StringBuffer(); - int i = 0; - while (!features[i].equals("*") && (i < features.length) && (i < 4)) { - if (posBuf.length() > 0) { - posBuf.append("-"); - } - posBuf.append(features[i]); - i++; - } - return posBuf.toString(); - } - - private int createSentence(JCas aJCas, String text, int begin, int curSenBegin, - List<String> curMorphList, List<String> curPOSList, List<String> curBaseFormList, - List<String> curReadingFormList, List<String> curIBOList, List<String> curDanList, - List<String> curKeiList, int begin2, int curSenBegin2) - { - curSenBegin = skipBlanksAtBeginningOfString(text, begin, curSenBegin); - - int curMorphBegin = 0; - curMorphBegin = createTokensAddToIndex(text, curMorphList, curPOSList, curBaseFormList, - curReadingFormList, curIBOList, curDanList, curKeiList, curMorphBegin, curSenBegin, - begin, aJCas); - - createSentenceAddToIndex(aJCas, begin, curSenBegin, curMorphBegin); - curSenBegin += curMorphBegin; - - clearLists(curMorphList, curPOSList, curBaseFormList, curReadingFormList, curIBOList, - curDanList, curKeiList); - return curSenBegin; - } - - private void clearLists(List<String> curMorphList, List<String> curPOSList, - List<String> curBaseFormList, List<String> curReadingFormList, List<String> curIBOList, - List<String> curDanList, List<String> curKeiList) - { - curMorphList.clear(); - curPOSList.clear(); - curBaseFormList.clear(); - curReadingFormList.clear(); - curIBOList.clear(); - curDanList.clear(); - curKeiList.clear(); - } - - private int skipBlanksAtBeginningOfString(String text, int begin, int curSenBegin) - { - while ((text.length() > (begin + curSenBegin)) - && Character.isWhitespace(text.charAt(begin + curSenBegin))) { - curSenBegin++; - } - return curSenBegin; - } - - private void createSentenceAddToIndex(JCas aJCas, int begin, int curSenBegin, int curMorphBegin) - { - Sentence curSentence = new Sentence(aJCas, begin + curSenBegin, begin + curSenBegin - + curMorphBegin); - curSentence.addToIndexes(); - } - - private int createTokensAddToIndex(String text, List<String> curMorphList, - List<String> curPOSList, List<String> curBaseFormList, List<String> curReadingFormList, - List<String> curIBOList, List<String> curDanList, List<String> curKeiList, - int curMorphBegin, int curSenBegin, int begin, JCas aJCas) - { - for (int j = 0; j < curMorphList.size(); j++) { - - String curMorph = trimWhitespaces(curMorphList.get(j)); - - if (!isValidMorph(curMorph)) { - continue; - } - - JapaneseToken jpyToken = new JapaneseToken(aJCas, begin + curSenBegin + curMorphBegin, - begin + curSenBegin + curMorphBegin + curMorph.length()); - jpyToken.setKana(curReadingFormList.get(j)); - jpyToken.setIbo(curIBOList.get(j)); - jpyToken.setDan(curDanList.get(j)); - jpyToken.setKei(curKeiList.get(j)); - jpyToken.addToIndexes(); - POS curPOS = new POS(aJCas, begin + curSenBegin + curMorphBegin, begin + curSenBegin - + curMorphBegin + curMorph.length()); - curPOS.setPosValue(curPOSList.get(j)); - POSUtils.assignCoarseValue(curPOS); - curPOS.addToIndexes(); - String lemmaString = curBaseFormList.get(j); - if (lemmaString == null) { - lemmaString = jpyToken.getText(); - } - Lemma curLemma = new Lemma(aJCas, begin + curSenBegin + curMorphBegin, begin - + curSenBegin + curMorphBegin + curMorph.length()); - curLemma.setValue(lemmaString); - curLemma.addToIndexes(); - - // set lemma and pos additionally for the token - jpyToken.setPos(curPOS); - jpyToken.setLemma(curLemma); - - curMorphBegin += curMorph.length(); - - // append whitespace after the morph - while ((text.length() > (begin + curSenBegin + curMorphBegin)) - && Character.isWhitespace(text.charAt(begin + curSenBegin + curMorphBegin))) { - curMorphBegin++; - } - - } - return curMorphBegin; - } - - private boolean isValidMorph(String curMorph) - { - if ((curMorph.length() == 1) && Character.isWhitespace(curMorph.charAt(0))) { - return false; - } - if (containsOnlyWhitespacesAndTabs(curMorph)) { - return false; - } - return true; - } - - private boolean containsOnlyWhitespacesAndTabs(String curMorph) - { - for (int i = 0; i < curMorph.length(); i++) { - char c = curMorph.charAt(i); - if (!(Character.isWhitespace(c) || (c == '\t'))) { - return false; - } - } - return true; - } - - private String trimWhitespaces(String morph) - { - - if (morph.length() == 1) { - return morph; - } - - int i = 0; - // forward to first non-blank character of morph - while ((i < morph.length()) && Character.isWhitespace(morph.charAt(i))) { - i++; - } - // step back until first non-blank character of morph - int j = morph.length() - 1; - while ((j >= 0) && Character.isWhitespace(morph.charAt(j))) { - j--; - } - - if (j < i) { - return morph; - } - - return morph.substring(i, j + 1); - } - - /** - * Based on a simple heuristic it is attempted to mark the morphemes with I-B-O tags if they - * belong to the same word. O = 1-morpheme word B = morpheme marks the beginning of a word I = - * morpheme is part of a word - * - * @param morph a morpheme. - * @param features a set of features. - * @param iboList a IBO list. - * @return the IBO code. - */ - private String getIBO(String morph, String[] features, List<String> iboList) - { - String pos = features[0]; - String pos_suffix_1 = features[1]; - String kei = features[5]; - String baseForm = features[6]; - - String OUTSIDE = "O"; - String INSIDE = "I"; - String BEGINNING = "B"; - - String ibo = OUTSIDE; - - if (isVerb(pos)) { - if (isIndependent(pos_suffix_1) && !baseForm.equals(morph)) { - if (isBeginning(iboList)) { - ibo = BEGINNING; - } - else { - ibo = INSIDE; - } - } - else if (isSuffix(pos_suffix_1)) { - ibo = INSIDE; - } - else if (isIncompleteVerbForm(kei)) { - ibo = BEGINNING; - } - } - else if (isAuxilaryVerb(pos)) { - if (isBeginning(iboList)) { - ibo = BEGINNING; - } - else { - ibo = INSIDE; - } - } - else if (isParticle(pos)) { - if (isLinkingParticle(pos_suffix_1)) { - ibo = INSIDE; - } - } - else if (isAdjective(pos)) { - if (endsOnInformalPastTense(morph, pos_suffix_1)) { - ibo = BEGINNING; - } - else if (isPastTenseEndingSyllablePolite(morph, kei)) { - ibo = INSIDE; - } - } - - return ibo; - } - - private boolean endsOnInformalPastTense(String morph, String pos_suffix_1) - { - return (morph.charAt(morph.length() - 1) == 'っ') && pos_suffix_1.equals("自立"); - } - - private boolean isPastTenseEndingSyllablePolite(String morph, String feature) - { - return morph.equals("た") && feature.equals("基本形"); - } - - private boolean isAdjective(String pos) - { - return pos.equals("形容詞"); - } - - private boolean isIncompleteVerbForm(String feature) - { - return feature.equals("未然形"); - } - - private boolean isLinkingParticle(String pos_suffix_1) - { - return pos_suffix_1.startsWith("接続"); - } - - private boolean isParticle(String pos) - { - return pos.equals("助詞"); - } - - private boolean isBeginning(List<String> iboList) - { - int size = iboList.size(); - return (size > 1) && iboList.get(size - 1).equals("O"); - } - - private boolean isSuffix(String pos_suffix_1) - { - return pos_suffix_1.equals("接尾"); - } - - private boolean isAuxilaryVerb(String pos) - { - return pos.equals("助動詞"); - } - - private boolean isIndependent(String pos_suffix_1) - { - return pos_suffix_1.equals("自立"); - } - - private boolean isVerb(String pos) - { - return pos.equals("動詞"); - } -} diff --git a/dkpro-core-mecab-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mecab/package-info.java b/dkpro-core-mecab-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mecab/package-info.java deleted file mode 100644 index c4c4bac77a..0000000000 --- a/dkpro-core-mecab-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mecab/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Integration of the <a href="https://code.google.com/p/mecab/">MeCab</a> part-of-speech and - * morphological analyzer. - * - * @since 1.4.0 - */ -package de.tudarmstadt.ukp.dkpro.core.mecab; diff --git a/dkpro-core-mecab-asl/src/main/java/org/dkpro/core/mecab/MeCabTagger.java b/dkpro-core-mecab-asl/src/main/java/org/dkpro/core/mecab/MeCabTagger.java new file mode 100755 index 0000000000..6362b38ec5 --- /dev/null +++ b/dkpro-core-mecab-asl/src/main/java/org/dkpro/core/mecab/MeCabTagger.java @@ -0,0 +1,612 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.mecab; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Level; +import org.apache.uima.util.Logger; +import org.chasen.mecab.Tagger; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.PlatformDetector; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Annotator for the MeCab Japanese POS Tagger. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "MeCab POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("ja") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"} + ) +public class MeCabTagger + extends SegmenterBase +{ + private Logger logger; + private Tagger tagger; + + /** + * Loads MeCab library from system default paths. Throws and UnsatisfiedLinkError in case the + * native code cannot be read. + */ + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + logger = getContext().getLogger(); + try { + tagger = getMeCabJNI(); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + if (tagger == null) { + throw new ResourceInitializationException(); + } + } + + private Tagger getMeCabJNI() + throws ResourceInitializationException, IOException + { + PlatformDetector pd = new PlatformDetector(); + Tagger tagger = null; + try { + String platform = pd.getPlatformId(); + this.getLogger().log(Level.INFO, "Your platform is " + platform); + + if (platform.equalsIgnoreCase("linux-x86_64")) { + tagger = initTagger(platform, "libmecab.so.2.0.0", "libMeCab.so"); + } + else if (platform.equalsIgnoreCase("linux-x86_32")) { + tagger = initTagger(platform, "libmecab.so.2.0.0", "libMeCab.so"); + } + else if (platform.equalsIgnoreCase("osx-x86_64")) { + tagger = initTagger(platform, "libmecab.2.dylib", "libMeCab.so"); + } + else { + throw new ResourceInitializationException(new Throwable("MeCab native code for " + + platform + " is not supported")); + } + } + catch (UnsatisfiedLinkError e) { + this.getLogger() + .log(Level.SEVERE, + "Cannot load the MeCab native code.\nMake sure that the system path (i.e. LD_LIBRARY_PATH) contains the library (i.e. libMeCab.so)\n"); + throw new ResourceInitializationException(e); + + } + return tagger; + } + + private Tagger initTagger(String platform, String sysLib, String javaWrapper) + throws IOException + { + String prefix = "lib/tagger/jp/bin-" + platform; + String packagePrefix = getClass().getPackage().getName().replaceAll("\\.", "/"); + + File binFolder = ResourceUtils.getClasspathAsFolder( + "classpath*:de/tudarmstadt/ukp/dkpro/core/mecab/" + prefix, true); + + System.load(new File(binFolder, sysLib).getAbsolutePath()); + System.load(new File(binFolder, javaWrapper).getAbsolutePath()); + + // Generate a dummy config file. Mecab does not really need any settings form it, but it + // requires that the file is present. + File dummyConfigFile = File.createTempFile("mecab", "rc"); + dummyConfigFile.deleteOnExit(); + String configFile = dummyConfigFile.getAbsolutePath(); + + // We force a temporary location because Mecab cannot deal with paths containing spaces + // and it is quite unlikely that the temp folder has spaces in its path. (See comment + // below as well). -- REC 2012-06-03 + File dictFolder = ResourceUtils.getClasspathAsFolder( + "classpath*:de/tudarmstadt/ukp/dkpro/core/mecab/lib/tagger/jp/ipadic", true); + + getLogger().log(Level.INFO, "Native library folder: " + binFolder); + getLogger().log(Level.INFO, "Dictionary folder: " + dictFolder); + + // FIXME Mecab tagger cannot deal with folders containing spaces because it uses spaces + // to split the parameter string and there is no way implemented to quote parameters. + // See param.cpp. There is a static create() method in C++ that acceptsma parameter + // count and an array of parameter strings, but this is unusable as it is realized in JNI + // at the moment. -- REC 2012-06-02 + return new Tagger("-d " + dictFolder.getAbsolutePath() + " -r " + configFile); + } + + @Override + public void destroy() + { + super.destroy(); + tagger.delete(); + } + + @Override + protected void process(JCas aJCas, String text, int zoneBegin) + throws AnalysisEngineProcessException + { + tag(aJCas, text, zoneBegin); + } + + protected void tag(JCas aJCas, String text, int begin) // , int end + { + DocumentMetaData docMeta = DocumentMetaData.get(aJCas); + String documentId = docMeta.getDocumentId(); + this.getLogger().log(Level.INFO, "Start tagging document with id: " + documentId); + + /* + * First, read all morphemes and POS tags. + * + * The native library seems to have a problem with parseToNode(), parseToString() functions + * For now, we have to parse the test from parse() function. + */ + // Node node = tagger.parseToNode(docText); + // for (; node != null; node = node.getNext()) { + // System.out.println(node.getSurface() + "\t" + node.getFeature()); + // } + // System.out.println("EOS\n"); + + List<String> morphList = new ArrayList<String>(); + List<String> posList = new ArrayList<String>(); + List<String> baseFormList = new ArrayList<String>(); + List<String> readingFormList = new ArrayList<String>(); + List<String> iboList = new ArrayList<String>(); + List<String> keiList = new ArrayList<String>(); + List<String> danList = new ArrayList<String>(); + + String taggedResult = tagger.parse(text.replaceAll("[\\s]+", " ")); + BufferedReader taggedResultReader = new BufferedReader(new StringReader(taggedResult)); + try { + String line; + while ((line = taggedResultReader.readLine()) != null) { + String morph = null, pos = null, baseForm = null, readingForm = null, ibo = null, + dan = null, kei = null; + String[] tokens = line.split("[\\s]+"); + morph = tokens[0]; + if (tokens.length >= 2) { + String[] features = tokens[1].split(","); + pos = getPartOfSpeech(features); + dan = getDan(features); + kei = getKei(features); + baseForm = getBaseForm(features, morph); + readingForm = getReading(features, morph); + ibo = getIBO(morph, features, iboList); + } + if ((morph == null) && (pos == null) && (baseForm == null)) { + logger.log(Level.WARNING, "Morph and pos not found: " + line); + continue; + } + morphList.add(morph); + posList.add(pos); + baseFormList.add(baseForm); + readingFormList.add(readingForm); + iboList.add(ibo); + danList.add(dan); + keiList.add(kei); + } + } + catch (IOException e) { + logger.log(Level.WARNING, + "Reading results from tagger caused an exception: " + e.getMessage()); + } + + /* + * Using the list of morphemes and POS tags, we mark sentence boundaries, as well as + * morpheme and POS boundaries. Japanese sentences end with full stop mark (。), exclamation + * mark (!), or a question mark (?). Note that these are full-width characters. + */ + { + int curSenBegin = 0; + List<String> curMorphList = new ArrayList<String>(); + List<String> curPOSList = new ArrayList<String>(); + List<String> curBaseFormList = new ArrayList<String>(); + List<String> curReadingFormList = new ArrayList<String>(); + List<String> curIBOList = new ArrayList<String>(); + List<String> curDanList = new ArrayList<String>(); + List<String> curKeiList = new ArrayList<String>(); + + for (int i = 0; i < morphList.size(); i++) { + String morph = morphList.get(i); + String pos = posList.get(i); + String baseForm = baseFormList.get(i); + String readingForm = readingFormList.get(i); + String ibo = iboList.get(i); + String dan = danList.get(i); + String kei = keiList.get(i); + + curMorphList.add(morph); + curPOSList.add(pos); + curBaseFormList.add(baseForm); + curReadingFormList.add(readingForm); + curIBOList.add(ibo); + curDanList.add(dan); + curKeiList.add(kei); + + if (morph.matches("[。!?]")) { + curSenBegin = createSentence(aJCas, text, begin, curSenBegin, curMorphList, + curPOSList, curBaseFormList, curReadingFormList, curIBOList, + curDanList, curKeiList, begin, curSenBegin); + } + } + + // cut off mecab's 'EOS' and its entries in the various lists + int morphs = curMorphList.size(); + if (curMorphList.get(morphs - 1).equals("EOS")) { + curMorphList.remove(morphs - 1); + curPOSList.remove(morphs - 1); + curBaseFormList.remove(morphs - 1); + curReadingFormList.remove(morphs - 1); + curIBOList.remove(morphs - 1); + curDanList.remove(morphs - 1); + curKeiList.remove(morphs - 1); + } + + // process the remaining text + if (curMorphList.size() > 0) { + curSenBegin = createSentence(aJCas, text, begin, curSenBegin, curMorphList, + curPOSList, curBaseFormList, curReadingFormList, curIBOList, curDanList, + curKeiList, begin, curSenBegin); + } + } + this.getLogger().log(Level.INFO, "Finished tagging document with id: " + documentId); + } + + private String getReading(String[] features, String fallback) + { + String readingForm = (features.length > 7) ? features[7] : "*"; + if (readingForm.equals("*")) { + readingForm = fallback; + } + return readingForm; + } + + private String getBaseForm(String[] features, String fallback) + { + String baseForm = features[6]; + if (baseForm.equals("*")) { + baseForm = fallback; + } + return baseForm; + } + + private String getKei(String[] features) + { + String kei = features[5]; + if (kei.equals("*")) { + kei = ""; + } + return kei; + } + + private String getDan(String[] features) + { + String dan = features[4]; + if (dan.equals("*")) { + dan = ""; + } + return dan; + } + + private String getPartOfSpeech(String[] features) + { + StringBuffer posBuf = new StringBuffer(); + int i = 0; + while (!features[i].equals("*") && (i < features.length) && (i < 4)) { + if (posBuf.length() > 0) { + posBuf.append("-"); + } + posBuf.append(features[i]); + i++; + } + return posBuf.toString(); + } + + private int createSentence(JCas aJCas, String text, int begin, int curSenBegin, + List<String> curMorphList, List<String> curPOSList, List<String> curBaseFormList, + List<String> curReadingFormList, List<String> curIBOList, List<String> curDanList, + List<String> curKeiList, int begin2, int curSenBegin2) + { + curSenBegin = skipBlanksAtBeginningOfString(text, begin, curSenBegin); + + int curMorphBegin = 0; + curMorphBegin = createTokensAddToIndex(text, curMorphList, curPOSList, curBaseFormList, + curReadingFormList, curIBOList, curDanList, curKeiList, curMorphBegin, curSenBegin, + begin, aJCas); + + createSentenceAddToIndex(aJCas, begin, curSenBegin, curMorphBegin); + curSenBegin += curMorphBegin; + + clearLists(curMorphList, curPOSList, curBaseFormList, curReadingFormList, curIBOList, + curDanList, curKeiList); + return curSenBegin; + } + + private void clearLists(List<String> curMorphList, List<String> curPOSList, + List<String> curBaseFormList, List<String> curReadingFormList, List<String> curIBOList, + List<String> curDanList, List<String> curKeiList) + { + curMorphList.clear(); + curPOSList.clear(); + curBaseFormList.clear(); + curReadingFormList.clear(); + curIBOList.clear(); + curDanList.clear(); + curKeiList.clear(); + } + + private int skipBlanksAtBeginningOfString(String text, int begin, int curSenBegin) + { + while ((text.length() > (begin + curSenBegin)) + && Character.isWhitespace(text.charAt(begin + curSenBegin))) { + curSenBegin++; + } + return curSenBegin; + } + + private void createSentenceAddToIndex(JCas aJCas, int begin, int curSenBegin, int curMorphBegin) + { + Sentence curSentence = new Sentence(aJCas, begin + curSenBegin, begin + curSenBegin + + curMorphBegin); + curSentence.addToIndexes(); + } + + private int createTokensAddToIndex(String text, List<String> curMorphList, + List<String> curPOSList, List<String> curBaseFormList, List<String> curReadingFormList, + List<String> curIBOList, List<String> curDanList, List<String> curKeiList, + int curMorphBegin, int curSenBegin, int begin, JCas aJCas) + { + for (int j = 0; j < curMorphList.size(); j++) { + + String curMorph = trimWhitespaces(curMorphList.get(j)); + + if (!isValidMorph(curMorph)) { + continue; + } + + JapaneseToken jpyToken = new JapaneseToken(aJCas, begin + curSenBegin + curMorphBegin, + begin + curSenBegin + curMorphBegin + curMorph.length()); + jpyToken.setKana(curReadingFormList.get(j)); + jpyToken.setIbo(curIBOList.get(j)); + jpyToken.setDan(curDanList.get(j)); + jpyToken.setKei(curKeiList.get(j)); + jpyToken.addToIndexes(); + POS curPOS = new POS(aJCas, begin + curSenBegin + curMorphBegin, begin + curSenBegin + + curMorphBegin + curMorph.length()); + curPOS.setPosValue(curPOSList.get(j)); + POSUtils.assignCoarseValue(curPOS); + curPOS.addToIndexes(); + String lemmaString = curBaseFormList.get(j); + if (lemmaString == null) { + lemmaString = jpyToken.getText(); + } + Lemma curLemma = new Lemma(aJCas, begin + curSenBegin + curMorphBegin, begin + + curSenBegin + curMorphBegin + curMorph.length()); + curLemma.setValue(lemmaString); + curLemma.addToIndexes(); + + // set lemma and pos additionally for the token + jpyToken.setPos(curPOS); + jpyToken.setLemma(curLemma); + + curMorphBegin += curMorph.length(); + + // append whitespace after the morph + while ((text.length() > (begin + curSenBegin + curMorphBegin)) + && Character.isWhitespace(text.charAt(begin + curSenBegin + curMorphBegin))) { + curMorphBegin++; + } + + } + return curMorphBegin; + } + + private boolean isValidMorph(String curMorph) + { + if ((curMorph.length() == 1) && Character.isWhitespace(curMorph.charAt(0))) { + return false; + } + if (containsOnlyWhitespacesAndTabs(curMorph)) { + return false; + } + return true; + } + + private boolean containsOnlyWhitespacesAndTabs(String curMorph) + { + for (int i = 0; i < curMorph.length(); i++) { + char c = curMorph.charAt(i); + if (!(Character.isWhitespace(c) || (c == '\t'))) { + return false; + } + } + return true; + } + + private String trimWhitespaces(String morph) + { + + if (morph.length() == 1) { + return morph; + } + + int i = 0; + // forward to first non-blank character of morph + while ((i < morph.length()) && Character.isWhitespace(morph.charAt(i))) { + i++; + } + // step back until first non-blank character of morph + int j = morph.length() - 1; + while ((j >= 0) && Character.isWhitespace(morph.charAt(j))) { + j--; + } + + if (j < i) { + return morph; + } + + return morph.substring(i, j + 1); + } + + /** + * Based on a simple heuristic it is attempted to mark the morphemes with I-B-O tags if they + * belong to the same word. O = 1-morpheme word B = morpheme marks the beginning of a word I = + * morpheme is part of a word + * + * @param morph a morpheme. + * @param features a set of features. + * @param iboList a IBO list. + * @return the IBO code. + */ + private String getIBO(String morph, String[] features, List<String> iboList) + { + String pos = features[0]; + String pos_suffix_1 = features[1]; + String kei = features[5]; + String baseForm = features[6]; + + String OUTSIDE = "O"; + String INSIDE = "I"; + String BEGINNING = "B"; + + String ibo = OUTSIDE; + + if (isVerb(pos)) { + if (isIndependent(pos_suffix_1) && !baseForm.equals(morph)) { + if (isBeginning(iboList)) { + ibo = BEGINNING; + } + else { + ibo = INSIDE; + } + } + else if (isSuffix(pos_suffix_1)) { + ibo = INSIDE; + } + else if (isIncompleteVerbForm(kei)) { + ibo = BEGINNING; + } + } + else if (isAuxilaryVerb(pos)) { + if (isBeginning(iboList)) { + ibo = BEGINNING; + } + else { + ibo = INSIDE; + } + } + else if (isParticle(pos)) { + if (isLinkingParticle(pos_suffix_1)) { + ibo = INSIDE; + } + } + else if (isAdjective(pos)) { + if (endsOnInformalPastTense(morph, pos_suffix_1)) { + ibo = BEGINNING; + } + else if (isPastTenseEndingSyllablePolite(morph, kei)) { + ibo = INSIDE; + } + } + + return ibo; + } + + private boolean endsOnInformalPastTense(String morph, String pos_suffix_1) + { + return (morph.charAt(morph.length() - 1) == 'っ') && pos_suffix_1.equals("自立"); + } + + private boolean isPastTenseEndingSyllablePolite(String morph, String feature) + { + return morph.equals("た") && feature.equals("基本形"); + } + + private boolean isAdjective(String pos) + { + return pos.equals("形容詞"); + } + + private boolean isIncompleteVerbForm(String feature) + { + return feature.equals("未然形"); + } + + private boolean isLinkingParticle(String pos_suffix_1) + { + return pos_suffix_1.startsWith("接続"); + } + + private boolean isParticle(String pos) + { + return pos.equals("助詞"); + } + + private boolean isBeginning(List<String> iboList) + { + int size = iboList.size(); + return (size > 1) && iboList.get(size - 1).equals("O"); + } + + private boolean isSuffix(String pos_suffix_1) + { + return pos_suffix_1.equals("接尾"); + } + + private boolean isAuxilaryVerb(String pos) + { + return pos.equals("助動詞"); + } + + private boolean isIndependent(String pos_suffix_1) + { + return pos_suffix_1.equals("自立"); + } + + private boolean isVerb(String pos) + { + return pos.equals("動詞"); + } +} diff --git a/dkpro-core-mecab-asl/src/main/java/org/dkpro/core/mecab/package-info.java b/dkpro-core-mecab-asl/src/main/java/org/dkpro/core/mecab/package-info.java new file mode 100644 index 0000000000..747419465d --- /dev/null +++ b/dkpro-core-mecab-asl/src/main/java/org/dkpro/core/mecab/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Integration of the <a href="https://code.google.com/p/mecab/">MeCab</a> part-of-speech and + * morphological analyzer. + * + * @since 1.4.0 + */ +package org.dkpro.core.mecab; diff --git a/dkpro-core-mecab-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map b/dkpro-core-mecab-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map new file mode 100644 index 0000000000..cea280b860 --- /dev/null +++ b/dkpro-core-mecab-asl/src/main/resources/META-INF/eu.openminted.share/uimaTypeMapping.map @@ -0,0 +1 @@ +de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken=http://w3id.org/meta-share/omtd-share/Token diff --git a/dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTaggerDetailedTest.java b/dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabTaggerDetailedTest.java similarity index 95% rename from dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTaggerDetailedTest.java rename to dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabTaggerDetailedTest.java index 219f7c8dbe..fa2ddcecd5 100644 --- a/dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTaggerDetailedTest.java +++ b/dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabTaggerDetailedTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mecab; +package org.dkpro.core.mecab; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -33,6 +33,8 @@ import org.apache.uima.fit.pipeline.JCasIterable; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mecab.MeCabTagger; import org.junit.Assume; import org.junit.Before; import org.junit.Test; @@ -41,7 +43,6 @@ import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; import de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken; public class MeCabTaggerDetailedTest { @@ -56,7 +57,7 @@ public void prepare() public void testMeCabTagger() throws UIMAException, IOException { CollectionReaderDescription reader = createReaderDescription( TextReader.class, - TextReader.PARAM_SOURCE_LOCATION, "src/test/resources", + TextReader.PARAM_SOURCE_LOCATION, "src/test/resources", TextReader.PARAM_LANGUAGE, "ja", TextReader.PARAM_PATTERNS, new String[] { "[+]detailedTest.txt" }); @@ -79,7 +80,8 @@ public void testMeCabTagger() throws UIMAException, IOException { private void evaluateSentence(Collection<Sentence> totalFound, JCas jcas) { Sentence sent = totalFound.iterator().next(); - List<JapaneseToken> tokens = JCasUtil.selectCovered(jcas, JapaneseToken.class, sent.getBegin(), sent.getEnd()); + List<JapaneseToken> tokens = JCasUtil.selectCovered(jcas, JapaneseToken.class, + sent.getBegin(), sent.getEnd()); assertEquals(15, tokens.size()); int token = 0; @@ -234,7 +236,8 @@ private void evaluateSentence(Collection<Sentence> totalFound, JCas jcas) { } private String getPOS(JCas jcas, Token token) { - List<POS> selectCovered = JCasUtil.selectCovered(jcas, POS.class, token.getBegin(), token.getEnd()); + List<POS> selectCovered = JCasUtil.selectCovered(jcas, POS.class, token.getBegin(), + token.getEnd()); if (selectCovered.size() == 1) { return selectCovered.get(0).getPosValue(); } @@ -242,7 +245,8 @@ private String getPOS(JCas jcas, Token token) { } private String getLemma(JCas jcas, Token token) { - List<Lemma> selectCovered = JCasUtil.selectCovered(jcas, Lemma.class, token.getBegin(), token.getEnd()); + List<Lemma> selectCovered = JCasUtil.selectCovered(jcas, Lemma.class, token.getBegin(), + token.getEnd()); if (selectCovered.size() == 1) { return selectCovered.get(0).getValue(); } @@ -253,8 +257,9 @@ private String getForm(Token token) { return token.getCoveredText(); } - private Collection<Sentence> getSentences(AnalysisEngine jTagger, JCas jcas) throws AnalysisEngineProcessException, - UIMAException, IOException { + private Collection<Sentence> getSentences(AnalysisEngine jTagger, JCas jcas) + throws AnalysisEngineProcessException, UIMAException, IOException + { jTagger.process(jcas); Collection<Sentence> found = JCasUtil.select(jcas, Sentence.class); diff --git a/dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTaggerTest.java b/dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabTaggerTest.java similarity index 97% rename from dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTaggerTest.java rename to dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabTaggerTest.java index c6e41ec8ea..1eca534d0f 100755 --- a/dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabTaggerTest.java +++ b/dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabTaggerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mecab; +package org.dkpro.core.mecab; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; @@ -35,12 +35,13 @@ import org.apache.uima.fit.pipeline.JCasIterable; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mecab.MeCabTagger; import org.junit.Assume; import org.junit.Before; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; public class MeCabTaggerTest { @Before diff --git a/dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabWhiteSpacesTest.java b/dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabWhiteSpacesTest.java similarity index 97% rename from dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabWhiteSpacesTest.java rename to dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabWhiteSpacesTest.java index 25831699d5..7835850919 100644 --- a/dkpro-core-mecab-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mecab/MeCabWhiteSpacesTest.java +++ b/dkpro-core-mecab-asl/src/test/java/org/dkpro/core/mecab/MeCabWhiteSpacesTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.mecab; +package org.dkpro.core.mecab; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -34,11 +34,12 @@ import org.apache.uima.fit.pipeline.JCasIterable; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.mecab.MeCabTagger; import org.junit.Assume; import org.junit.Before; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; import de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken; /** diff --git a/dkpro-core-mecab-asl/suppressions.xml b/dkpro-core-mecab-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-mecab-asl/suppressions.xml @@ -0,0 +1,9 @@ +<?xml version="1.0"?> + +<!DOCTYPE suppressions PUBLIC +"-//Puppy Crawl//DTD Suppressions 1.1//EN" +"http://www.puppycrawl.com/dtds/suppressions_1_1.dtd"> + +<suppressions> + <suppress files=".*[/\\]target[/\\].*" checks=".*"/> +</suppressions> diff --git a/dkpro-core-morpha-asl/pom.xml b/dkpro-core-morpha-asl/pom.xml index 74848c2af3..031f272318 100644 --- a/dkpro-core-morpha-asl/pom.xml +++ b/dkpro-core-morpha-asl/pom.xml @@ -1,11 +1,11 @@ <!-- - Copyright 2017 - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + Licensed to the Technische Universität Darmstadt under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The Technische Universität Darmstadt + licenses this file to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. http://www.apache.org/licenses/LICENSE-2.0 @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.morpha-asl</artifactId> + <artifactId>dkpro-core-morpha-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Morpha (v ${morpha.version}) (non-commercial, attribution)</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> <morpha.version>1.0.5</morpha.version> </properties> @@ -44,16 +45,20 @@ <version>${morpha.version}</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -61,13 +66,13 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -79,9 +84,9 @@ <dependencyManagement> <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <scope>import</scope> <type>pom</type> </dependency> diff --git a/dkpro-core-morpha-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/morpha/package-info.java b/dkpro-core-morpha-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/morpha/package-info.java deleted file mode 100644 index f5f544f6a2..0000000000 --- a/dkpro-core-morpha-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/morpha/package-info.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * - */ -package de.tudarmstadt.ukp.dkpro.core.morpha; diff --git a/dkpro-core-morpha-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/morpha/MorphaLemmatizer.java b/dkpro-core-morpha-asl/src/main/java/org/dkpro/core/morpha/MorphaLemmatizer.java similarity index 77% rename from dkpro-core-morpha-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/morpha/MorphaLemmatizer.java rename to dkpro-core-morpha-asl/src/main/java/org/dkpro/core/morpha/MorphaLemmatizer.java index e14d88b41f..42b8a14ff0 100644 --- a/dkpro-core-morpha-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/morpha/MorphaLemmatizer.java +++ b/dkpro-core-morpha-asl/src/main/java/org/dkpro/core/morpha/MorphaLemmatizer.java @@ -1,21 +1,21 @@ /* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.morpha; +package org.dkpro.core.morpha; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -29,11 +29,14 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Lemmatize based on a finite-state machine. Uses the <a href="https://github.com/knowitall/morpha"> @@ -45,7 +48,9 @@ * processing of English, Natural Language Engineering, 7(3). 207-223.</li> * </ul> */ -@ResourceMetaData(name="Morpha Lemmatizer") +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "Morpha Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @LanguageCapability("en") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -62,7 +67,7 @@ public class MorphaLemmatizer * so this is disabled by default. */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; - @ConfigurationParameter(name=PARAM_READ_POS, mandatory=true, defaultValue="false") + @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "false") private boolean readPos; @Override diff --git a/dkpro-core-morpha-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/morpha/MorphaLemmatizerTest.java b/dkpro-core-morpha-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/morpha/MorphaLemmatizerTest.java deleted file mode 100644 index 611cfff5e1..0000000000 --- a/dkpro-core-morpha-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/morpha/MorphaLemmatizerTest.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.morpha; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class MorphaLemmatizerTest -{ - @Test - public void testEnglishNoPos() - throws Exception - { - JCas jcas = runTest("en", false, "We need a very complicated example sentence , which " + - "contains as many constituents and dependencies as possible ."); - - String[] lemmas = { "We", "need", "a", "very", "complicate", "example", "sentence", ",", - "which", "contain", "as", "many", "constituent", "and", "dependency", "as", - "possible", "." }; - - AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); - } - - @Test - public void testEnglishWithPos() - throws Exception - { - JCas jcas = runTest("en", true, "We need a very complicated example sentence , which " + - "contains as many constituents and dependencies as possible ."); - - String[] lemmas = { "We", "need", "a", "very", "complicated", "example", "sentence", ",", - "which", "contain", "as", "many", "constituent", "and", "dependency", "as", - "possible", "." }; - - AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); - } - - private JCas runTest(String aLanguage, boolean aUsePosTags, String aText) - throws Exception - { - AnalysisEngineDescription engine; - - if (aUsePosTags) { - engine = createEngineDescription( - createEngineDescription(OpenNlpPosTagger.class), - createEngineDescription(MorphaLemmatizer.class, - MorphaLemmatizer.PARAM_READ_POS, true)); - } - else { - engine = createEngineDescription( - createEngineDescription(MorphaLemmatizer.class)); - } - - return TestRunner.runTest(engine, aLanguage, aText); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-morpha-asl/src/test/java/org/dkpro/core/morpha/MorphaLemmatizerTest.java b/dkpro-core-morpha-asl/src/test/java/org/dkpro/core/morpha/MorphaLemmatizerTest.java new file mode 100644 index 0000000000..d04fc643b0 --- /dev/null +++ b/dkpro-core-morpha-asl/src/test/java/org/dkpro/core/morpha/MorphaLemmatizerTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.morpha; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; + +public class MorphaLemmatizerTest +{ + @Test + public void testEnglishNoPos() + throws Exception + { + JCas jcas = runTest("en", false, "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] lemmas = { "We", "need", "a", "very", "complicate", "example", "sentence", ",", + "which", "contain", "as", "many", "constituent", "and", "dependency", "as", + "possible", "." }; + + AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); + } + + @Test + public void testEnglishWithPos() + throws Exception + { + JCas jcas = runTest("en", true, "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] lemmas = { "We", "need", "a", "very", "complicated", "example", "sentence", ",", + "which", "contain", "as", "many", "constituent", "and", "dependency", "as", + "possible", "." }; + + AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); + } + + private JCas runTest(String aLanguage, boolean aUsePosTags, String aText) + throws Exception + { + AnalysisEngineDescription engine; + + if (aUsePosTags) { + engine = createEngineDescription( + createEngineDescription(OpenNlpPosTagger.class), + createEngineDescription(MorphaLemmatizer.class, + MorphaLemmatizer.PARAM_READ_POS, true)); + } + else { + engine = createEngineDescription( + createEngineDescription(MorphaLemmatizer.class)); + } + + return TestRunner.runTest(engine, aLanguage, aText); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-mstparser-asl/pom.xml b/dkpro-core-mstparser-asl/pom.xml index 7d383d081e..95bc3c7099 100644 --- a/dkpro-core-mstparser-asl/pom.xml +++ b/dkpro-core-mstparser-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.mstparser-asl</artifactId> + <artifactId>dkpro-core-mstparser-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - MSTParser (v ${mstparser.version})</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> <mstparser.version>0.5.1</mstparser.version> </properties> @@ -52,28 +53,32 @@ <version>${mstparser.version}</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -81,13 +86,13 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.hunpos-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-hunpos-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -124,9 +129,9 @@ <version>20130527.1</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.hunpos-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-hunpos-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <type>pom</type> <scope>import</scope> </dependency> diff --git a/dkpro-core-mstparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mstparser/package-info.java b/dkpro-core-mstparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mstparser/package-info.java deleted file mode 100644 index 5474de9f93..0000000000 --- a/dkpro-core-mstparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mstparser/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Integration of the <a href="http://www.seas.upenn.edu/~strctlrn/MSTParser/MSTParser.html">MST</a> - * dependency parser. - */ -package de.tudarmstadt.ukp.dkpro.core.mstparser; diff --git a/dkpro-core-mstparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mstparser/MstParser.java b/dkpro-core-mstparser-asl/src/main/java/org/dkpro/core/mstparser/MstParser.java similarity index 87% rename from dkpro-core-mstparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mstparser/MstParser.java rename to dkpro-core-mstparser-asl/src/main/java/org/dkpro/core/mstparser/MstParser.java index 32cf684ffd..6b2886c975 100644 --- a/dkpro-core-mstparser-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mstparser/MstParser.java +++ b/dkpro-core-mstparser-asl/src/main/java/org/dkpro/core/mstparser/MstParser.java @@ -1,442 +1,473 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.mstparser; - -import static java.util.Arrays.asList; -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.JCasUtil.exists; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -import mstparser.DependencyInstance; -import mstparser.DependencyParser; -import mstparser.DependencyPipe; -import mstparser.DependencyPipe2O; -import mstparser.ParserOptions; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; - -/** - * Dependency parsing using MSTParser. - * <p> - * Wrapper for the MSTParser (<b>high memory requirements</b>). More information about the parser - * can be found <a href="http://www.seas.upenn.edu/~strctlrn/MSTParser/MSTParser.html">here</a> <a - * href="http://sourceforge.net/projects/mstparser/">here</a> - * </p> - * <p> - * The MSTParser models tend to be very large, e.g. the <a - * href="http://nlp.stanford.edu/software/stanford-dependencies.shtml">Eisner</a> model is about 600 - * MB uncompressed. With this model, parsing a simple sentence with MSTParser requires about 3 GB - * heap memory. - * </p> - * <p> - * This component feeds MSTParser only with the FORM (token) and POS (part-of-speech) fields. LEMMA, - * CPOS, and other columns from the CONLL 2006 format are not generated (cf. - * {@link mstparser.DependencyInstance DependencyInstance}). - * </p> - */ -@ResourceMetaData(name="MSTParser Dependency Parser") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class MstParser - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - /** - * Load the dependency to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) - protected String dependencyMappingLocation; - - /** - * Specifies the order/scope of features. 1 only has features over single edges - * and 2 has features over pairs of adjacent edges in the tree. The model must have been - * trained with the respective order set here. - */ - public static final String PARAM_ORDER = "order"; - @ConfigurationParameter(name = PARAM_ORDER, mandatory = false) - private Integer order; - - private ModelProviderBase<DependencyParser> modelProvider; - private MappingProvider mappingProvider; - - /** - * Initializes the MSTParser and creates a ModelResourceProvicer - * - * @throws ResourceInitializationException - * Cannot be initialized - */ - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - // the modelProvider reads in the model and produces a parser - modelProvider = new ModelProviderBase<DependencyParser>(this, "mstparser", "parser") - { - @Override - protected DependencyParser produceResource(URL aUrl) - throws IOException - { - Properties metadata = getResourceMetaData(); - - // Configure parser - ParserOptions options = createOptions(aUrl, metadata); - DependencyPipe pipe = createPipe(options); - DependencyParser dp = loadParser(aUrl, pipe, options); - - // Check if the model order corresponds to the order the component is configured for - boolean secondOrderModel = isSecondOrderModel(pipe); - if (secondOrderModel != options.secondOrder) { - String model = secondOrderModel ? "second" : "first"; - String component = options.secondOrder ? "second" : "first"; - getLogger().warn("Model is " + model + " but component has been configured " - + "for " + component + " order. I am going to reload the model now " - + "with the correct order. To avoid loading the model twice, please " - + "configure the component for the correct order."); - - // Reconfigure pipe and reload - options.secondOrder = secondOrderModel; - pipe = createPipe(options); - dp = loadParser(aUrl, pipe, options); - } - - // Extract dependency tagset - SingletonTagset depTags = new SingletonTagset( - Dependency.class, metadata.getProperty("dependency.tagset")); - depTags.addAll(asList(pipe.types)); - //depTags.remove("<no-type>"); - addTagset(depTags); - - // Extract POS tagset (from POS, not from CPOS!) - SingletonTagset posTags = new SingletonTagset( - POS.class, metadata.getProperty("pos.tagset")); - for (Object key : pipe.dataAlphabet.toArray()) { - if (key instanceof String) { - String sKey = (String) key; - - // See mstparser.DependencyPipe.addLinearFeatures(...) - if (sKey.startsWith("POSPC=")) { - String[] fragments = sKey.substring(6).split(" ",3); - posTags.add(fragments[0]); - posTags.add(fragments[1]); - } - } - } - //posTags.remove("<root-POS>"); - addTagset(posTags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return dp; - }; - }; - - mappingProvider = MappingProviderFactory.createDependencyMappingProvider( - dependencyMappingLocation, language, modelProvider); - } - - /** - * Processes the given text using the MSTParser. As the MSTParser expects an input file, a - * temporary file is created. - * - * @param jcas - * The JCas containing the textual input - * @throws AnalysisEngineProcessException - * No parse created - */ - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - CAS cas = jcas.getCas(); - modelProvider.configure(cas); - mappingProvider.configure(cas); - DependencyParser dp = modelProvider.getResource(); - - // If there are no sentences or tokens in the CAS, skip it. - if (!exists(jcas, Sentence.class) || !exists(jcas, Token.class)) { - return; - } - - // currently the parser needs a file as input, it cannot yet work directly with the - // cas-structure - try { - String tempfile = generateTempInputFile(jcas); - dp.options.testfile = tempfile; - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - - // Run the parser - // dp.getParses() is a method that we added to the MSTParser codebase, it returns a list of - // parses. Originally this was dp.outputParses() and the method wrote the parses into a - // file. - // The old method is still available. - List<DependencyInstance> parsedInstances; - try { - parsedInstances = dp.getParses(); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - - List<Sentence> sentences = new ArrayList<Sentence>(select(jcas, Sentence.class)); - - for (int instanceIndex = 0; instanceIndex < parsedInstances.size(); instanceIndex++) { - - DependencyInstance instance = parsedInstances.get(instanceIndex); - Sentence sentence = sentences.get(instanceIndex); - - List<Token> tokens = new ArrayList<Token>(selectCovered(jcas, Token.class, sentence)); - - // iterate through tokens - for (int formsIndex = 0; formsIndex < instance.forms.length; formsIndex++) { - Token token = tokens.get(formsIndex); - - // get dependency relation and head information for token - int head = instance.heads[formsIndex]; - - // write dependency information as annotation to JCas - Type depRel = mappingProvider.getTagType(instance.deprels[formsIndex]); - - if (head > 0) { - Dependency dep = (Dependency) cas.createFS(depRel); - dep.setDependencyType(instance.deprels[formsIndex]); - dep.setFlavor(DependencyFlavor.BASIC); - dep.setDependent(token); - dep.setGovernor(tokens.get(head - 1)); - dep.setBegin(dep.getDependent().getBegin()); - dep.setEnd(dep.getDependent().getEnd()); - dep.addToIndexes(); - } - else { - Dependency dep = new ROOT(jcas); - dep.setDependencyType(instance.deprels[formsIndex]); - dep.setFlavor(DependencyFlavor.BASIC); - dep.setDependent(token); - dep.setGovernor(token); - dep.setBegin(dep.getDependent().getBegin()); - dep.setEnd(dep.getDependent().getEnd()); - dep.addToIndexes(); - } - } - } - } - - /** - * Generates a temporary file from a jcas. This is needed as input to the MST parser. - * - * @param jcas - * The JCas containing the textual input - * @return The path to the created temporary file. - * @throws IOException - * The temporary file could not be created - */ - private String generateTempInputFile(JCas jcas) - throws IOException - { - File tempfile = File.createTempFile("MSTinput", "txt"); - BufferedWriter out = new BufferedWriter(new FileWriter(tempfile, true)); - - // write sentences to temporary file in MST input format - for (Sentence sentence : select(jcas, Sentence.class)) { - int tokencount = 0; - - List<Token> tokens = selectCovered(jcas, Token.class, sentence); - for (Token token : tokens) { - out.write(token.getText() + "\t"); - tokencount++; - } - out.write("\n"); - for (Token token : tokens) { - out.write(token.getPos().getPosValue() + "\t"); - - } - // Dummy values for labels - out.write("\n"); - for (int k = 0; k < tokencount; k++) { - out.write("Dummy\t"); - } - // Dummy values for heads - out.write("\n"); - for (int i = 0; i < tokencount; i++) { - out.write("0\t"); - } - - out.write("\n\n"); - } - - IOUtils.closeQuietly(out); - tempfile.deleteOnExit(); - return tempfile.getPath(); - } - - /** - * Checks if the data alphabet loaded into the pipe contains features that are only generated - * when a second-order model has been trained. - * - * @param aPipe - * the parser pipeline. - * @return if the pipeline uses a second-order model. - */ - private boolean isSecondOrderModel(DependencyPipe aPipe) - { - for (Object key : aPipe.dataAlphabet.toArray()) { - if (key instanceof String) { - String sKey = (String) key; - if (sKey.startsWith("POS_TRIP=")) { - return true; - } - } - } - return false; - } - - private ParserOptions createOptions(URL aUrl, Properties aMetadata) - { - // mst.ParserOptions needs a String as argument - ParserOptions options = new ParserOptions(new String[] {}); - options.test = true; - options.train = false; - options.trainfile = ""; - options.eval = false; - options.format = "MST"; - options.goldfile = ""; - options.testfile = ""; - options.modelName = aUrl.toString(); - - if (order == null) { - String modelOrder = aMetadata.getProperty("mstparser.param.order"); - if (StringUtils.isNotEmpty(modelOrder)) { - getLogger().info( - "Using model order (mstparser.param.order): " + modelOrder); - options.secondOrder = "2".equals(modelOrder.trim()); - } - else { - getLogger().info("Using default order: 1"); - options.secondOrder = false; - } - } - else { - getLogger().info("Using user-specified order: " + order); - options.secondOrder = order == 2; - } - - return options; - } - - private DependencyParser loadParser(URL aUrl, DependencyPipe aPipe, ParserOptions aOptions) - throws IOException - { - DependencyParser dp = new DependencyParser(aPipe, aOptions); - - InputStream is = null; - try { - getLogger().info("Retrieving model"); - is = CompressionUtils.getInputStream(aUrl.getFile(), aUrl.openStream()); - dp.loadModel(is); - } - finally { - closeQuietly(is); - } - - return dp; - } - - private DependencyPipe createPipe(ParserOptions aOptions) - throws IOException - { - return aOptions.secondOrder ? new DependencyPipe2O(aOptions) : new DependencyPipe(aOptions); - } +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.mstparser; + +import static java.util.Arrays.asList; +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.uima.fit.util.JCasUtil.exists; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createDependencyMappingProvider; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CompressionUtils; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import mstparser.DependencyInstance; +import mstparser.DependencyParser; +import mstparser.DependencyPipe; +import mstparser.DependencyPipe2O; +import mstparser.ParserOptions; + +/** + * Dependency parsing using MSTParser. + * <p> + * Wrapper for the MSTParser (<b>high memory requirements</b>). More information about the parser + * can be found <a href="http://www.seas.upenn.edu/~strctlrn/MSTParser/MSTParser.html">here</a> <a + * href="http://sourceforge.net/projects/mstparser/">here</a> + * </p> + * <p> + * The MSTParser models tend to be very large, e.g. the <a + * href="http://nlp.stanford.edu/software/stanford-dependencies.shtml">Eisner</a> model is about 600 + * MB uncompressed. With this model, parsing a simple sentence with MSTParser requires about 3 GB + * heap memory. + * </p> + * <p> + * This component feeds MSTParser only with the FORM (token) and POS (part-of-speech) fields. LEMMA, + * CPOS, and other columns from the CONLL 2006 format are not generated (cf. + * {@link mstparser.DependencyInstance DependencyInstance}). + * </p> + */ +@Component(OperationType.DEPENDENCY_PARSER) +@ResourceMetaData(name = "MSTParser Dependency Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) +public class MstParser + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the dependency to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = + ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) + protected String dependencyMappingLocation; + + /** + * Specifies the order/scope of features. 1 only has features over single edges + * and 2 has features over pairs of adjacent edges in the tree. The model must have been + * trained with the respective order set here. + */ + public static final String PARAM_ORDER = "order"; + @ConfigurationParameter(name = PARAM_ORDER, mandatory = false) + private Integer order; + + private ModelProviderBase<DependencyParser> modelProvider; + private MappingProvider mappingProvider; + + /** + * Initializes the MSTParser and creates a ModelResourceProvicer + * + * @throws ResourceInitializationException + * Cannot be initialized + */ + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + // the modelProvider reads in the model and produces a parser + modelProvider = new ModelProviderBase<DependencyParser>(this, "mstparser", "parser") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/mstparser/lib/parser-${language}-${variant}.properties"); + } + + @Override + protected DependencyParser produceResource(URL aUrl) + throws IOException + { + Properties metadata = getResourceMetaData(); + + // Configure parser + ParserOptions options = createOptions(aUrl, metadata); + DependencyPipe pipe = createPipe(options); + DependencyParser dp = loadParser(aUrl, pipe, options); + + // Check if the model order corresponds to the order the component is configured for + boolean secondOrderModel = isSecondOrderModel(pipe); + if (secondOrderModel != options.secondOrder) { + String model = secondOrderModel ? "second" : "first"; + String component = options.secondOrder ? "second" : "first"; + getLogger().warn("Model is " + model + " but component has been configured " + + "for " + component + " order. I am going to reload the model now " + + "with the correct order. To avoid loading the model twice, please " + + "configure the component for the correct order."); + + // Reconfigure pipe and reload + options.secondOrder = secondOrderModel; + pipe = createPipe(options); + dp = loadParser(aUrl, pipe, options); + } + + // Extract dependency tagset + SingletonTagset depTags = new SingletonTagset( + Dependency.class, metadata.getProperty("dependency.tagset")); + depTags.addAll(asList(pipe.types)); + //depTags.remove("<no-type>"); + addTagset(depTags); + + // Extract POS tagset (from POS, not from CPOS!) + SingletonTagset posTags = new SingletonTagset( + POS.class, metadata.getProperty("pos.tagset")); + for (Object key : pipe.dataAlphabet.toArray()) { + if (key instanceof String) { + String sKey = (String) key; + + // See mstparser.DependencyPipe.addLinearFeatures(...) + if (sKey.startsWith("POSPC=")) { + String[] fragments = sKey.substring(6).split(" ",3); + posTags.add(fragments[0]); + posTags.add(fragments[1]); + } + } + } + //posTags.remove("<root-POS>"); + addTagset(posTags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return dp; + }; + }; + + mappingProvider = createDependencyMappingProvider(this, dependencyMappingLocation, language, + modelProvider); + } + + /** + * Processes the given text using the MSTParser. As the MSTParser expects an input file, a + * temporary file is created. + * + * @param jcas + * The JCas containing the textual input + * @throws AnalysisEngineProcessException + * No parse created + */ + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + CAS cas = jcas.getCas(); + modelProvider.configure(cas); + mappingProvider.configure(cas); + DependencyParser dp = modelProvider.getResource(); + + // If there are no sentences or tokens in the CAS, skip it. + if (!exists(jcas, Sentence.class) || !exists(jcas, Token.class)) { + return; + } + + // currently the parser needs a file as input, it cannot yet work directly with the + // cas-structure + try { + String tempfile = generateTempInputFile(jcas); + dp.options.testfile = tempfile; + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + + // Run the parser + // dp.getParses() is a method that we added to the MSTParser codebase, it returns a list of + // parses. Originally this was dp.outputParses() and the method wrote the parses into a + // file. + // The old method is still available. + List<DependencyInstance> parsedInstances; + try { + parsedInstances = dp.getParses(); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + + List<Sentence> sentences = new ArrayList<Sentence>(select(jcas, Sentence.class)); + + for (int instanceIndex = 0; instanceIndex < parsedInstances.size(); instanceIndex++) { + + DependencyInstance instance = parsedInstances.get(instanceIndex); + Sentence sentence = sentences.get(instanceIndex); + + List<Token> tokens = new ArrayList<Token>(selectCovered(jcas, Token.class, sentence)); + + // iterate through tokens + for (int formsIndex = 0; formsIndex < instance.forms.length; formsIndex++) { + Token token = tokens.get(formsIndex); + + // get dependency relation and head information for token + int head = instance.heads[formsIndex]; + + // write dependency information as annotation to JCas + Type depRel = mappingProvider.getTagType(instance.deprels[formsIndex]); + + if (head > 0) { + Dependency dep = (Dependency) cas.createFS(depRel); + dep.setDependencyType(instance.deprels[formsIndex]); + dep.setFlavor(DependencyFlavor.BASIC); + dep.setDependent(token); + dep.setGovernor(tokens.get(head - 1)); + dep.setBegin(dep.getDependent().getBegin()); + dep.setEnd(dep.getDependent().getEnd()); + dep.addToIndexes(); + } + else { + Dependency dep = new ROOT(jcas); + dep.setDependencyType(instance.deprels[formsIndex]); + dep.setFlavor(DependencyFlavor.BASIC); + dep.setDependent(token); + dep.setGovernor(token); + dep.setBegin(dep.getDependent().getBegin()); + dep.setEnd(dep.getDependent().getEnd()); + dep.addToIndexes(); + } + } + } + } + + /** + * Generates a temporary file from a jcas. This is needed as input to the MST parser. + * + * @param jcas + * The JCas containing the textual input + * @return The path to the created temporary file. + * @throws IOException + * The temporary file could not be created + */ + private String generateTempInputFile(JCas jcas) + throws IOException + { + File tempfile = File.createTempFile("MSTinput", "txt"); + BufferedWriter out = new BufferedWriter(new FileWriter(tempfile, true)); + + // write sentences to temporary file in MST input format + for (Sentence sentence : select(jcas, Sentence.class)) { + int tokencount = 0; + + List<Token> tokens = selectCovered(jcas, Token.class, sentence); + for (Token token : tokens) { + out.write(token.getText() + "\t"); + tokencount++; + } + out.write("\n"); + for (Token token : tokens) { + out.write(token.getPos().getPosValue() + "\t"); + + } + // Dummy values for labels + out.write("\n"); + for (int k = 0; k < tokencount; k++) { + out.write("Dummy\t"); + } + // Dummy values for heads + out.write("\n"); + for (int i = 0; i < tokencount; i++) { + out.write("0\t"); + } + + out.write("\n\n"); + } + + IOUtils.closeQuietly(out); + tempfile.deleteOnExit(); + return tempfile.getPath(); + } + + /** + * Checks if the data alphabet loaded into the pipe contains features that are only generated + * when a second-order model has been trained. + * + * @param aPipe + * the parser pipeline. + * @return if the pipeline uses a second-order model. + */ + private boolean isSecondOrderModel(DependencyPipe aPipe) + { + for (Object key : aPipe.dataAlphabet.toArray()) { + if (key instanceof String) { + String sKey = (String) key; + if (sKey.startsWith("POS_TRIP=")) { + return true; + } + } + } + return false; + } + + private ParserOptions createOptions(URL aUrl, Properties aMetadata) + { + // mst.ParserOptions needs a String as argument + ParserOptions options = new ParserOptions(new String[] {}); + options.test = true; + options.train = false; + options.trainfile = ""; + options.eval = false; + options.format = "MST"; + options.goldfile = ""; + options.testfile = ""; + options.modelName = aUrl.toString(); + + if (order == null) { + String modelOrder = aMetadata.getProperty("mstparser.param.order"); + if (StringUtils.isNotEmpty(modelOrder)) { + getLogger().info( + "Using model order (mstparser.param.order): " + modelOrder); + options.secondOrder = "2".equals(modelOrder.trim()); + } + else { + getLogger().info("Using default order: 1"); + options.secondOrder = false; + } + } + else { + getLogger().info("Using user-specified order: " + order); + options.secondOrder = order == 2; + } + + return options; + } + + private DependencyParser loadParser(URL aUrl, DependencyPipe aPipe, ParserOptions aOptions) + throws IOException + { + DependencyParser dp = new DependencyParser(aPipe, aOptions); + + InputStream is = null; + try { + getLogger().info("Retrieving model"); + is = CompressionUtils.getInputStream(aUrl.getFile(), aUrl.openStream()); + dp.loadModel(is); + } + finally { + closeQuietly(is); + } + + return dp; + } + + private DependencyPipe createPipe(ParserOptions aOptions) + throws IOException + { + return aOptions.secondOrder ? new DependencyPipe2O(aOptions) : new DependencyPipe(aOptions); + } } diff --git a/dkpro-core-mstparser-asl/src/main/java/org/dkpro/core/mstparser/package-info.java b/dkpro-core-mstparser-asl/src/main/java/org/dkpro/core/mstparser/package-info.java new file mode 100644 index 0000000000..50bdec5da8 --- /dev/null +++ b/dkpro-core-mstparser-asl/src/main/java/org/dkpro/core/mstparser/package-info.java @@ -0,0 +1,23 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Integration of the <a href="http://www.seas.upenn.edu/~strctlrn/MSTParser/MSTParser.html">MST</a> + * dependency parser. + */ +package org.dkpro.core.mstparser; diff --git a/dkpro-core-mstparser-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/mstparser/lib/parser-default-variants.map b/dkpro-core-mstparser-asl/src/main/resources/org/dkpro/core/mstparser/lib/parser-default-variants.map similarity index 100% rename from dkpro-core-mstparser-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/mstparser/lib/parser-default-variants.map rename to dkpro-core-mstparser-asl/src/main/resources/org/dkpro/core/mstparser/lib/parser-default-variants.map diff --git a/dkpro-core-mstparser-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mstparser/MstParserTest.java b/dkpro-core-mstparser-asl/src/test/java/org/dkpro/core/mstparser/MstParserTest.java similarity index 87% rename from dkpro-core-mstparser-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mstparser/MstParserTest.java rename to dkpro-core-mstparser-asl/src/test/java/org/dkpro/core/mstparser/MstParserTest.java index 22d24d5222..3270154c27 100644 --- a/dkpro-core-mstparser-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/mstparser/MstParserTest.java +++ b/dkpro-core-mstparser-asl/src/test/java/org/dkpro/core/mstparser/MstParserTest.java @@ -1,351 +1,355 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.mstparser; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.util.Locale; - -import org.apache.commons.lang3.ArrayUtils; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.hunpos.HunPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -/** - */ -public class MstParserTest -{ - @Ignore("Takes too long") - @Test - public void testCroatianMte5Defnpout() - throws Exception - { - JCas jcas = runTest("hr", "mte5.defnpout", "Moramo vrlo kompliciran primjer rečenicu , " - + "koja sadrži što više sastojaka i ovisnosti što je više moguće ."); - - String[] dependencies = { - "[ 0, 6]Dependency(Pred,basic) D[0,6](Moramo) G[41,42](,)", - "[ 7, 11]Dependency(Adv,basic) D[7,11](vrlo) G[12,23](kompliciran)", - "[ 12, 23]Dependency(Atr,basic) D[12,23](kompliciran) G[24,31](primjer)", - "[ 24, 31]Dependency(Ap,basic) D[24,31](primjer) G[32,40](rečenicu)", - "[ 32, 40]Dependency(Sb,basic) D[32,40](rečenicu) G[0,6](Moramo)", - "[ 41, 42]Dependency(Punc,basic) D[41,42](,) G[48,54](sadrži)", - "[ 43, 47]Dependency(Sb,basic) D[43,47](koja) G[48,54](sadrži)", - "[ 48, 54]ROOT(Pred,basic) D[48,54](sadrži) G[48,54](sadrži)", - "[ 55, 58]Dependency(Pred,basic) D[55,58](što) G[74,75](i)", - "[ 59, 63]Dependency(Oth,basic) D[59,63](više) G[74,75](i)", - "[ 64, 73]Dependency(Atr,basic) D[64,73](sastojaka) G[59,63](više)", - "[ 74, 75]Dependency(Co,basic) D[74,75](i) G[48,54](sadrži)", - "[ 76, 85]Dependency(Pred,basic) D[76,85](ovisnosti) G[74,75](i)", - "[ 86, 89]ROOT(Pred,basic) D[86,89](što) G[86,89](što)", - "[ 90, 92]ROOT(Pred,basic) D[90,92](je) G[90,92](je)", - "[ 93, 97]Dependency(Adv,basic) D[93,97](više) G[98,104](moguće)", - "[ 98,104]Dependency(Pnom,basic) D[98,104](moguće) G[90,92](je)", - "[105,106]ROOT(Punc,basic) D[105,106](.) G[105,106](.)" }; - - String[] posTags = { "<root-POS>", "Afmnpa-", "Afpfsn-", "Afpmpgy", "Afpmply", - "Afpnpgy", "Afpnpn-", "Agcfpn", "Agcfsa", "Agcfsg", "Agcfsi", "Agcfsn", "Agcmpa", - "Agcmpg", "Agcmpn", "Agcmsg", "Agcmsl", "Agcmsn", "Agcnsa", "Agcnsn", "Agpfpa", - "Agpfpd", "Agpfpg", "Agpfpi", "Agpfpl", "Agpfpn", "Agpfsa", "Agpfsd", "Agpfsg", - "Agpfsi", "Agpfsl", "Agpfsn", "Agpmpa", "Agpmpd", "Agpmpg", "Agpmpi", "Agpmpl", - "Agpmpn", "Agpmsan", "Agpmsay", "Agpmsd", "Agpmsg", "Agpmsi", "Agpmsl", "Agpmsn", - "Agpngs", "Agpnpa", "Agpnpg", "Agpnpi", "Agpnpl", "Agpnpn", "Agpnsa", "Agpnsd", - "Agpnsg", "Agpnsl", "Agpnsn", "Agsfpa", "Agsfpg", "Agsfpn", "Agsfsa", "Agsfsg", - "Agsfsi", "Agsfsl", "Agsfsn", "Agsmpa", "Agsmpg", "Agsmpn", "Agsmsan", "Agsmsd", - "Agsmsn", "Agsnpg", "Agsnsn", "Appfpg", "Appfpl", "Appfpn", "Appfsa", "Appfsg", - "Appfsl", "Appfsn", "Appmpa", "Appmpd", "Appmpg", "Appmpi", "Appmpn", "Appmsan", - "Appmsay", "Appmsd", "Appmsg", "Appmsi", "Appmsl", "Appmsn", "Appnpa", "Appnpn", - "Appnsg", "Appnsl", "Appnsn", "Apsfsg", "Aspfpn", "Aspfsl", "Aspfsn", "Aspmsd", - "Aspmsn", "Aspnsa", "Aspnsg", "Cc", "Ccs", "Cs", "Css", "M", "Mc-p-l", "Mc-pal", - "Mc-pgl", "Mc-s-l", "Mcfp-l", "Mcfpal", "Mcfpgl", "Mcfpnl", "Mcfsal-", "Mcfsgl-", - "Mcfsll-", "Mcfsnl-", "Mcmpal", "Mcmpan", "Mcmpgl", "Mcmpnl", "Mcmsal", - "Mcmsal---n", "Mcmsal---y", "Mcmsgl", "Mcmsil-", "Mcmsll", "Mcmsnl", "Mcnpnl", - "Mcnsal-", "Mcnsnl-", "Ms-s-l", "Msfpgl", "Msfpnl", "N--pg", "N--pn", "N-fpa", - "N-fpd", "N-fpg", "N-fpi", "N-fpl", "N-fpn", "N-fsa", "N-fsd", "N-fsg", "N-fsi", - "N-fsl", "N-fsn", "N-mpa", "N-mpd", "N-mpg", "N-mpi", "N-mpl", "N-mpn", "N-msa", - "N-msan", "N-msay", "N-msd", "N-msg", "N-msi", "N-msl", "N-msn", "N-msv", "N-npa", - "N-npd", "N-npg", "N-npi", "N-npl", "N-npn", "N-nsa", "N-nsd", "N-nsg", "N-nsi", - "N-nsl", "N-nsn", "Ncfpn", "Ncfsg", "Ncfsl", "Ncfsn", "Ncmpa", "Ncmpg", "Ncmpl", - "Ncmpn", "Ncmsg", "Ncmsi", "Ncmsl", "Ncmsn", "Np-si", "Np-sn", "Npmsi", "Npmsn", - "Pd-fpa--n-a--", "Pd-fpg--n-a--", "Pd-fpn--n-a--", "Pd-fsa--n-a--", - "Pd-fsg--n-a--", "Pd-fsi--n-a--", "Pd-fsl--n-a--", "Pd-fsn--n-a--", - "Pd-mpa--n-a--", "Pd-mpg--n-a--", "Pd-mpi--n-a--", "Pd-mpn--n-a--", - "Pd-msa--n-a-n", "Pd-msd--n-a--", "Pd-msg--n-a--", "Pd-msi--n-a--", - "Pd-msl--n-a--", "Pd-msn--n-a--", "Pd-npa--n-a--", "Pd-npi--n-a--", - "Pd-nsa--n-a--", "Pd-nsg--n-a--", "Pd-nsi--n-a--", "Pd-nsl--n-a--", - "Pd-nsn--n-a--", "Pi-fpa--n-a--", "Pi-fpd--n-a--", "Pi-fpg--n-a--", - "Pi-fpi--n-a--", "Pi-fpl--n-a--", "Pi-fpn--n-a--", "Pi-fsa--n-a--", - "Pi-fsg--n-a--", "Pi-fsi--n-a--", "Pi-fsl--n-a--", "Pi-fsn--n-a--", - "Pi-mpa--n-a--", "Pi-mpd--n-a--", "Pi-mpg--n-a--", "Pi-mpi--n-a--", - "Pi-mpl--n-a--", "Pi-mpn--n-a--", "Pi-msa--n-a--", "Pi-msa--n-a-n", - "Pi-msa--n-a-y", "Pi-msd--n-a--", "Pi-msg--n-a--", "Pi-msi--n-a--", - "Pi-msl--n-a--", "Pi-msn--n-a--", "Pi-npa--n-a--", "Pi-npi--n-a--", - "Pi-npl--n-a--", "Pi-npn--n-a--", "Pi-nsa--n-a--", "Pi-nsd--n-a--", - "Pi-nsg--n-a--", "Pi-nsi--n-a--", "Pi-nsl--n-a--", "Pi-nsn--n-a--", - "Pi3m-a--n-n-y", "Pi3m-d--n-n-y", "Pi3m-n--n-n-y", "Pi3n-a--n-n-n", - "Pi3n-g--n-n-n", "Pi3n-i--n-n-n", "Pi3n-i--y-n-n", "Pi3n-n--n-n-n", "Pi3nsn----a", - "Pp1-pa--n-n--", "Pp1-pd--y-n--", "Pp1-pn--n-n--", "Pp1-sa--n-n--", - "Pp1-sa--y-n--", "Pp1-sd--y-n--", "Pp1-sn--n-n--", "Pp2-pd--y-n--", - "Pp3-pa--y-n--", "Pp3-pd--y-n--", "Pp3-pg--n-n--", "Pp3-pg--y-n--", - "Pp3fsa--y-n--", "Pp3fsd--y-n--", "Pp3fsi--n-n--", "Pp3fsn--n-n--", - "Pp3mpn--n-n--", "Pp3msa--n-n--", "Pp3msa--y-n--", "Pp3msd--n-n--", - "Pp3msd--y-n--", "Pp3msg--n-n--", "Pp3msi--n-n--", "Pp3msn--n-n--", - "Pp3npn--n-n--", "Pp3nsn--n-n--", "Ps1fpgp-n-a--", "Ps1fsgp-n-a--", - "Ps1mpgp-n-a--", "Ps1msnp-n-a--", "Ps1msns-n-a--", "Ps1nsnp-n-a--", - "Ps3fpap-n-a--", "Ps3fpgsfn-a--", "Ps3fpnsmn-a--", "Ps3fsgsmn-a--", - "Ps3fsnsfn-a--", "Ps3fsnsmn-a--", "Ps3mpasmn-a--", "Ps3mpgsfn-a--", - "Ps3mpgsnn-a--", "Ps3mpnp-n-a--", "Ps3msgsmn-a--", "Ps3mslsmn-a--", - "Ps3mslsnn-a--", "Ps3msnp-n-a--", "Ps3msnsfn-a--", "Ps3msnsmn-a--", - "Ps3npgsmn-a--", "Ps3nplsmn-a--", "Ps3nsisfn-a--", "Ps3nsnsfn-a--", "Px--sa--ypn-", - "Px--sa--ypn--", "Px--sd--ypn--", "Px-fpa--nsa--", "Px-fpg--nsa--", - "Px-fsa--nsa--", "Px-fsg--nsa--", "Px-fsl--nsa--", "Px-mpa--nsa--", - "Px-mpl--nsa--", "Px-msa--nsa-n", "Px-msg--nsa--", "Px-msi--nsa--", - "Px-msl--nsa--", "Px-nsa--nsa--", "Qo", "Qq", "Qr", "Qz", "Rgc", "Rgp", "Rgs", - "Rl", "Rlp", "Rnp", "Rp", "Rs", "Rt", "Rtp", "Sa", "Sd", "Sg", "Si", "Sl", "Spsa", - "Spsg", "Spsi", "Spsl", "Var1p", "Var1s", "Var2p", "Var3p", "Var3p-y", "Var3s", - "Var3s-y", "Vca1s", "Vca2s", "Vca3p", "Vca3s", "Vcia3s", "Vcip3p", "Vcip3s", "Vcn", - "Vcp-pf", "Vcp-pm", "Vcp-pn", "Vcp-sf", "Vcp-sm", "Vcp-sn", "Vcpp", "Vcps-sna", - "Vcr1p", "Vcr1p-y", "Vcr1s", "Vcr2p", "Vcr3p", "Vcr3p-y", "Vcr3s", "Vcr3s-y", - "Vma3s", "Vmip3p", "Vmip3s", "Vmm1p", "Vmm2p", "Vmm2s", "Vmn", "Vmp-pf", "Vmp-pm", - "Vmp-pn", "Vmp-sf", "Vmp-sm", "Vmp-sn", "Vmps-pma", "Vmps-sma", "Vmps-snp", - "Vmr1p", "Vmr1s", "Vmr2p", "Vmr3p", "Vmr3s", "Vmr3s-y", "X", "Y", "Yn--n", "Yn-s-", - "Yn-sl", "Yn-sn", "Ynfpg", "Ynfsa", "Ynfsd", "Ynfsg", "Ynfsl", "Ynfsn", "Ynmpg", - "Ynmpn", "Ynmsa", "Ynmsd", "Ynmsg", "Ynmsi", "Ynmsl", "Ynmsn", "Z" }; - - //String[] unmappedPosTags = { "$", "''", "-LRB-", "-RRB-", "<root-POS>", "``" }; - - String[] depTags = { "<no-type>", "Adv", "Ap", "Atr", "Atv", "Aux", "Co", - "Elp", "Obj", "Oth", "Pnom", "Pred", "Prep", "Punc", "Sb", "Sub" }; - - String[] posOrig = { "Vmr1p", "Rgp", "Agpmsn", "N-msn", "N-msn", "Z", "Pi-fsn--n-a", - "Vmr3s", "Pi3n-a--n-nn", "Sg", "N-mpg", "Cc", "Vmn", "Pi3n-n--n-nn", "Vcr3s", "Rgc", - "Agpnsn", "Z" }; - - String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", - "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS" }; - - AssertAnnotations.assertPOS(posMapped, posOrig, select(jcas, POS.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(MstParser.class, POS.class, "mte5-reduced", posTags, jcas); - //AssertAnnotations.assertTagsetMapping(POS.class, "mte5", unmappedPosTags, jcas); - AssertAnnotations.assertTagset(MstParser.class, Dependency.class, "setimes.hr", depTags, jcas); - } - - /** - * The POS tags produced by Hunpos are MSD tags. This model here only uses the first character - * of these MSD tags. Thus, we have a tag mismatch and the results here are completely bogus. - * - * @throws Exception - * if an error occurs. - */ - @Ignore("Takes too long") - @Test - public void testCroatianMte5Pos() - throws Exception - { - JCas jcas = runTest("hr", "mte5.pos", "Moramo vrlo kompliciran primjer rečenicu , " - + "koja sadrži što više sastojaka i ovisnosti što je više moguće ."); - - String[] dependencies = { - "[ 0, 6]Dependency(Oth,basic) D[0,6](Moramo) G[12,23](kompliciran)", - "[ 7, 11]Dependency(Oth,basic) D[7,11](vrlo) G[12,23](kompliciran)", - "[ 12, 23]Dependency(Oth,basic) D[12,23](kompliciran) G[24,31](primjer)", - "[ 24, 31]Dependency(Oth,basic) D[24,31](primjer) G[32,40](rečenicu)", - "[ 32, 40]Dependency(Punc,basic) D[32,40](rečenicu) G[41,42](,)", - "[ 41, 42]Dependency(Punc,basic) D[41,42](,) G[48,54](sadrži)", - "[ 43, 47]Dependency(Oth,basic) D[43,47](koja) G[48,54](sadrži)", - "[ 48, 54]Dependency(Oth,basic) D[48,54](sadrži) G[74,75](i)", - "[ 55, 58]Dependency(Oth,basic) D[55,58](što) G[74,75](i)", - "[ 59, 63]Dependency(Atr,basic) D[59,63](više) G[74,75](i)", - "[ 64, 73]Dependency(Oth,basic) D[64,73](sastojaka) G[59,63](više)", - "[ 74, 75]ROOT(Co,basic) D[74,75](i) G[74,75](i)", - "[ 76, 85]Dependency(Oth,basic) D[76,85](ovisnosti) G[98,104](moguće)", - "[ 86, 89]Dependency(Oth,basic) D[86,89](što) G[98,104](moguće)", - "[ 90, 92]Dependency(Oth,basic) D[90,92](je) G[98,104](moguće)", - "[ 93, 97]Dependency(Oth,basic) D[93,97](više) G[98,104](moguće)", - "[ 98,104]Dependency(Punc,basic) D[98,104](moguće) G[105,106](.)", - "[105,106]ROOT(Punc,basic) D[105,106](.) G[105,106](.)" }; - - String[] posTags = { "<root-POS>", "A", "C", "M", "N", "P", "Q", "R", "S", - "V", "X", "Y", "Z" }; - - //String[] unmappedPosTags = { "$", "''", "-LRB-", "-RRB-", "<root-POS>", "``" }; - - String[] depTags = { "<no-type>", "Adv", "Ap", "Atr", "Atv", "Aux", "Co", - "Elp", "Obj", "Oth", "Pnom", "Pred", "Prep", "Punc", "Sb", "Sub" }; - - String[] posOrig = { "Vmr1p", "Rgp", "Agpmsn", "N-msn", "N-msn", "Z", "Pi-fsn--n-a", - "Vmr3s", "Pi3n-a--n-nn", "Sg", "N-mpg", "Cc", "Vmn", "Pi3n-n--n-nn", "Vcr3s", "Rgc", - "Agpnsn", "Z" }; - - String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", - "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS" }; - - AssertAnnotations.assertPOS(posMapped, posOrig, select(jcas, POS.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(MstParser.class, POS.class, "mte5-pos", posTags, jcas); - //AssertAnnotations.assertTagsetMapping(POS.class, "mte5", unmappedPosTags, jcas); - AssertAnnotations.assertTagset(MstParser.class, Dependency.class, "setimes.hr", depTags, jcas); - } - - /** - * This method runs the MSTParser for an example sentence and checks if it returns the correct - * annotations. An annotation consists of: dependency type, begin of dependency, end of - * dependency, begin of the head, end of the head - * - * @throws Exception - * if an error occurs. - */ - @Test - public void testEnglishDefault() - throws Exception - { - System.out.printf("Maximum memory: %d%n", Runtime.getRuntime().maxMemory()); - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); - - JCas jcas = runTest("en", null, "We need a very complicated example sentence , which " + - "contains as many constituents and dependencies as possible ."); - - String[] dependencies = { - "[ 0, 2]Dependency(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(null,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]Dependency(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]Dependency(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]Dependency(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]Dependency(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]Dependency(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 44, 45]Dependency(punct,basic) D[44,45](,) G[35,43](sentence)", - "[ 46, 51]Dependency(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]Dependency(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 61, 63]Dependency(prep,basic) D[61,63](as) G[52,60](contains)", - "[ 64, 68]Dependency(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]Dependency(pobj,basic) D[69,81](constituents) G[61,63](as)", - "[ 82, 85]Dependency(cc,basic) D[82,85](and) G[69,81](constituents)", - "[ 86, 98]Dependency(conj,basic) D[86,98](dependencies) G[69,81](constituents)", - "[ 99,101]Dependency(dep,basic) D[99,101](as) G[61,63](as)", - "[102,110]Dependency(pobj,basic) D[102,110](possible) G[99,101](as)", - "[111,112]Dependency(punct,basic) D[111,112](.) G[3,7](need)" }; - - String[] depTags = { "<no-type>", "abbrev", "acomp", "advcl", "advmod", - "amod", "appos", "attr", "aux", "auxpass", "cc", "ccomp", "complm", "conj", "cop", - "csubj", "csubjpass", "dep", "det", "dobj", "expl", "infmod", "iobj", "mark", - "measure", "neg", "nn", "nsubj", "nsubjpass", "null", "num", "number", "parataxis", - "partmod", "pcomp", "pobj", "poss", "possessive", "preconj", "pred", "predet", - "prep", "prt", "punct", "purpcl", "quantmod", "rcmod", "rel", "tmod", "xcomp" }; - - String[] posTags = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", - "<root-POS>", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", - "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", - "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", - "WRB", "``" }; - - String[] unmappedPos = { "<root-POS>"}; - - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford", depTags, jcas); - } - - /** - * This method runs the MSTParser for an example sentence and checks if it returns the correct - * annotations. An annotation consists of: dependency type, begin of dependency, end of - * dependency, begin of the head, end of the head - * - * @throws Exception - * if an error occurs. - */ - @Test - public void testEnglishSample() - throws Exception - { - JCas jcas = runTest("en", "sample", "We need a very complicated example sentence , which " + - "contains as many constituents and dependencies as possible ."); - - String[] dependencies = { - "[ 0, 2]Dependency(NP-SBJ,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(ROOT,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]Dependency(DEP,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]Dependency(DEP,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]Dependency(DEP,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]Dependency(DEP,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]Dependency(NP-OBJ,basic) D[35,43](sentence) G[3,7](need)", - "[ 44, 45]Dependency(DEP,basic) D[44,45](,) G[3,7](need)", - "[ 46, 51]Dependency(SBAR,basic) D[46,51](which) G[3,7](need)", - "[ 52, 60]Dependency(S,basic) D[52,60](contains) G[46,51](which)", - "[ 61, 63]Dependency(PP,basic) D[61,63](as) G[52,60](contains)", - "[ 64, 68]Dependency(DEP,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]Dependency(NP,basic) D[69,81](constituents) G[61,63](as)", - "[ 82, 85]Dependency(DEP,basic) D[82,85](and) G[86,98](dependencies)", - "[ 86, 98]Dependency(NP,basic) D[86,98](dependencies) G[61,63](as)", - "[ 99,101]Dependency(PP,basic) D[99,101](as) G[86,98](dependencies)", - "[102,110]Dependency(ADJP,basic) D[102,110](possible) G[99,101](as)", - "[111,112]Dependency(DEP,basic) D[111,112](.) G[3,7](need)" }; - - String[] posTags = { "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "<root-POS>", - "CC", "CD", "DT", "FW", "IN", "JJ", "JJR", "JJS", "MD", "NN", "NNP", "NNPS", "NNS", - "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "TO", "VB", "VBD", "VBG", "VBN", - "VBP", "VBZ", "WDT", "WP", "WRB", "``" }; - - String[] unmappedPos = { "<root-POS>"}; - - String[] depTags = { "<no-type>", "ADJP", "ADVP", "CONJP", "DEP", "FRAG", - "NAC", "NP", "NP-OBJ", "NP-PRD", "NP-SBJ", "NX", "PP", "PRN", "PRT", "QP", "ROOT", - "S", "SBAR", "SINV", "SQ", "UCP", "VP", "WHNP" }; - - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Dependency.class, "conll2008", depTags, jcas); - } - - private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) - throws Exception - { - AssumeResource.assumeResource(MstParser.class, "parser", aLanguage, aVariant); - - AggregateBuilder aggregate = new AggregateBuilder(); - - Assume.assumeFalse("HunPos currently hangs indefinitely on Windows: Issue #1099", - System.getProperty("os.name").toLowerCase(Locale.US).contains("win")); - - aggregate.add(createEngineDescription(HunPosTagger.class)); - Object[] params = new Object[] { - MstParser.PARAM_VARIANT, aVariant, - MstParser.PARAM_PRINT_TAGSET, true}; - params = ArrayUtils.addAll(params, aExtraParams); - aggregate.add(createEngineDescription(MstParser.class, params)); - - return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.mstparser; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertDependencies; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; + +import java.util.Locale; + +import org.apache.commons.lang3.ArrayUtils; +import org.apache.uima.fit.factory.AggregateBuilder; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.hunpos.HunPosTagger; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Assume; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; + +public class MstParserTest +{ + @Ignore("Takes too long") + @Test + public void testCroatianMte5Defnpout() + throws Exception + { + JCas jcas = runTest("hr", "mte5.defnpout", "Moramo vrlo kompliciran primjer rečenicu , " + + "koja sadrži što više sastojaka i ovisnosti što je više moguće ."); + + String[] dependencies = { + "[ 0, 6]Dependency(Pred,basic) D[0,6](Moramo) G[41,42](,)", + "[ 7, 11]Dependency(Adv,basic) D[7,11](vrlo) G[12,23](kompliciran)", + "[ 12, 23]Dependency(Atr,basic) D[12,23](kompliciran) G[24,31](primjer)", + "[ 24, 31]Dependency(Ap,basic) D[24,31](primjer) G[32,40](rečenicu)", + "[ 32, 40]Dependency(Sb,basic) D[32,40](rečenicu) G[0,6](Moramo)", + "[ 41, 42]Dependency(Punc,basic) D[41,42](,) G[48,54](sadrži)", + "[ 43, 47]Dependency(Sb,basic) D[43,47](koja) G[48,54](sadrži)", + "[ 48, 54]ROOT(Pred,basic) D[48,54](sadrži) G[48,54](sadrži)", + "[ 55, 58]Dependency(Pred,basic) D[55,58](što) G[74,75](i)", + "[ 59, 63]Dependency(Oth,basic) D[59,63](više) G[74,75](i)", + "[ 64, 73]Dependency(Atr,basic) D[64,73](sastojaka) G[59,63](više)", + "[ 74, 75]Dependency(Co,basic) D[74,75](i) G[48,54](sadrži)", + "[ 76, 85]Dependency(Pred,basic) D[76,85](ovisnosti) G[74,75](i)", + "[ 86, 89]ROOT(Pred,basic) D[86,89](što) G[86,89](što)", + "[ 90, 92]ROOT(Pred,basic) D[90,92](je) G[90,92](je)", + "[ 93, 97]Dependency(Adv,basic) D[93,97](više) G[98,104](moguće)", + "[ 98,104]Dependency(Pnom,basic) D[98,104](moguće) G[90,92](je)", + "[105,106]ROOT(Punc,basic) D[105,106](.) G[105,106](.)" }; + + String[] posTags = { "<root-POS>", "Afmnpa-", "Afpfsn-", "Afpmpgy", "Afpmply", + "Afpnpgy", "Afpnpn-", "Agcfpn", "Agcfsa", "Agcfsg", "Agcfsi", "Agcfsn", "Agcmpa", + "Agcmpg", "Agcmpn", "Agcmsg", "Agcmsl", "Agcmsn", "Agcnsa", "Agcnsn", "Agpfpa", + "Agpfpd", "Agpfpg", "Agpfpi", "Agpfpl", "Agpfpn", "Agpfsa", "Agpfsd", "Agpfsg", + "Agpfsi", "Agpfsl", "Agpfsn", "Agpmpa", "Agpmpd", "Agpmpg", "Agpmpi", "Agpmpl", + "Agpmpn", "Agpmsan", "Agpmsay", "Agpmsd", "Agpmsg", "Agpmsi", "Agpmsl", "Agpmsn", + "Agpngs", "Agpnpa", "Agpnpg", "Agpnpi", "Agpnpl", "Agpnpn", "Agpnsa", "Agpnsd", + "Agpnsg", "Agpnsl", "Agpnsn", "Agsfpa", "Agsfpg", "Agsfpn", "Agsfsa", "Agsfsg", + "Agsfsi", "Agsfsl", "Agsfsn", "Agsmpa", "Agsmpg", "Agsmpn", "Agsmsan", "Agsmsd", + "Agsmsn", "Agsnpg", "Agsnsn", "Appfpg", "Appfpl", "Appfpn", "Appfsa", "Appfsg", + "Appfsl", "Appfsn", "Appmpa", "Appmpd", "Appmpg", "Appmpi", "Appmpn", "Appmsan", + "Appmsay", "Appmsd", "Appmsg", "Appmsi", "Appmsl", "Appmsn", "Appnpa", "Appnpn", + "Appnsg", "Appnsl", "Appnsn", "Apsfsg", "Aspfpn", "Aspfsl", "Aspfsn", "Aspmsd", + "Aspmsn", "Aspnsa", "Aspnsg", "Cc", "Ccs", "Cs", "Css", "M", "Mc-p-l", "Mc-pal", + "Mc-pgl", "Mc-s-l", "Mcfp-l", "Mcfpal", "Mcfpgl", "Mcfpnl", "Mcfsal-", "Mcfsgl-", + "Mcfsll-", "Mcfsnl-", "Mcmpal", "Mcmpan", "Mcmpgl", "Mcmpnl", "Mcmsal", + "Mcmsal---n", "Mcmsal---y", "Mcmsgl", "Mcmsil-", "Mcmsll", "Mcmsnl", "Mcnpnl", + "Mcnsal-", "Mcnsnl-", "Ms-s-l", "Msfpgl", "Msfpnl", "N--pg", "N--pn", "N-fpa", + "N-fpd", "N-fpg", "N-fpi", "N-fpl", "N-fpn", "N-fsa", "N-fsd", "N-fsg", "N-fsi", + "N-fsl", "N-fsn", "N-mpa", "N-mpd", "N-mpg", "N-mpi", "N-mpl", "N-mpn", "N-msa", + "N-msan", "N-msay", "N-msd", "N-msg", "N-msi", "N-msl", "N-msn", "N-msv", "N-npa", + "N-npd", "N-npg", "N-npi", "N-npl", "N-npn", "N-nsa", "N-nsd", "N-nsg", "N-nsi", + "N-nsl", "N-nsn", "Ncfpn", "Ncfsg", "Ncfsl", "Ncfsn", "Ncmpa", "Ncmpg", "Ncmpl", + "Ncmpn", "Ncmsg", "Ncmsi", "Ncmsl", "Ncmsn", "Np-si", "Np-sn", "Npmsi", "Npmsn", + "Pd-fpa--n-a--", "Pd-fpg--n-a--", "Pd-fpn--n-a--", "Pd-fsa--n-a--", + "Pd-fsg--n-a--", "Pd-fsi--n-a--", "Pd-fsl--n-a--", "Pd-fsn--n-a--", + "Pd-mpa--n-a--", "Pd-mpg--n-a--", "Pd-mpi--n-a--", "Pd-mpn--n-a--", + "Pd-msa--n-a-n", "Pd-msd--n-a--", "Pd-msg--n-a--", "Pd-msi--n-a--", + "Pd-msl--n-a--", "Pd-msn--n-a--", "Pd-npa--n-a--", "Pd-npi--n-a--", + "Pd-nsa--n-a--", "Pd-nsg--n-a--", "Pd-nsi--n-a--", "Pd-nsl--n-a--", + "Pd-nsn--n-a--", "Pi-fpa--n-a--", "Pi-fpd--n-a--", "Pi-fpg--n-a--", + "Pi-fpi--n-a--", "Pi-fpl--n-a--", "Pi-fpn--n-a--", "Pi-fsa--n-a--", + "Pi-fsg--n-a--", "Pi-fsi--n-a--", "Pi-fsl--n-a--", "Pi-fsn--n-a--", + "Pi-mpa--n-a--", "Pi-mpd--n-a--", "Pi-mpg--n-a--", "Pi-mpi--n-a--", + "Pi-mpl--n-a--", "Pi-mpn--n-a--", "Pi-msa--n-a--", "Pi-msa--n-a-n", + "Pi-msa--n-a-y", "Pi-msd--n-a--", "Pi-msg--n-a--", "Pi-msi--n-a--", + "Pi-msl--n-a--", "Pi-msn--n-a--", "Pi-npa--n-a--", "Pi-npi--n-a--", + "Pi-npl--n-a--", "Pi-npn--n-a--", "Pi-nsa--n-a--", "Pi-nsd--n-a--", + "Pi-nsg--n-a--", "Pi-nsi--n-a--", "Pi-nsl--n-a--", "Pi-nsn--n-a--", + "Pi3m-a--n-n-y", "Pi3m-d--n-n-y", "Pi3m-n--n-n-y", "Pi3n-a--n-n-n", + "Pi3n-g--n-n-n", "Pi3n-i--n-n-n", "Pi3n-i--y-n-n", "Pi3n-n--n-n-n", "Pi3nsn----a", + "Pp1-pa--n-n--", "Pp1-pd--y-n--", "Pp1-pn--n-n--", "Pp1-sa--n-n--", + "Pp1-sa--y-n--", "Pp1-sd--y-n--", "Pp1-sn--n-n--", "Pp2-pd--y-n--", + "Pp3-pa--y-n--", "Pp3-pd--y-n--", "Pp3-pg--n-n--", "Pp3-pg--y-n--", + "Pp3fsa--y-n--", "Pp3fsd--y-n--", "Pp3fsi--n-n--", "Pp3fsn--n-n--", + "Pp3mpn--n-n--", "Pp3msa--n-n--", "Pp3msa--y-n--", "Pp3msd--n-n--", + "Pp3msd--y-n--", "Pp3msg--n-n--", "Pp3msi--n-n--", "Pp3msn--n-n--", + "Pp3npn--n-n--", "Pp3nsn--n-n--", "Ps1fpgp-n-a--", "Ps1fsgp-n-a--", + "Ps1mpgp-n-a--", "Ps1msnp-n-a--", "Ps1msns-n-a--", "Ps1nsnp-n-a--", + "Ps3fpap-n-a--", "Ps3fpgsfn-a--", "Ps3fpnsmn-a--", "Ps3fsgsmn-a--", + "Ps3fsnsfn-a--", "Ps3fsnsmn-a--", "Ps3mpasmn-a--", "Ps3mpgsfn-a--", + "Ps3mpgsnn-a--", "Ps3mpnp-n-a--", "Ps3msgsmn-a--", "Ps3mslsmn-a--", + "Ps3mslsnn-a--", "Ps3msnp-n-a--", "Ps3msnsfn-a--", "Ps3msnsmn-a--", + "Ps3npgsmn-a--", "Ps3nplsmn-a--", "Ps3nsisfn-a--", "Ps3nsnsfn-a--", "Px--sa--ypn-", + "Px--sa--ypn--", "Px--sd--ypn--", "Px-fpa--nsa--", "Px-fpg--nsa--", + "Px-fsa--nsa--", "Px-fsg--nsa--", "Px-fsl--nsa--", "Px-mpa--nsa--", + "Px-mpl--nsa--", "Px-msa--nsa-n", "Px-msg--nsa--", "Px-msi--nsa--", + "Px-msl--nsa--", "Px-nsa--nsa--", "Qo", "Qq", "Qr", "Qz", "Rgc", "Rgp", "Rgs", + "Rl", "Rlp", "Rnp", "Rp", "Rs", "Rt", "Rtp", "Sa", "Sd", "Sg", "Si", "Sl", "Spsa", + "Spsg", "Spsi", "Spsl", "Var1p", "Var1s", "Var2p", "Var3p", "Var3p-y", "Var3s", + "Var3s-y", "Vca1s", "Vca2s", "Vca3p", "Vca3s", "Vcia3s", "Vcip3p", "Vcip3s", "Vcn", + "Vcp-pf", "Vcp-pm", "Vcp-pn", "Vcp-sf", "Vcp-sm", "Vcp-sn", "Vcpp", "Vcps-sna", + "Vcr1p", "Vcr1p-y", "Vcr1s", "Vcr2p", "Vcr3p", "Vcr3p-y", "Vcr3s", "Vcr3s-y", + "Vma3s", "Vmip3p", "Vmip3s", "Vmm1p", "Vmm2p", "Vmm2s", "Vmn", "Vmp-pf", "Vmp-pm", + "Vmp-pn", "Vmp-sf", "Vmp-sm", "Vmp-sn", "Vmps-pma", "Vmps-sma", "Vmps-snp", + "Vmr1p", "Vmr1s", "Vmr2p", "Vmr3p", "Vmr3s", "Vmr3s-y", "X", "Y", "Yn--n", "Yn-s-", + "Yn-sl", "Yn-sn", "Ynfpg", "Ynfsa", "Ynfsd", "Ynfsg", "Ynfsl", "Ynfsn", "Ynmpg", + "Ynmpn", "Ynmsa", "Ynmsd", "Ynmsg", "Ynmsi", "Ynmsl", "Ynmsn", "Z" }; + + //String[] unmappedPosTags = { "$", "''", "-LRB-", "-RRB-", "<root-POS>", "``" }; + + String[] depTags = { "<no-type>", "Adv", "Ap", "Atr", "Atv", "Aux", "Co", + "Elp", "Obj", "Oth", "Pnom", "Pred", "Prep", "Punc", "Sb", "Sub" }; + + String[] posOrig = { "Vmr1p", "Rgp", "Agpmsn", "N-msn", "N-msn", "Z", "Pi-fsn--n-a", + "Vmr3s", "Pi3n-a--n-nn", "Sg", "N-mpg", "Cc", "Vmn", "Pi3n-n--n-nn", "Vcr3s", "Rgc", + "Agpnsn", "Z" }; + + String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", + "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS" }; + + assertPOS(posMapped, posOrig, select(jcas, POS.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(MstParser.class, POS.class, "mte5-reduced", posTags, jcas); + //assertTagsetMapping(POS.class, "mte5", unmappedPosTags, jcas); + assertTagset(MstParser.class, Dependency.class, "setimes.hr", depTags, jcas); + } + + /** + * The POS tags produced by Hunpos are MSD tags. This model here only uses the first character + * of these MSD tags. Thus, we have a tag mismatch and the results here are completely bogus. + * + * @throws Exception + * if an error occurs. + */ + @Ignore("Takes too long") + @Test + public void testCroatianMte5Pos() + throws Exception + { + JCas jcas = runTest("hr", "mte5.pos", "Moramo vrlo kompliciran primjer rečenicu , " + + "koja sadrži što više sastojaka i ovisnosti što je više moguće ."); + + String[] dependencies = { + "[ 0, 6]Dependency(Oth,basic) D[0,6](Moramo) G[12,23](kompliciran)", + "[ 7, 11]Dependency(Oth,basic) D[7,11](vrlo) G[12,23](kompliciran)", + "[ 12, 23]Dependency(Oth,basic) D[12,23](kompliciran) G[24,31](primjer)", + "[ 24, 31]Dependency(Oth,basic) D[24,31](primjer) G[32,40](rečenicu)", + "[ 32, 40]Dependency(Punc,basic) D[32,40](rečenicu) G[41,42](,)", + "[ 41, 42]Dependency(Punc,basic) D[41,42](,) G[48,54](sadrži)", + "[ 43, 47]Dependency(Oth,basic) D[43,47](koja) G[48,54](sadrži)", + "[ 48, 54]Dependency(Oth,basic) D[48,54](sadrži) G[74,75](i)", + "[ 55, 58]Dependency(Oth,basic) D[55,58](što) G[74,75](i)", + "[ 59, 63]Dependency(Atr,basic) D[59,63](više) G[74,75](i)", + "[ 64, 73]Dependency(Oth,basic) D[64,73](sastojaka) G[59,63](više)", + "[ 74, 75]ROOT(Co,basic) D[74,75](i) G[74,75](i)", + "[ 76, 85]Dependency(Oth,basic) D[76,85](ovisnosti) G[98,104](moguće)", + "[ 86, 89]Dependency(Oth,basic) D[86,89](što) G[98,104](moguće)", + "[ 90, 92]Dependency(Oth,basic) D[90,92](je) G[98,104](moguće)", + "[ 93, 97]Dependency(Oth,basic) D[93,97](više) G[98,104](moguće)", + "[ 98,104]Dependency(Punc,basic) D[98,104](moguće) G[105,106](.)", + "[105,106]ROOT(Punc,basic) D[105,106](.) G[105,106](.)" }; + + String[] posTags = { "<root-POS>", "A", "C", "M", "N", "P", "Q", "R", "S", + "V", "X", "Y", "Z" }; + + //String[] unmappedPosTags = { "$", "''", "-LRB-", "-RRB-", "<root-POS>", "``" }; + + String[] depTags = { "<no-type>", "Adv", "Ap", "Atr", "Atv", "Aux", "Co", + "Elp", "Obj", "Oth", "Pnom", "Pred", "Prep", "Punc", "Sb", "Sub" }; + + String[] posOrig = { "Vmr1p", "Rgp", "Agpmsn", "N-msn", "N-msn", "Z", "Pi-fsn--n-a", + "Vmr3s", "Pi3n-a--n-nn", "Sg", "N-mpg", "Cc", "Vmn", "Pi3n-n--n-nn", "Vcr3s", "Rgc", + "Agpnsn", "Z" }; + + String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", + "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS" }; + + assertPOS(posMapped, posOrig, select(jcas, POS.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(MstParser.class, POS.class, "mte5-pos", posTags, jcas); + //assertTagsetMapping(POS.class, "mte5", unmappedPosTags, jcas); + assertTagset(MstParser.class, Dependency.class, "setimes.hr", depTags, jcas); + } + + /** + * This method runs the MSTParser for an example sentence and checks if it returns the correct + * annotations. An annotation consists of: dependency type, begin of dependency, end of + * dependency, begin of the head, end of the head + * + * @throws Exception + * if an error occurs. + */ + @Test + public void testEnglishDefault() + throws Exception + { + System.out.printf("Maximum memory: %d%n", Runtime.getRuntime().maxMemory()); + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 3000000000l); + + JCas jcas = runTest("en", null, "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] dependencies = { + "[ 0, 2]Dependency(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(null,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]Dependency(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]Dependency(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]Dependency(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]Dependency(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]Dependency(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 44, 45]Dependency(punct,basic) D[44,45](,) G[35,43](sentence)", + "[ 46, 51]Dependency(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]Dependency(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 61, 63]Dependency(prep,basic) D[61,63](as) G[52,60](contains)", + "[ 64, 68]Dependency(amod,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]Dependency(pobj,basic) D[69,81](constituents) G[61,63](as)", + "[ 82, 85]Dependency(cc,basic) D[82,85](and) G[69,81](constituents)", + "[ 86, 98]Dependency(conj,basic) D[86,98](dependencies) G[69,81](constituents)", + "[ 99,101]Dependency(dep,basic) D[99,101](as) G[61,63](as)", + "[102,110]Dependency(pobj,basic) D[102,110](possible) G[99,101](as)", + "[111,112]Dependency(punct,basic) D[111,112](.) G[3,7](need)" }; + + String[] depTags = { "<no-type>", "abbrev", "acomp", "advcl", "advmod", + "amod", "appos", "attr", "aux", "auxpass", "cc", "ccomp", "complm", "conj", "cop", + "csubj", "csubjpass", "dep", "det", "dobj", "expl", "infmod", "iobj", "mark", + "measure", "neg", "nn", "nsubj", "nsubjpass", "null", "num", "number", "parataxis", + "partmod", "pcomp", "pobj", "poss", "possessive", "preconj", "pred", "predet", + "prep", "prt", "punct", "purpcl", "quantmod", "rcmod", "rel", "tmod", "xcomp" }; + + String[] posTags = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", ":", + "<root-POS>", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", + "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", + "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", + "WRB", "``" }; + + String[] unmappedPos = { "<root-POS>"}; + + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ptb", posTags, jcas); + assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); + assertTagset(Dependency.class, "stanford", depTags, jcas); + } + + /** + * This method runs the MSTParser for an example sentence and checks if it returns the correct + * annotations. An annotation consists of: dependency type, begin of dependency, end of + * dependency, begin of the head, end of the head + * + * @throws Exception + * if an error occurs. + */ + @Test + public void testEnglishSample() + throws Exception + { + JCas jcas = runTest("en", "sample", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] dependencies = { + "[ 0, 2]Dependency(NP-SBJ,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(ROOT,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]Dependency(DEP,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]Dependency(DEP,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]Dependency(DEP,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]Dependency(DEP,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]Dependency(NP-OBJ,basic) D[35,43](sentence) G[3,7](need)", + "[ 44, 45]Dependency(DEP,basic) D[44,45](,) G[3,7](need)", + "[ 46, 51]Dependency(SBAR,basic) D[46,51](which) G[3,7](need)", + "[ 52, 60]Dependency(S,basic) D[52,60](contains) G[46,51](which)", + "[ 61, 63]Dependency(PP,basic) D[61,63](as) G[52,60](contains)", + "[ 64, 68]Dependency(DEP,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]Dependency(NP,basic) D[69,81](constituents) G[61,63](as)", + "[ 82, 85]Dependency(DEP,basic) D[82,85](and) G[86,98](dependencies)", + "[ 86, 98]Dependency(NP,basic) D[86,98](dependencies) G[61,63](as)", + "[ 99,101]Dependency(PP,basic) D[99,101](as) G[86,98](dependencies)", + "[102,110]Dependency(ADJP,basic) D[102,110](possible) G[99,101](as)", + "[111,112]Dependency(DEP,basic) D[111,112](.) G[3,7](need)" }; + + String[] posTags = { "$", "''", ",", "-LRB-", "-RRB-", ".", ":", "<root-POS>", + "CC", "CD", "DT", "FW", "IN", "JJ", "JJR", "JJS", "MD", "NN", "NNP", "NNPS", "NNS", + "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "TO", "VB", "VBD", "VBG", "VBN", + "VBP", "VBZ", "WDT", "WP", "WRB", "``" }; + + String[] unmappedPos = { "<root-POS>"}; + + String[] depTags = { "<no-type>", "ADJP", "ADVP", "CONJP", "DEP", "FRAG", + "NAC", "NP", "NP-OBJ", "NP-PRD", "NP-SBJ", "NX", "PP", "PRN", "PRT", "QP", "ROOT", + "S", "SBAR", "SINV", "SQ", "UCP", "VP", "WHNP" }; + + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ptb", posTags, jcas); + assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); + assertTagset(Dependency.class, "conll2008", depTags, jcas); + } + + private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) + throws Exception + { + AssumeResource.assumeResource(MstParser.class, "parser", aLanguage, aVariant); + + AggregateBuilder aggregate = new AggregateBuilder(); + + Assume.assumeFalse("HunPos currently hangs indefinitely on Windows: Issue #1099", + System.getProperty("os.name").toLowerCase(Locale.US).contains("win")); + Assume.assumeTrue("HunPos does not run on OS X Catalina or higher", + System.getProperty("os.name").toLowerCase(Locale.US).contains("mac") && + !System.getProperty("os.version").matches("10\\.([0-9]|1[0-4]).*")); + + aggregate.add(createEngineDescription(HunPosTagger.class)); + Object[] params = new Object[] { + MstParser.PARAM_VARIANT, aVariant, + MstParser.PARAM_PRINT_TAGSET, true}; + params = ArrayUtils.addAll(params, aExtraParams); + aggregate.add(createEngineDescription(MstParser.class, params)); + + return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); } diff --git a/dkpro-core-mstparser-asl/src/test/resources/log4j.properties b/dkpro-core-mstparser-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-mstparser-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-mstparser-asl/src/test/resources/log4j2.xml b/dkpro-core-mstparser-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-mstparser-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-mystem-asl/LICENSE.txt b/dkpro-core-mystem-asl/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/dkpro-core-mystem-asl/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dkpro-core-mystem-asl/pom.xml b/dkpro-core-mystem-asl/pom.xml new file mode 100644 index 0000000000..21cad2e2d8 --- /dev/null +++ b/dkpro-core-mystem-asl/pom.xml @@ -0,0 +1,101 @@ +<!-- + Licensed to the Technische Universität Darmstadt under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The Technische Universität Darmstadt + licenses this file to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> + <relativePath>../dkpro-core-asl</relativePath> + </parent> + <artifactId>dkpro-core-mystem-asl</artifactId> + <url>https://dkpro.github.io/dkpro-core/</url> + <name>DKPro Core ASL - MyStem</name> + <dependencies> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimaj-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-core</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-mystem-bin</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-featurepath-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + <dependencyManagement> + <dependencies> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-mystem-bin</artifactId> + <version>20180116.0</version> + </dependency> + </dependencies> + </dependencyManagement> + <build> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <configuration> + <usedDependencies combine.children="append"> + <usedDependency>org.dkpro.core:dkpro-core-mystem-bin</usedDependency> + </usedDependencies> + </configuration> + </plugin> + </plugins> + </pluginManagement> + </build> +</project> \ No newline at end of file diff --git a/dkpro-core-mystem-asl/src/main/java/org/dkpro/core/mystem/MyStemStemmer.java b/dkpro-core-mystem-asl/src/main/java/org/dkpro/core/mystem/MyStemStemmer.java new file mode 100644 index 0000000000..a1554f21cc --- /dev/null +++ b/dkpro-core-mystem-asl/src/main/java/org/dkpro/core/mystem/MyStemStemmer.java @@ -0,0 +1,291 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.mystem; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.output.FileWriterWithEncoding; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.pear.util.FileUtil; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.resources.PlatformDetector; +import org.dkpro.core.api.resources.RuntimeProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * This MyStem stemmer implementation only works with the Russian language. + */ +@Component(OperationType.STEMMER) +@ResourceMetaData(name = "MyStem Stemmer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("ru") +@TypeCapability(inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem" }) +public class MyStemStemmer + extends FeaturePathAnnotatorBase +{ + + private static final String MESSAGE_DIGEST = MyStemStemmer.class.getName() + "_Messages"; + + private RuntimeProvider runtimeProvider; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + catch32BitOperatinSystemUsers(); + + runtimeProvider = new RuntimeProvider("classpath:/org/dkpro/core/mystem/bin/"); + } + + private void catch32BitOperatinSystemUsers() + { + PlatformDetector detector = new PlatformDetector(); + if (detector.getArch().equals(PlatformDetector.ARCH_X86_32)) { + throw new UnsupportedOperationException("Only 64bit operating systems supported"); + } + } + + @Override + protected Set<String> getDefaultPaths() + { + return Collections.singleton(Token.class.getName()); + } + + @Override + protected void generateAnnotations(JCas aJCas) + throws FeaturePathException, AnalysisEngineProcessException + { + + // CAS is necessary to retrieve values + CAS currCAS = aJCas.getCas(); + + // Try language set in CAS. + String lang = aJCas.getDocumentLanguage(); + + if (StringUtils.isBlank(lang)) { + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null); + } + + lang = lang.toLowerCase(Locale.US); + + if (!"ru".equals(lang)) { // Only specified language is supported + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "unsupported_language_error", + new Object[] { lang }); + } + + for (String path : paths) { + // Separate Typename and featurepath + String[] segments = path.split("/", 2); + String typeName = segments[0]; + + // Try to get the type from the typesystem of the CAS + Type t = CasUtil.getType(currCAS, typeName); + if (t == null) { + throw new IllegalStateException("Type [" + typeName + "] not found in type system"); + } + + // get an fpi object and initialize it + // initialize the FeaturePathInfo with the corresponding part + initializeFeaturePathInfoFrom(fp, segments); + + // get the annotations + AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t); + FSIterator<?> iterator = idx.iterator(); + + List<AnnotationFS> afs = new ArrayList<>(); + iterator.forEachRemaining(x -> afs.add((AnnotationFS) x)); + + // get the stems + PlatformDetector pd = new PlatformDetector(); + String platform = pd.getPlatformId(); + getLogger().info("Load binary for platform: [" + platform + "]"); + + File executableFile = getExecutable(); + + File inputFile = prepareInputfile(aJCas); + File outputFile = prepareOutputFile(); + + List<String> cmd = new ArrayList<>(); + cmd.add(executableFile.getAbsolutePath()); + cmd.add("-n"); // one word per line output + cmd.add("-l"); // suppress input token form and output only stem + cmd.add(inputFile.getAbsolutePath()); + cmd.add(outputFile.getAbsolutePath()); + + runProcess(cmd); + + List<String> l = readStemmerOutput(outputFile); + + if (afs.size() != l.size()) { + throw new AnalysisEngineProcessException(new IllegalStateException( + "Number of [" + t.getName() + "] annotations [" + afs.size() + + "] does not match with number of stems [" + l.size() + "]")); + } + + for (int i = 0; i < l.size(); i++) { + + AnnotationFS fs = afs.get(i); + String stem = l.get(i); + + if (this.filterFeaturePath != null) { + // check annotation filter condition + if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { + createStemAnnotation(aJCas, fs, stem); + } + } + else { // no annotation filter specified + createStemAnnotation(aJCas, fs, stem); + } + } + } + } + + private void createStemAnnotation(JCas aJCas, AnnotationFS fs, String stem) + throws AnalysisEngineProcessException + { + + // Check for blank text, it makes no sense to add a stem then (and raised an + // exception) + String value = fp.getValue(fs); + if (!StringUtils.isBlank(value)) { + Stem stemAnnot = new Stem(aJCas, fs.getBegin(), fs.getEnd()); + + stemAnnot.setValue(stem); + stemAnnot.addToIndexes(aJCas); + + // Try setting the "stem" feature on Tokens. + Feature feat = fs.getType().getFeatureByBaseName("stem"); + if (feat != null && feat.getRange() != null + && aJCas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { + fs.setFeatureValue(feat, stemAnnot); + } + } + } + + private List<String> readStemmerOutput(File outputFile) throws AnalysisEngineProcessException + { + List<String> readLines; + try { + readLines = FileUtils.readLines(outputFile, "utf-8"); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + return readLines; + } + + private void runProcess(List<String> cmd) throws AnalysisEngineProcessException + { + try { + ProcessBuilder pb = new ProcessBuilder(); + pb.inheritIO(); + pb.command(cmd); + Process p = pb.start(); + p.waitFor(); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + private File prepareOutputFile() throws AnalysisEngineProcessException + { + try { + File file = FileUtil.createTempFile("mystemOutput" + System.currentTimeMillis(), + ".txt"); + file.deleteOnExit(); + return file; + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + private File prepareInputfile(JCas aJCas) throws AnalysisEngineProcessException + { + File inputTmp = null; + try { + inputTmp = FileUtil.createTempFile("mystemInput" + System.currentTimeMillis(), ".txt"); + + try (BufferedWriter wrt = new BufferedWriter( + new FileWriterWithEncoding(inputTmp, "utf-8"))) { + Iterator<Token> iterator = JCasUtil.select(aJCas, Token.class).iterator(); + while (iterator.hasNext()) { + Token next = iterator.next(); + wrt.write(next.getCoveredText()); + if (iterator.hasNext()) { + wrt.write(" "); + } + } + } + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + + if (inputTmp != null) { + inputTmp.deleteOnExit(); + } + return inputTmp; + } + + private File getExecutable() throws AnalysisEngineProcessException + { + File exec = null; + try { + exec = runtimeProvider.getFile("mystem"); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + return exec; + } +} diff --git a/dkpro-core-mystem-asl/src/scripts/build.xml b/dkpro-core-mystem-asl/src/scripts/build.xml new file mode 100644 index 0000000000..71b1c769ed --- /dev/null +++ b/dkpro-core-mystem-asl/src/scripts/build.xml @@ -0,0 +1,130 @@ +<!-- + Copyright 2018 + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project basedir="../.." default="separate-jars"> + <import> + <url url="https://raw.githubusercontent.com/dkpro/resource-packager/0.8.0/ant-macros.xml"/> + </import> + + <!-- + - Output package configuration + --> + <property name="outputPackage" value="org/dkpro/core/mystem/lib"/> + + <target name="local-maven"> + <property name="install-artifact-mode" value="local"/> + <antcall target="separate-jars"/> + </target> + + <target name="remote-maven"> + <property name="install-artifact-mode" value="remote"/> + <antcall target="separate-jars"/> + </target> + + <target name="separate-jars" depends="install-executables"/> + + + <target name="install-executables"> + <property name="version.bin" value="20180116.0"/> + <mkdir dir="target/download"/> + + <!-- OSX --> + <get + src="http://download.cdn.yandex.net/mystem/mystem-3.1-macosx.tar.gz" + dest="target/download/mystem-3.1-macosx.tar.gz" + skipexisting="true"/> + <gunzip + src="target/download/mystem-3.1-macosx.tar.gz" + dest="target/download/mystem-3.1-macosx.tar"/> + <untar + src="target/download/mystem-3.1-macosx.tar" + dest="target/model-staging/org/dkpro/core/mystem/bin/osx-x86_64"> + <patternset> + <include name="*"/> + </patternset> + </untar> + <propertyfile + file="target/model-staging/org/dkpro/core/mystem/bin/osx-x86_64/manifest.properties"> + <entry key="mystem" value="executable"/> + </propertyfile> + + <!-- Linux 64 --> + <get + src="http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz" + dest="target/download/mystem-3.1-linux-64bit.tar.gz" + skipexisting="true"/> + <gunzip + src="target/download/mystem-3.1-linux-64bit.tar.gz" + dest="target/download/mystem-3.1-linux-64bit.tar"/> + <untar + src="target/download/mystem-3.1-linux-64bit.tar" + dest="target/model-staging/org/dkpro/core/mystem/bin/linux-x86_64"> + <patternset> + <include name="*" /> + </patternset> + </untar> + <propertyfile + file="target/model-staging/org/dkpro/core/mystem/bin/linux-x86_64/manifest.properties"> + <entry key="mystem" value="executable"/> + </propertyfile> + + <!-- Window 64bit --> + <get + src="http://download.cdn.yandex.net/mystem/mystem-3.1-win-64bit.zip" + dest="target/download/mystem-3.1-win-64bit.zip" + skipexisting="true"/> + <mkdir dir="target/model-staging/org/dkpro/core/mystem/bin/windows-x86_64"/> + <unzip src="target/download/mystem-3.1-win-64bit.zip" + dest="target/model-staging/org/dkpro/core/mystem/bin/windows-x86_64"> + <patternset> + <include name="*"/> + </patternset> + </unzip> + <move file="target/model-staging/org/dkpro/core/mystem/bin/windows-x86_64/mystem.exe" + tofile="target/model-staging/org/dkpro/core/mystem/bin/windows-x86_64/mystem"/> + <propertyfile + file="target/model-staging/org/dkpro/core/mystem/bin/windows-x86_64/manifest.properties"> + <entry key="mystem" value="executable"/> + </propertyfile> + + + <echo file="target/model-staging/org/dkpro/core/mystem/bin/README"> + MyStem version 3.1 + </echo> + + <jar + destfile="target/dkpro-core-mystem-bin-${version.bin}.jar" + compress="true"> + <fileset dir="target/model-staging"> + <include name="META-INF/**/*"/> + <include name="**/*"/> + </fileset> + </jar> + + <generate-pom + groupId="org.dkpro.core" + artifactId="dkpro-core-mystem-bin" + version="${version.bin}"/> + + <install-artifact + file="target/dkpro-core-mystem-bin-${version.bin}.jar" + groupId="org.dkpro.core" + artifactId="dkpro-core-mystem-bin" + version="${version.bin}"/> + + </target> +</project> \ No newline at end of file diff --git a/dkpro-core-mystem-asl/src/test/java/org/dkpro/core/mystem/MyStemStemmerTest.java b/dkpro-core-mystem-asl/src/test/java/org/dkpro/core/mystem/MyStemStemmerTest.java new file mode 100644 index 0000000000..0851e36a75 --- /dev/null +++ b/dkpro-core-mystem-asl/src/test/java/org/dkpro/core/mystem/MyStemStemmerTest.java @@ -0,0 +1,53 @@ +package org.dkpro.core.mystem; +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; + +public class MyStemStemmerTest +{ + @Test + public void testRussian() throws Exception + { + runTest("ru", "Не печатать исходные словоформы, только леммы и граммемы.", new String[] { + "не", "печатать", "исходный", "словоформа", "только", "лемма", "и", "граммема" }); + } + + private JCas runTest(String aLanguage, String aText, String[] aStems, Object... aParams) + throws Exception + { + JCas result = TestRunner.runTest(createEngineDescription(MyStemStemmer.class, aParams), + aLanguage, aText); + AssertAnnotations.assertStem(aStems, select(result, Stem.class)); + + return result; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-ngrams-asl/pom.xml b/dkpro-core-ngrams-asl/pom.xml index febb765ad7..abae23296d 100644 --- a/dkpro-core-ngrams-asl/pom.xml +++ b/dkpro-core-ngrams-asl/pom.xml @@ -1,52 +1,57 @@ <!-- - Copyright 2010 - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt + Copyright 2010 + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> - <relativePath>../dkpro-core-asl</relativePath> - </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.ngrams-asl</artifactId> - <packaging>jar</packaging> - <name>DKPro Core ASL - N-Gram Tools</name> - <dependencies> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimaj-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimafit-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - </dependencies> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> + <relativePath>../dkpro-core-asl</relativePath> + </parent> + <artifactId>dkpro-core-ngrams-asl</artifactId> + <packaging>jar</packaging> + <name>DKPro Core ASL - N-Gram Tools</name> + <url>https://dkpro.github.io/dkpro-core/</url> + <dependencies> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimaj-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + </dependencies> </project> diff --git a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramAnnotator.java b/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramAnnotator.java deleted file mode 100644 index cfa75031c9..0000000000 --- a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramAnnotator.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ngrams; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * N-gram annotator. - */ -@ResourceMetaData(name="N-Gram Annotator") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram"}) -public class NGramAnnotator - extends JCasAnnotator_ImplBase -{ - /** - * The length of the n-grams to generate (the "n" in n-gram). - */ - public static final String PARAM_N = "N"; - @ConfigurationParameter(name = PARAM_N, mandatory = true, defaultValue = "3") - private int n; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - for (Sentence s : select(aJCas, Sentence.class)) { - for (NGram ngram : NGramIterable.create(selectCovered(Token.class, s), n)) { - ngram.addToIndexes(); - } - } - } -} diff --git a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramIterable.java b/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramIterable.java deleted file mode 100644 index d8b94c7525..0000000000 --- a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramIterable.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ngrams; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.cas.CASException; -import org.apache.uima.cas.text.AnnotationFS; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram; - -public class NGramIterable<T extends AnnotationFS> - implements Iterable<NGram> -{ - List<NGram> nGramList; - - private NGramIterable(Iterable<T> tokens, int n) - { - this.nGramList = createNGramList(tokens, n); - } - - public static <T extends AnnotationFS> NGramIterable<T> create(Iterable<T> tokens, int n) - { - return new NGramIterable<T>(tokens, n); - } - - @Override - public Iterator<NGram> iterator() - { - return nGramList.iterator(); - } - - private List<NGram> createNGramList(Iterable<T> tokens, int n) - { - List<NGram> nGrams = new ArrayList<NGram>(); - - // fill token list - List<T> tokenList = new ArrayList<T>(); - for (T t : tokens) { - tokenList.add(t); - } - - // remove last element, if it contains a punctuation mark - if (tokenList.size() > 0) { - String lastElementText = tokenList.get(tokenList.size() - 1).getCoveredText(); - if (lastElementText.length() == 1 - && (lastElementText.equals(".") - || lastElementText.equals("!") || lastElementText.equals("?"))) { - tokenList.remove(tokenList.size() - 1); - } - } - - for (int k = 1; k <= n; k++) { - // if the number of tokens is less than k => break - if (tokenList.size() < k) { - break; - } - nGrams.addAll(getNGrams(tokenList, k)); - } - - return nGrams; - } - - private List<NGram> getNGrams(List<T> tokenList, int k) - { - List<NGram> nGrams = new ArrayList<NGram>(); - - int size = tokenList.size(); - for (int i = 0; i < (size + 1 - k); i++) { - try { - NGram ngram = new NGram(tokenList.get(i).getCAS().getJCas(), tokenList.get(i) - .getBegin(), tokenList.get(i + k - 1).getEnd()); - ngram.setText(getTokenText(tokenList, i, i + k - 1)); - nGrams.add(ngram); - } - catch (CASException e) { - throw new IllegalStateException(e); - } - } - - return nGrams; - } - - private String getTokenText(List<T> tokenList, int start, int end) - { - List<String> tokenTexts = new ArrayList<String>(); - for (int i = start; i <= end; i++) { - tokenTexts.add(tokenList.get(i).getCoveredText()); - } - return StringUtils.join(tokenTexts, " "); - } -} diff --git a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/package-info.java b/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/package-info.java deleted file mode 100644 index 2410c5afd6..0000000000 --- a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * N-gram annotator. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.ngrams; diff --git a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringIterable.java b/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringIterable.java deleted file mode 100644 index 666ab12c26..0000000000 --- a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringIterable.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright 2011 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ngrams.util; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; - -/** - * Creates a NGram iterable from a list of tokens. - * It does not detect any sentence boundaries. - * Thus, one should make sure to only add lists that reflect a sentence or a phrase. - * - * - */ -public class NGramStringIterable implements Iterable<String> -{ - List<String> nGramList; - -// /** -// * @param tokens An iterable of annotations. -// * The {@link JCasUtil} method toText() is called to created the string representation. -// */ -// public NGramStringIterable(Iterable<AnnotationFS> annotations, int minN, int maxN) -// { -// this.nGramList = createNGramList(JCasUtil.toText(annotations), minN, maxN); -// } - - /** - * @param tokens - * An iterable of tokens. - * @param minN - * the minimal n-gram length. - * @param maxN - * the maximal n-gram length. - */ - public NGramStringIterable(Iterable<String> tokens, int minN, int maxN) - { - this.nGramList = createNGramList(tokens, minN, maxN); - } - - /** - * @param tokens An array of tokens. - * @param minN - * the minimal n-gram length. - * @param maxN - * the maximal n-gram length. - */ - public NGramStringIterable(String[] tokens, int minN, int maxN) - { - this.nGramList = createNGramList(Arrays.asList(tokens), minN, maxN); - } - - - @Override - public Iterator<String> iterator() - { - return nGramList.iterator(); - } - - private List<String> createNGramList(Iterable<String> tokens, int minN, int maxN) - { - if (minN > maxN) { - throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); - } - - List<String> nGrams = new ArrayList<String>(); - - // fill token list - List<String> tokenList = new ArrayList<String>(); - for (String t : tokens) { - tokenList.add(t); - } - - for (int k = minN; k <= maxN; k++) { - // if the number of tokens is less than k => break - if (tokenList.size() < k) { - break; - } - nGrams.addAll(getNGrams(tokenList, k)); - } - - return nGrams; - } - - private List<String> getNGrams(List<String> tokenList, int k) - { - List<String> nGrams = new ArrayList<String>(); - - int size = tokenList.size(); - for (int i = 0; i < (size + 1 - k); i++) { - nGrams.add( - StringUtils.join(tokenList.subList(i, i + k), ' ') - ); - } - - return nGrams; - } -} diff --git a/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/NGramAnnotator.java b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/NGramAnnotator.java new file mode 100644 index 0000000000..72cb858e20 --- /dev/null +++ b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/NGramAnnotator.java @@ -0,0 +1,66 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ngrams; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * N-gram annotator. + */ +@ResourceMetaData(name = "N-Gram Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram"}) +public class NGramAnnotator + extends JCasAnnotator_ImplBase +{ + /** + * The length of the n-grams to generate (the "n" in n-gram). + */ + public static final String PARAM_N = "N"; + @ConfigurationParameter(name = PARAM_N, mandatory = true, defaultValue = "3") + private int n; + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + for (Sentence s : select(aJCas, Sentence.class)) { + for (NGram ngram : NGramIterable.create(selectCovered(Token.class, s), n)) { + ngram.addToIndexes(); + } + } + } +} diff --git a/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/NGramIterable.java b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/NGramIterable.java new file mode 100644 index 0000000000..751947ecec --- /dev/null +++ b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/NGramIterable.java @@ -0,0 +1,110 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ngrams; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.text.AnnotationFS; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram; + +public class NGramIterable<T extends AnnotationFS> + implements Iterable<NGram> +{ + List<NGram> nGramList; + + private NGramIterable(Iterable<T> tokens, int n) + { + this.nGramList = createNGramList(tokens, n); + } + + public static <T extends AnnotationFS> NGramIterable<T> create(Iterable<T> tokens, int n) + { + return new NGramIterable<T>(tokens, n); + } + + @Override + public Iterator<NGram> iterator() + { + return nGramList.iterator(); + } + + private List<NGram> createNGramList(Iterable<T> tokens, int n) + { + List<NGram> nGrams = new ArrayList<NGram>(); + + // fill token list + List<T> tokenList = new ArrayList<T>(); + for (T t : tokens) { + tokenList.add(t); + } + + // remove last element, if it contains a punctuation mark + if (tokenList.size() > 0) { + String lastElementText = tokenList.get(tokenList.size() - 1).getCoveredText(); + if (lastElementText.length() == 1 + && (lastElementText.equals(".") + || lastElementText.equals("!") || lastElementText.equals("?"))) { + tokenList.remove(tokenList.size() - 1); + } + } + + for (int k = 1; k <= n; k++) { + // if the number of tokens is less than k => break + if (tokenList.size() < k) { + break; + } + nGrams.addAll(getNGrams(tokenList, k)); + } + + return nGrams; + } + + private List<NGram> getNGrams(List<T> tokenList, int k) + { + List<NGram> nGrams = new ArrayList<NGram>(); + + int size = tokenList.size(); + for (int i = 0; i < (size + 1 - k); i++) { + try { + NGram ngram = new NGram(tokenList.get(i).getCAS().getJCas(), tokenList.get(i) + .getBegin(), tokenList.get(i + k - 1).getEnd()); + ngram.setText(getTokenText(tokenList, i, i + k - 1)); + nGrams.add(ngram); + } + catch (CASException e) { + throw new IllegalStateException(e); + } + } + + return nGrams; + } + + private String getTokenText(List<T> tokenList, int start, int end) + { + List<String> tokenTexts = new ArrayList<String>(); + for (int i = start; i <= end; i++) { + tokenTexts.add(tokenList.get(i).getCoveredText()); + } + return StringUtils.join(tokenTexts, " "); + } +} diff --git a/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/package-info.java b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/package-info.java new file mode 100644 index 0000000000..7a0da03cdf --- /dev/null +++ b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * N-gram annotator. + * + * @since 1.1.0 + */ +package org.dkpro.core.ngrams; diff --git a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/CharacterNGramStringIterable.java b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/CharacterNGramStringIterable.java similarity index 94% rename from dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/CharacterNGramStringIterable.java rename to dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/CharacterNGramStringIterable.java index b4b0ae56ca..238c653ff9 100644 --- a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/CharacterNGramStringIterable.java +++ b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/CharacterNGramStringIterable.java @@ -1,89 +1,89 @@ -/* - * Copyright 2011 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ngrams.util; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; - -/** - * Creates a character NGram iterable from a list of tokens. - */ -public class CharacterNGramStringIterable - implements Iterable<String> -{ - List<String> nGramList; - - /** - * @param token - * A token - * @param minN - * the minimal n-gram length. - * @param maxN - * the maximal n-gram length. - */ - public CharacterNGramStringIterable(String token, int minN, int maxN) - { - this.nGramList = createNGramList(token, minN, maxN); - } - - @Override - public Iterator<String> iterator() - { - return nGramList.iterator(); - } - - private List<String> createNGramList(String token, int minN, int maxN) - { - if (minN > maxN) { - throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); - } - - List<String> nGrams = new ArrayList<String>(); - - // fill character list - List<String> charList = new ArrayList<String>(); - for (char c : token.toCharArray()) { - charList.add(Character.toString(c)); - } - - for (int k = minN; k <= maxN; k++) { - // if the number of tokens is less than k => break - if (charList.size() < k) { - break; - } - nGrams.addAll(getNGrams(charList, k)); - } - - return nGrams; - } - - private List<String> getNGrams(List<String> tokenList, int k) - { - List<String> nGrams = new ArrayList<String>(); - - int size = tokenList.size(); - for (int i = 0; i < (size + 1 - k); i++) { - nGrams.add(StringUtils.join(tokenList.subList(i, i + k), "")); - } - - return nGrams; - } +/* + * Copyright 2011 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ngrams.util; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; + +/** + * Creates a character NGram iterable from a list of tokens. + */ +public class CharacterNGramStringIterable + implements Iterable<String> +{ + List<String> nGramList; + + /** + * @param token + * A token + * @param minN + * the minimal n-gram length. + * @param maxN + * the maximal n-gram length. + */ + public CharacterNGramStringIterable(String token, int minN, int maxN) + { + this.nGramList = createNGramList(token, minN, maxN); + } + + @Override + public Iterator<String> iterator() + { + return nGramList.iterator(); + } + + private List<String> createNGramList(String token, int minN, int maxN) + { + if (minN > maxN) { + throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); + } + + List<String> nGrams = new ArrayList<String>(); + + // fill character list + List<String> charList = new ArrayList<String>(); + for (char c : token.toCharArray()) { + charList.add(Character.toString(c)); + } + + for (int k = minN; k <= maxN; k++) { + // if the number of tokens is less than k => break + if (charList.size() < k) { + break; + } + nGrams.addAll(getNGrams(charList, k)); + } + + return nGrams; + } + + private List<String> getNGrams(List<String> tokenList, int k) + { + List<String> nGrams = new ArrayList<String>(); + + int size = tokenList.size(); + for (int i = 0; i < (size + 1 - k); i++) { + nGrams.add(StringUtils.join(tokenList.subList(i, i + k), "")); + } + + return nGrams; + } } diff --git a/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/NGramStringIterable.java b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/NGramStringIterable.java new file mode 100644 index 0000000000..38cd81e5c2 --- /dev/null +++ b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/NGramStringIterable.java @@ -0,0 +1,117 @@ +/* + * Copyright 2011 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ngrams.util; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; + +/** + * Creates a NGram iterable from a list of tokens. + * It does not detect any sentence boundaries. + * Thus, one should make sure to only add lists that reflect a sentence or a phrase. + * + * + */ +public class NGramStringIterable implements Iterable<String> +{ + List<String> nGramList; + +// /** +// * @param tokens An iterable of annotations. +// * The {@link JCasUtil} method toText() is called to created the string representation. +// */ +// public NGramStringIterable(Iterable<AnnotationFS> annotations, int minN, int maxN) +// { +// this.nGramList = createNGramList(JCasUtil.toText(annotations), minN, maxN); +// } + + /** + * @param tokens + * An iterable of tokens. + * @param minN + * the minimal n-gram length. + * @param maxN + * the maximal n-gram length. + */ + public NGramStringIterable(Iterable<String> tokens, int minN, int maxN) + { + this.nGramList = createNGramList(tokens, minN, maxN); + } + + /** + * @param tokens An array of tokens. + * @param minN + * the minimal n-gram length. + * @param maxN + * the maximal n-gram length. + */ + public NGramStringIterable(String[] tokens, int minN, int maxN) + { + this.nGramList = createNGramList(Arrays.asList(tokens), minN, maxN); + } + + + @Override + public Iterator<String> iterator() + { + return nGramList.iterator(); + } + + private List<String> createNGramList(Iterable<String> tokens, int minN, int maxN) + { + if (minN > maxN) { + throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); + } + + List<String> nGrams = new ArrayList<String>(); + + // fill token list + List<String> tokenList = new ArrayList<String>(); + for (String t : tokens) { + tokenList.add(t); + } + + for (int k = minN; k <= maxN; k++) { + // if the number of tokens is less than k => break + if (tokenList.size() < k) { + break; + } + nGrams.addAll(getNGrams(tokenList, k)); + } + + return nGrams; + } + + private List<String> getNGrams(List<String> tokenList, int k) + { + List<String> nGrams = new ArrayList<String>(); + + int size = tokenList.size(); + for (int i = 0; i < (size + 1 - k); i++) { + nGrams.add( + StringUtils.join(tokenList.subList(i, i + k), ' ') + ); + } + + return nGrams; + } +} diff --git a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringListIterable.java b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/NGramStringListIterable.java similarity index 95% rename from dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringListIterable.java rename to dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/NGramStringListIterable.java index 5def2d335f..4f95e92504 100644 --- a/dkpro-core-ngrams-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringListIterable.java +++ b/dkpro-core-ngrams-asl/src/main/java/org/dkpro/core/ngrams/util/NGramStringListIterable.java @@ -1,104 +1,104 @@ -/* - * Copyright 2011 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ngrams.util; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -/** - * Creates a NGram iterable from a list of tokens. It does not detect any sentence boundaries. Thus, - * one should make sure to only add lists that reflect a sentence or a phrase. - * - * - */ -public class NGramStringListIterable - implements Iterable<List<String>> -{ - List<List<String>> nGramList; - - /** - * @param tokens - * An iterable of tokens. - * @param minN - * minimum n-gram length. - * @param maxN - * maximum n-gram length. - */ - public NGramStringListIterable(Iterable<String> tokens, int minN, int maxN) - { - this.nGramList = createNGramList(tokens, minN, maxN); - } - - /** - * @param tokens - * An array of tokens. - * @param minN - * minimum n-gram length. - * @param maxN - * maximum n-gram length. - */ - public NGramStringListIterable(String[] tokens, int minN, int maxN) - { - this.nGramList = createNGramList(Arrays.asList(tokens), minN, maxN); - } - - @Override - public Iterator<List<String>> iterator() - { - return nGramList.iterator(); - } - - private List<List<String>> createNGramList(Iterable<String> tokens, int minN, int maxN) - { - if (minN > maxN) { - throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); - } - - List<List<String>> nGrams = new ArrayList<List<String>>(); - - // fill token list - List<String> tokenList = new ArrayList<String>(); - for (String t : tokens) { - tokenList.add(t); - } - - for (int k = minN; k <= maxN; k++) { - // if the number of tokens is less than k => break - if (tokenList.size() < k) { - break; - } - nGrams.addAll(getNGrams(tokenList, k)); - } - - return nGrams; - } - - private List<List<String>> getNGrams(List<String> tokenList, int k) - { - List<List<String>> nGrams = new ArrayList<List<String>>(); - - int size = tokenList.size(); - for (int i = 0; i < (size + 1 - k); i++) { - nGrams.add(tokenList.subList(i, i + k)); - } - - return nGrams; - } +/* + * Copyright 2011 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ngrams.util; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +/** + * Creates a NGram iterable from a list of tokens. It does not detect any sentence boundaries. Thus, + * one should make sure to only add lists that reflect a sentence or a phrase. + * + * + */ +public class NGramStringListIterable + implements Iterable<List<String>> +{ + List<List<String>> nGramList; + + /** + * @param tokens + * An iterable of tokens. + * @param minN + * minimum n-gram length. + * @param maxN + * maximum n-gram length. + */ + public NGramStringListIterable(Iterable<String> tokens, int minN, int maxN) + { + this.nGramList = createNGramList(tokens, minN, maxN); + } + + /** + * @param tokens + * An array of tokens. + * @param minN + * minimum n-gram length. + * @param maxN + * maximum n-gram length. + */ + public NGramStringListIterable(String[] tokens, int minN, int maxN) + { + this.nGramList = createNGramList(Arrays.asList(tokens), minN, maxN); + } + + @Override + public Iterator<List<String>> iterator() + { + return nGramList.iterator(); + } + + private List<List<String>> createNGramList(Iterable<String> tokens, int minN, int maxN) + { + if (minN > maxN) { + throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); + } + + List<List<String>> nGrams = new ArrayList<List<String>>(); + + // fill token list + List<String> tokenList = new ArrayList<String>(); + for (String t : tokens) { + tokenList.add(t); + } + + for (int k = minN; k <= maxN; k++) { + // if the number of tokens is less than k => break + if (tokenList.size() < k) { + break; + } + nGrams.addAll(getNGrams(tokenList, k)); + } + + return nGrams; + } + + private List<List<String>> getNGrams(List<String> tokenList, int k) + { + List<List<String>> nGrams = new ArrayList<List<String>>(); + + int size = tokenList.size(); + for (int i = 0; i < (size + 1 - k); i++) { + nGrams.add(tokenList.subList(i, i + k)); + } + + return nGrams; + } } diff --git a/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramAnnotatorTest.java b/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramAnnotatorTest.java deleted file mode 100644 index 83ca27d9a4..0000000000 --- a/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/NGramAnnotatorTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.ngrams; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertTrue; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -public -class NGramAnnotatorTest -{ - @Test - public - void ngramAnnotatorTest() - throws Exception - { - AnalysisEngine ae = createEngine(NGramAnnotator.class); - JCas jcas = ae.newJCas(); - - JCasBuilder jb = new JCasBuilder(jcas); - int begin1 = jb.getPosition(); - jb.add("example", Token.class); - jb.add(" "); - jb.add("sentence", Token.class); - jb.add(" "); - jb.add("funny", Token.class); - jb.add(begin1, Sentence.class); - jb.add("."); - - int begin2 = jb.getPosition(); - jb.add("second", Token.class); - jb.add(" "); - jb.add("example", Token.class); - jb.add(begin2, Sentence.class); - jb.add("."); - - jb.close(); - - ae.process(jcas); - - int i = 0; - for (NGram ngram : select(jcas, NGram.class)) { - assertTrue(i != 0 || "example sentence funny".equals(ngram.getText())); - assertTrue(i != 1 || "example sentence".equals(ngram.getText())); - assertTrue(i != 2 || "example".equals(ngram.getText())); - assertTrue(i != 3 || "sentence funny".equals(ngram.getText())); - assertTrue(i != 4 || "sentence".equals(ngram.getText())); - assertTrue(i != 5 || "funny".equals(ngram.getText())); - assertTrue(i != 6 || "second example".equals(ngram.getText())); - assertTrue(i != 7 || "second".equals(ngram.getText())); - assertTrue(i != 8 || "example".equals(ngram.getText())); - i++; - } - } -} diff --git a/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/NGramAnnotatorTest.java b/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/NGramAnnotatorTest.java new file mode 100644 index 0000000000..77e356fa92 --- /dev/null +++ b/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/NGramAnnotatorTest.java @@ -0,0 +1,76 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.ngrams; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertTrue; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class NGramAnnotatorTest +{ + @Test + public void ngramAnnotatorTest() throws Exception + { + AnalysisEngine ae = createEngine(NGramAnnotator.class); + JCas jcas = ae.newJCas(); + + JCasBuilder jb = new JCasBuilder(jcas); + int begin1 = jb.getPosition(); + jb.add("example", Token.class); + jb.add(" "); + jb.add("sentence", Token.class); + jb.add(" "); + jb.add("funny", Token.class); + jb.add(begin1, Sentence.class); + jb.add("."); + + int begin2 = jb.getPosition(); + jb.add("second", Token.class); + jb.add(" "); + jb.add("example", Token.class); + jb.add(begin2, Sentence.class); + jb.add("."); + + jb.close(); + + ae.process(jcas); + + int i = 0; + for (NGram ngram : select(jcas, NGram.class)) { + assertTrue(i != 0 || "example sentence funny".equals(ngram.getText())); + assertTrue(i != 1 || "example sentence".equals(ngram.getText())); + assertTrue(i != 2 || "example".equals(ngram.getText())); + assertTrue(i != 3 || "sentence funny".equals(ngram.getText())); + assertTrue(i != 4 || "sentence".equals(ngram.getText())); + assertTrue(i != 5 || "funny".equals(ngram.getText())); + assertTrue(i != 6 || "second example".equals(ngram.getText())); + assertTrue(i != 7 || "second".equals(ngram.getText())); + assertTrue(i != 8 || "example".equals(ngram.getText())); + i++; + } + } +} diff --git a/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/CharacterNGramStringIterableTest.java b/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/CharacterNGramStringIterableTest.java similarity index 87% rename from dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/CharacterNGramStringIterableTest.java rename to dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/CharacterNGramStringIterableTest.java index d4c9765c10..0ca5021bbe 100644 --- a/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/CharacterNGramStringIterableTest.java +++ b/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/CharacterNGramStringIterableTest.java @@ -15,8 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.ngrams.util; - +package org.dkpro.core.ngrams.util; import static org.junit.Assert.assertEquals; @@ -25,16 +24,17 @@ public class CharacterNGramStringIterableTest { @Test - public void ngramTest() { - + public void ngramTest() + { + String token = "Token"; - - int i=0; + + int i = 0; for (String ngram : new CharacterNGramStringIterable(token, 2, 2)) { - if (i==0) { + if (i == 0) { assertEquals("To", ngram); } - + System.out.println(ngram); i++; } diff --git a/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringIterableTest.java b/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/NGramStringIterableTest.java similarity index 87% rename from dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringIterableTest.java rename to dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/NGramStringIterableTest.java index 4defbc0f9d..f9886c492b 100644 --- a/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringIterableTest.java +++ b/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/NGramStringIterableTest.java @@ -15,8 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.ngrams.util; - +package org.dkpro.core.ngrams.util; import static org.junit.Assert.assertEquals; @@ -25,16 +24,17 @@ public class NGramStringIterableTest { @Test - public void ngramTest() { - + public void ngramTest() + { + String[] tokens = "This is a simple example sentence .".split(" "); - - int i=0; + + int i = 0; for (String ngram : new NGramStringIterable(tokens, 2, 2)) { - if (i==0) { + if (i == 0) { assertEquals("This is", ngram); } - + System.out.println(ngram); i++; } diff --git a/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringListIterableTest.java b/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/NGramStringListIterableTest.java similarity index 89% rename from dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringListIterableTest.java rename to dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/NGramStringListIterableTest.java index bcef88f94c..8cb9e3597e 100644 --- a/dkpro-core-ngrams-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/ngrams/util/NGramStringListIterableTest.java +++ b/dkpro-core-ngrams-asl/src/test/java/org/dkpro/core/ngrams/util/NGramStringListIterableTest.java @@ -15,8 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.ngrams.util; - +package org.dkpro.core.ngrams.util; import static org.junit.Assert.assertEquals; @@ -28,17 +27,17 @@ public class NGramStringListIterableTest { @Test - public void ngramTest() { - + public void ngramTest() + { String[] tokens = "This is a simple example sentence .".split(" "); - - int i=0; + + int i = 0; for (List<String> ngram : new NGramStringListIterable(tokens, 2, 2)) { - if (i==0) { + if (i == 0) { assertEquals(2, ngram.size()); assertEquals("This is", StringUtils.join(ngram, " ")); } - + System.out.println(ngram); i++; } diff --git a/dkpro-core-nlp4j-asl/pom.xml b/dkpro-core-nlp4j-asl/pom.xml index d38477b8db..a2f9102e84 100644 --- a/dkpro-core-nlp4j-asl/pom.xml +++ b/dkpro-core-nlp4j-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.nlp4j-asl</artifactId> + <artifactId>dkpro-core-nlp4j-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - NLP4J</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> <nlp4j.version>1.1.3</nlp4j.version> </properties> @@ -55,36 +56,40 @@ </dependency> --> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.ner-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-ner-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.io-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-io-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -92,8 +97,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JLemmatizer.java b/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JLemmatizer.java deleted file mode 100644 index 5e3ff0ee2c..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JLemmatizer.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.io.IOException; -import java.net.URL; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.emory.mathcs.nlp.common.util.StringUtils; -import edu.emory.mathcs.nlp.component.morph.MorphAnalyzer; -import edu.emory.mathcs.nlp.component.morph.english.EnglishMorphAnalyzer; - -/** - * Emory NLP4J lemmatizer. This is a lower-casing lemmatizer. - */ -@ResourceMetaData(name="NLP4J Lemmatizer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) -public class Nlp4JLemmatizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - private ModelProviderBase<MorphAnalyzer> modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<MorphAnalyzer>() { - { - setContextObject(Nlp4JLemmatizer.this); - setDefault(LOCATION, NOT_REQUIRED + "-${language}"); - setOverride(LANGUAGE, language); - } - - @Override - protected MorphAnalyzer produceResource(URL aUrl) - throws IOException - { - String language = getAggregatedProperties().getProperty(LANGUAGE); - - if (!language.equals("en")) { - throw new IllegalArgumentException(new Throwable( - "Emory NLP4J supports only English")); - } - - return new EnglishMorphAnalyzer(); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - modelProvider.configure(aJCas.getCas()); - - MorphAnalyzer lemmatizer = modelProvider.getResource(); - - for (Token t : select(aJCas, Token.class)) { - String pos = null; - if (t.getPos() != null) { - pos = t.getPos().getPosValue(); - } - - Lemma lemma = new Lemma(aJCas, t.getBegin(), t.getEnd()); - lemma.setValue(lemmatizer.lemmatize(StringUtils.toSimplifiedForm(t.getText()), - pos)); - lemma.addToIndexes(); - - t.setLemma(lemma); - } - } -} diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JPosTagger.java b/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JPosTagger.java deleted file mode 100644 index 6a06de92d2..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JPosTagger.java +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Set; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.EmoryNlp2Uima; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.EmoryNlpUtils; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.Uima2EmoryNlp; -import edu.emory.mathcs.nlp.component.pos.POSState; -import edu.emory.mathcs.nlp.component.template.OnlineComponent; -import edu.emory.mathcs.nlp.component.template.node.NLPNode; -import edu.emory.mathcs.nlp.common.util.NLPUtils; - -/** - * Part-of-Speech annotator using Emory NLP4J. Requires {@link Sentence}s to be annotated before. - */ -@ResourceMetaData(name="NLP4J POS-Tagger") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) -public class Nlp4JPosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - /** - * Process anyway, even if the model relies on features that are not supported by this - * component. - * - * Default: {@code false} - */ - public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; - @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") - protected boolean ignoreMissingFeatures; - - private Nlp4JPosTaggerModelProvider modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new Nlp4JPosTaggerModelProvider(this); - - // General setup of the mapping provider in initialize() - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - // Document-specific configuration of model and mapping provider in process() - modelProvider.configure(cas); - - // Mind the mapping provider must be configured after the model provider as it uses the - // model metadata - mappingProvider.configure(cas); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List<Token> tokens = selectCovered(aJCas, Token.class, sentence); - NLPNode[] nodes = Uima2EmoryNlp.convertSentence(tokens); - - // Process the sentences - new results will be stored in the existing NLPNodes - modelProvider.getResource().process(nodes); - - EmoryNlp2Uima.convertPos(cas, tokens, nodes, mappingProvider, internTags); - } - } - - private class Nlp4JPosTaggerModelProvider - extends ModelProviderBase<OnlineComponent<NLPNode, POSState<NLPNode>>> - { - public Nlp4JPosTaggerModelProvider(Object aOwner) - { - super(aOwner, "nlp4j", "tagger"); - } - - @Override - protected OnlineComponent<NLPNode, POSState<NLPNode>> produceResource(InputStream aStream) - throws Exception - { - String language = getAggregatedProperties().getProperty(LANGUAGE); - - if (!language.equals("en")) { - throw new IllegalArgumentException(new Throwable( - "Emory NLP4J supports only English")); - } - - EmoryNlpUtils.initGlobalLexica(); - - // Load the POS tagger model from the location the model provider offers - OnlineComponent<NLPNode, POSState<NLPNode>> component = (OnlineComponent) - NLPUtils.getComponent(aStream); - - // Extract tagset information from the model - OnlineComponentTagsetDescriptionProvider<NLPNode, POSState<NLPNode>> tsdp = - new OnlineComponentTagsetDescriptionProvider<>( - getResourceMetaData().getProperty("pos.tagset"), POS.class, component); - addTagset(tsdp); - - if (printTagSet) { - getContext().getLogger().log(INFO, tsdp.toString()); - } - - Set<String> features = EmoryNlpUtils.extractFeatures(component); - getLogger().info("Model uses these features: " + features); - - - Set<String> unsupportedFeatures = EmoryNlpUtils.extractUnsupportedFeatures(component); - if (!unsupportedFeatures.isEmpty()) { - String message = "Model these uses unsupported features: " + unsupportedFeatures; - if (ignoreMissingFeatures) { - getLogger().warn(message); - } - else { - throw new IOException(message); - } - } - - // Create a new POS tagger instance from the loaded model - return component; - } - }; -} diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JDependencyParser.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JDependencyParser.java similarity index 76% rename from dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JDependencyParser.java rename to dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JDependencyParser.java index 7b6744246c..acc20e8e3f 100644 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JDependencyParser.java +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JDependencyParser.java @@ -15,11 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createDependencyMappingProvider; import java.io.IOException; import java.io.InputStream; @@ -37,27 +38,31 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.nlp4j.internal.EmoryNlp2Uima; +import org.dkpro.core.nlp4j.internal.EmoryNlpUtils; +import org.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; +import org.dkpro.core.nlp4j.internal.Uima2EmoryNlp; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.EmoryNlp2Uima; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.EmoryNlpUtils; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.Uima2EmoryNlp; import edu.emory.mathcs.nlp.common.util.NLPUtils; import edu.emory.mathcs.nlp.component.dep.DEPState; import edu.emory.mathcs.nlp.component.template.OnlineComponent; import edu.emory.mathcs.nlp.component.template.node.NLPNode; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Emory NLP4J dependency parser. */ -@ResourceMetaData(name="NLP4J Dependency Parser") +@Component(OperationType.DEPENDENCY_PARSER) +@ResourceMetaData(name = "NLP4J Dependency Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -70,11 +75,9 @@ public class Nlp4JDependencyParser { /** * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") private boolean printTagSet; /** @@ -92,6 +95,20 @@ public class Nlp4JDependencyParser @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) private String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -99,26 +116,25 @@ public class Nlp4JDependencyParser @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) private String modelLocation; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; + public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = + ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) private String dependencyMappingLocation; - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - /** * Process anyway, even if the model relies on features that are not supported by this * component. - * - * Default: {@code false} */ public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") @@ -135,8 +151,8 @@ public void initialize(UimaContext aContext) modelProvider = new Nlp4JDependencyParserModelProvider(this); - mappingProvider = MappingProviderFactory.createDependencyMappingProvider( - dependencyMappingLocation, language, modelProvider); + mappingProvider = createDependencyMappingProvider(this, dependencyMappingLocation, language, + modelProvider); } @Override @@ -155,7 +171,7 @@ public void process(JCas aJCas) // Process the sentences - new results will be stored in the existing NLPNodes modelProvider.getResource().process(nodes); - EmoryNlp2Uima.convertDependencies(aJCas, tokens, nodes, mappingProvider, internTags); + EmoryNlp2Uima.convertDependencies(aJCas, tokens, nodes, mappingProvider); } } @@ -165,7 +181,12 @@ private class Nlp4JDependencyParserModelProvider public Nlp4JDependencyParserModelProvider(Object aObject) { super(aObject, "nlp4j", "parser"); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/parser-${language}-${variant}.properties"); } + @Override protected OnlineComponent<NLPNode, DEPState<NLPNode>> produceResource(InputStream aStream) throws Exception diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JLemmatizer.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JLemmatizer.java new file mode 100644 index 0000000000..0aabc1e872 --- /dev/null +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JLemmatizer.java @@ -0,0 +1,121 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.nlp4j; + +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.io.IOException; +import java.net.URL; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ModelProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.emory.mathcs.nlp.common.util.StringUtils; +import edu.emory.mathcs.nlp.component.morph.MorphAnalyzer; +import edu.emory.mathcs.nlp.component.morph.english.EnglishMorphAnalyzer; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Emory NLP4J lemmatizer. This is a lower-casing lemmatizer. + */ +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "NLP4J Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) +public class Nlp4JLemmatizer + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + private ModelProviderBase<MorphAnalyzer> modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<MorphAnalyzer>() { + { + setContextObject(Nlp4JLemmatizer.this); + setDefault(LOCATION, NOT_REQUIRED + "-${language}"); + setOverride(LANGUAGE, language); + } + + @Override + protected MorphAnalyzer produceResource(URL aUrl) + throws IOException + { + String language = getAggregatedProperties().getProperty(LANGUAGE); + + if (!language.equals("en")) { + throw new IllegalArgumentException(new Throwable( + "Emory NLP4J supports only English")); + } + + return new EnglishMorphAnalyzer(); + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + modelProvider.configure(aJCas.getCas()); + + MorphAnalyzer lemmatizer = modelProvider.getResource(); + + for (Token t : select(aJCas, Token.class)) { + String pos = null; + if (t.getPos() != null) { + pos = t.getPos().getPosValue(); + } + + Lemma lemma = new Lemma(aJCas, t.getBegin(), t.getEnd()); + lemma.setValue(lemmatizer.lemmatize(StringUtils.toSimplifiedForm(t.getText()), + pos)); + lemma.addToIndexes(); + + t.setLemma(lemma); + } + } +} diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java similarity index 78% rename from dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java rename to dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java index 045647bb4e..3da583e65e 100644 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -35,27 +35,32 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.nlp4j.internal.EmoryNlp2Uima; +import org.dkpro.core.nlp4j.internal.EmoryNlpUtils; +import org.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; +import org.dkpro.core.nlp4j.internal.Uima2EmoryNlp; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.EmoryNlp2Uima; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.EmoryNlpUtils; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; -import de.tudarmstadt.ukp.dkpro.core.nlp4j.internal.Uima2EmoryNlp; import edu.emory.mathcs.nlp.common.util.NLPUtils; import edu.emory.mathcs.nlp.component.ner.NERState; import edu.emory.mathcs.nlp.component.template.OnlineComponent; import edu.emory.mathcs.nlp.component.template.node.NLPNode; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Emory NLP4J name finder wrapper. */ -@ResourceMetaData(name="NLP4J Named Entity Recognizer") +@Component(OperationType.NAMED_ENTITITY_RECOGNIZER) +@ResourceMetaData(name = "NLP4J Named Entity Recognizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -89,6 +94,20 @@ public class Nlp4JNamedEntityRecognizer @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -99,23 +118,14 @@ public class Nlp4JNamedEntityRecognizer /** * Location of the mapping file for named entity tags to UIMA types. */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = + ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) protected String mappingLocation; - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - /** * Process anyway, even if the model relies on features that are not supported by this * component. - * - * Default: {@code false} */ public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") @@ -132,15 +142,8 @@ public void initialize(UimaContext aContext) modelProvider = new Nlp4JNamedEntityRecognizerModelProvider(this); - mappingProvider = new MappingProvider(); - mappingProvider - .setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/ner-default-variants.map"); - mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/nlp4j/lib/ner-${language}-${variant}.map"); - mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName()); - mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation); - mappingProvider.setOverride(MappingProvider.LANGUAGE, language); - mappingProvider.setOverride(MappingProvider.VARIANT, variant); + mappingProvider = MappingProviderFactory.createNerMappingProvider(this, mappingLocation, + language, variant, modelProvider); } @Override @@ -158,7 +161,7 @@ public void process(JCas aJCas) // Process the sentences - new results will be stored in the existing NLPNodes modelProvider.getResource().process(nodes); - EmoryNlp2Uima.convertNamedEntities(cas, tokens, nodes, mappingProvider, internTags); + EmoryNlp2Uima.convertNamedEntities(cas, tokens, nodes, mappingProvider); } } @@ -168,6 +171,9 @@ private class Nlp4JNamedEntityRecognizerModelProvider public Nlp4JNamedEntityRecognizerModelProvider(Object aOwner) { super(aOwner, "nlp4j", "ner"); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/ner-${language}-${variant}.properties"); } @Override diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JPosTagger.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JPosTagger.java new file mode 100644 index 0000000000..ad66128277 --- /dev/null +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JPosTagger.java @@ -0,0 +1,237 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.nlp4j; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Set; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.nlp4j.internal.EmoryNlp2Uima; +import org.dkpro.core.nlp4j.internal.EmoryNlpUtils; +import org.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; +import org.dkpro.core.nlp4j.internal.Uima2EmoryNlp; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.emory.mathcs.nlp.common.util.NLPUtils; +import edu.emory.mathcs.nlp.component.pos.POSState; +import edu.emory.mathcs.nlp.component.template.OnlineComponent; +import edu.emory.mathcs.nlp.component.template.node.NLPNode; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Part-of-Speech annotator using Emory NLP4J. Requires {@link Sentence}s to be annotated before. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "NLP4J POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) +public class Nlp4JPosTagger + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Process anyway, even if the model relies on features that are not supported by this + * component. + */ + public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; + @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") + protected boolean ignoreMissingFeatures; + + private Nlp4JPosTaggerModelProvider modelProvider; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new Nlp4JPosTaggerModelProvider(this); + + // General setup of the mapping provider in initialize() + mappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + // Document-specific configuration of model and mapping provider in process() + modelProvider.configure(cas); + + // Mind the mapping provider must be configured after the model provider as it uses the + // model metadata + mappingProvider.configure(cas); + + for (Sentence sentence : select(aJCas, Sentence.class)) { + List<Token> tokens = selectCovered(aJCas, Token.class, sentence); + NLPNode[] nodes = Uima2EmoryNlp.convertSentence(tokens); + + // Process the sentences - new results will be stored in the existing NLPNodes + modelProvider.getResource().process(nodes); + + EmoryNlp2Uima.convertPos(cas, tokens, nodes, mappingProvider); + } + } + + private class Nlp4JPosTaggerModelProvider + extends ModelProviderBase<OnlineComponent<NLPNode, POSState<NLPNode>>> + { + public Nlp4JPosTaggerModelProvider(Object aOwner) + { + super(aOwner, "nlp4j", "tagger"); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/tagger-${language}-${variant}.properties"); + } + + @Override + protected OnlineComponent<NLPNode, POSState<NLPNode>> produceResource(InputStream aStream) + throws Exception + { + String language = getAggregatedProperties().getProperty(LANGUAGE); + + if (!language.equals("en")) { + throw new IllegalArgumentException(new Throwable( + "Emory NLP4J supports only English")); + } + + EmoryNlpUtils.initGlobalLexica(); + + // Load the POS tagger model from the location the model provider offers + OnlineComponent<NLPNode, POSState<NLPNode>> component = (OnlineComponent) + NLPUtils.getComponent(aStream); + + // Extract tagset information from the model + OnlineComponentTagsetDescriptionProvider<NLPNode, POSState<NLPNode>> tsdp = + new OnlineComponentTagsetDescriptionProvider<>( + getResourceMetaData().getProperty("pos.tagset"), POS.class, component); + addTagset(tsdp); + + if (printTagSet) { + getContext().getLogger().log(INFO, tsdp.toString()); + } + + Set<String> features = EmoryNlpUtils.extractFeatures(component); + getLogger().info("Model uses these features: " + features); + + + Set<String> unsupportedFeatures = EmoryNlpUtils.extractUnsupportedFeatures(component); + if (!unsupportedFeatures.isEmpty()) { + String message = "Model these uses unsupported features: " + unsupportedFeatures; + if (ignoreMissingFeatures) { + getLogger().warn(message); + } + else { + throw new IOException(message); + } + } + + // Create a new POS tagger instance from the loaded model + return component; + } + }; +} diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JSegmenter.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JSegmenter.java similarity index 89% rename from dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JSegmenter.java rename to dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JSegmenter.java index 856030eec4..1c158c10ad 100644 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JSegmenter.java +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JSegmenter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import java.io.IOException; import java.net.URL; @@ -28,19 +28,21 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; import edu.emory.mathcs.nlp.component.tokenizer.EnglishTokenizer; import edu.emory.mathcs.nlp.component.tokenizer.Tokenizer; import edu.emory.mathcs.nlp.component.tokenizer.token.Token; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Segmenter using Emory NLP4J. */ -@ResourceMetaData(name="NLP4J Segmenter") +@ResourceMetaData(name = "NLP4J Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java similarity index 84% rename from dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java rename to dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java index 32ca39f9d0..a821577675 100644 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j.internal; +package org.dkpro.core.nlp4j.internal; import java.util.List; @@ -23,12 +23,12 @@ import org.apache.uima.cas.Feature; import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.BilouDecoder; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.io.BilouDecoder; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; @@ -38,7 +38,7 @@ public class EmoryNlp2Uima { public static void convertPos(CAS aCas, List<Token> aTokens, NLPNode[] aNodes, - MappingProvider aMappingProvider, boolean internStrings) + MappingProvider aMappingProvider) { // EmoryNLP tokens start at 1 int i = 1; @@ -50,7 +50,7 @@ public static void convertPos(CAS aCas, List<Token> aTokens, NLPNode[] aNodes, Type posTag = aMappingProvider.getTagType(tag); POS posAnno = (POS) aCas.createAnnotation(posTag, t.getBegin(), t.getEnd()); // To save memory, we typically intern() tag strings - posAnno.setPosValue(internStrings ? tag.intern() : tag); + posAnno.setPosValue(tag != null ? tag.intern() : null); POSUtils.assignCoarseValue(posAnno); posAnno.addToIndexes(); @@ -61,7 +61,7 @@ public static void convertPos(CAS aCas, List<Token> aTokens, NLPNode[] aNodes, } public static void convertDependencies(JCas aJCas, List<Token> aTokens, NLPNode[] aNodes, - MappingProvider aMappingProvider, boolean aInternTags) + MappingProvider aMappingProvider) { for (int i = 1; i < aNodes.length; i++) { NLPNode depNode = aNodes[i]; @@ -74,7 +74,7 @@ public static void convertDependencies(JCas aJCas, List<Token> aTokens, NLPNode[ if (govNode.getID() != 0) { Type depRel = aMappingProvider.getTagType(label); Dependency dep = (Dependency) aJCas.getCas().createFS(depRel); - dep.setDependencyType(aInternTags ? label.intern() : label); + dep.setDependencyType(label != null ? label.intern() : null); dep.setDependent(aTokens.get(depNode.getID() - 1)); dep.setGovernor(aTokens.get(govNode.getID() - 1)); dep.setBegin(dep.getDependent().getBegin()); @@ -96,18 +96,17 @@ public static void convertDependencies(JCas aJCas, List<Token> aTokens, NLPNode[ } public static void convertNamedEntities(CAS aCas, List<Token> aTokens, NLPNode[] aNodes, - MappingProvider aMappingProvider, boolean aInternTags) + MappingProvider aMappingProvider) { Type neType = aCas.getTypeSystem().getType(NamedEntity.class.getName()); Feature valueFeat = neType.getFeatureByBaseName("value"); - String[] neTags = new String[aNodes.length-1]; + String[] neTags = new String[aNodes.length - 1]; for (int i = 1; i < aNodes.length; i++) { - neTags[i-1] = aNodes[i].getNamedEntityTag(); + neTags[i - 1] = aNodes[i].getNamedEntityTag(); } BilouDecoder decoder = new BilouDecoder(aCas, valueFeat, aMappingProvider); - decoder.setInternTags(aInternTags); decoder.decode(aTokens, neTags); } } diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/EmoryNlpUtils.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlpUtils.java similarity index 98% rename from dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/EmoryNlpUtils.java rename to dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlpUtils.java index a2cca92d03..acb0249f2a 100644 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/EmoryNlpUtils.java +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlpUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j.internal; +package org.dkpro.core.nlp4j.internal; import static java.util.Arrays.asList; @@ -30,18 +30,18 @@ import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import org.dkpro.core.api.resources.ResourceUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; import edu.emory.mathcs.nlp.common.collection.tree.PrefixTree; import edu.emory.mathcs.nlp.common.util.IOUtils; import edu.emory.mathcs.nlp.component.template.OnlineComponent; import edu.emory.mathcs.nlp.component.template.feature.FeatureItem; import edu.emory.mathcs.nlp.component.template.feature.Field; -import edu.emory.mathcs.nlp.component.template.node.NLPNode; import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexica; -import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexicon;; +import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexicon; +import edu.emory.mathcs.nlp.component.template.node.NLPNode; public class EmoryNlpUtils { diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java similarity index 89% rename from dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java rename to dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java index f7905baaf5..2a8cb22227 100644 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j.internal; +package org.dkpro.core.nlp4j.internal; import static java.util.Collections.singletonMap; @@ -23,13 +23,15 @@ import java.util.Set; import java.util.TreeSet; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.TagsetBase; +import org.dkpro.core.api.metadata.TagsetBase; + import edu.emory.mathcs.nlp.component.template.OnlineComponent; import edu.emory.mathcs.nlp.component.template.node.AbstractNLPNode; import edu.emory.mathcs.nlp.component.template.state.NLPState; import edu.emory.mathcs.nlp.learning.optimization.OnlineOptimizer; -public class OnlineComponentTagsetDescriptionProvider<N extends AbstractNLPNode<N>, S extends NLPState<N>> +public class OnlineComponentTagsetDescriptionProvider + <N extends AbstractNLPNode<N>, S extends NLPState<N>> extends TagsetBase { private String name; diff --git a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java similarity index 94% rename from dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java rename to dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java index 359e90d94f..7728b97cd8 100644 --- a/dkpro-core-nlp4j-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java +++ b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j.internal; +package org.dkpro.core.nlp4j.internal; import java.util.List; @@ -26,7 +26,7 @@ public class Uima2EmoryNlp { public static NLPNode[] convertSentence(List<Token> aTokens) { - NLPNode[] nodes = new NLPNode[aTokens.size()+1]; + NLPNode[] nodes = new NLPNode[aTokens.size() + 1]; nodes[0] = new NLPNode(); nodes[0].toRoot(); diff --git a/dkpro-core-nlp4j-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/ner-default-variants.map b/dkpro-core-nlp4j-asl/src/main/resources/org/dkpro/core/nlp4j/lib/ner-default-variants.map similarity index 100% rename from dkpro-core-nlp4j-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/ner-default-variants.map rename to dkpro-core-nlp4j-asl/src/main/resources/org/dkpro/core/nlp4j/lib/ner-default-variants.map diff --git a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/EnglishTokenizerTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/EnglishTokenizerTest.java similarity index 96% rename from dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/EnglishTokenizerTest.java rename to dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/EnglishTokenizerTest.java index f1a6f4f937..3872f57cee 100644 --- a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/EnglishTokenizerTest.java +++ b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/EnglishTokenizerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import java.util.List; diff --git a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java similarity index 94% rename from dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java rename to dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java index 0483a2de74..f5c6aac6bd 100644 --- a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java +++ b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; @@ -23,14 +23,16 @@ import org.apache.commons.lang3.ArrayUtils; import org.apache.uima.fit.factory.AggregateBuilder; import org.apache.uima.jcas.JCas; +import org.dkpro.core.nlp4j.Nlp4JDependencyParser; +import org.dkpro.core.nlp4j.Nlp4JPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class Nlp4JDependencyParserTest { diff --git a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java similarity index 88% rename from dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java rename to dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java index 1d37ffd4f0..e7980a1412 100644 --- a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java +++ b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java @@ -15,20 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.nlp4j.Nlp4JLemmatizer; +import org.dkpro.core.nlp4j.Nlp4JPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class Nlp4JLemmatizerTest { diff --git a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java similarity index 91% rename from dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java rename to dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java index aa29d93ac1..140cb58f70 100644 --- a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java +++ b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java @@ -15,21 +15,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.nlp4j.Nlp4JLemmatizer; +import org.dkpro.core.nlp4j.Nlp4JNamedEntityRecognizer; +import org.dkpro.core.nlp4j.Nlp4JPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class Nlp4JNamedEntityRecognizerTest { @@ -74,4 +77,5 @@ private JCas runTest(String language, String variant, String testDocument) } @Rule - public DkproTestContext testContext = new DkproTestContext();} + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java similarity index 79% rename from dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java rename to dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java index 8dfc4ae396..d2e6b244e7 100644 --- a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java +++ b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java @@ -15,20 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.nlp4j.Nlp4JPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class Nlp4JPosTaggerTest { @@ -57,16 +58,16 @@ public void testEnglish() throws Exception { runTest("en", null, "This is a test .", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("en", null, "A neural net .", - new String[] { "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + new String[] { "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); runTest("en", null, "John is purchasing oranges .", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); } private JCas runTest(String language, String variant, String testDocument, String[] tags, diff --git a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JSegmenterTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JSegmenterTest.java similarity index 88% rename from dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JSegmenterTest.java rename to dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JSegmenterTest.java index 81f0104d1d..205e2b176c 100644 --- a/dkpro-core-nlp4j-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/nlp4j/Nlp4JSegmenterTest.java +++ b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JSegmenterTest.java @@ -15,16 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.nlp4j; +package org.dkpro.core.nlp4j; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; + import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.nlp4j.Nlp4JSegmenter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; - public class Nlp4JSegmenterTest { @Test diff --git a/dkpro-core-nlp4j-asl/src/test/resources/log4j.properties b/dkpro-core-nlp4j-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-nlp4j-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-nlp4j-asl/src/test/resources/log4j2.xml b/dkpro-core-nlp4j-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-nlp4j-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-norvig-asl/pom.xml b/dkpro-core-norvig-asl/pom.xml index 3973de6a48..7606a54a6e 100644 --- a/dkpro-core-norvig-asl/pom.xml +++ b/dkpro-core-norvig-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.norvig-asl</artifactId> + <artifactId>dkpro-core-norvig-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Norvig's Spelling Correction</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -36,20 +37,40 @@ <artifactId>uimafit-core</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.anomaly-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-anomaly-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.transform-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-transform-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> </dependencies> + <build> + <plugins> + <plugin> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-maven-plugin</artifactId> + <configuration> + <!-- + There is no way to specify models for the NorvigSpellingCorrector on OpenMinTeD. + --> + <uimaDescriptorExcludes> + <exclude>**/NorvigSpellingCorrector.xml</exclude> + </uimaDescriptorExcludes> + </configuration> + </plugin> + </plugins> + </build> </project> diff --git a/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/NorvigSpellingAlgorithm.java b/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/NorvigSpellingAlgorithm.java deleted file mode 100644 index 0281812dce..0000000000 --- a/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/NorvigSpellingAlgorithm.java +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.norvig; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.net.URL; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import static java.lang.Character.isUpperCase; -import static java.lang.Character.toUpperCase; -import static java.lang.Double.parseDouble; - -/** - * Spelling corrector based on Norvig's algorithm. - * - * @see <a href="http://norvig.com/spell-correct.html">Norvig's algorithm</a> - */ -public class NorvigSpellingAlgorithm -{ - private final static Pattern WORD_PATTERN = Pattern.compile("\\w+"); - - private final Map<String, AtomicInteger> nWords = new HashMap<String, AtomicInteger>(); - - private Map<String, String> cachedCorrections = new HashMap<String, String>(); - private int bestScore = -1; - private String bestCandidate; - - protected void resetScore() - { - bestScore = -1; - bestCandidate = null; - } - - public void reset() - { - resetScore(); - cachedCorrections = new HashMap<String, String>(); - } - - /** - * Read words from the given reader and count their occurrences. - * - * @param aReader - * the reader. - * @throws IOException - * if the words cannot be read. - */ - public void train(Reader aReader) - throws IOException - { - BufferedReader in = new BufferedReader(aReader); - - String line = in.readLine(); - while (line != null) { - Matcher m = WORD_PATTERN.matcher(line.toLowerCase()); - - while (m.find()) { - String word = m.group(); - AtomicInteger count = nWords.get(word); - if (count == null) { - count = new AtomicInteger(0); - nWords.put(word, count); - } - count.incrementAndGet(); - } - - line = in.readLine(); - } - } - - /** - * Read words from the given URL and count their occurrences. - * - * @param aUrl - * the URL to load the words from. - * @param aEncoding - * the encoding. - * @throws IOException - * if the words cannot be read. - */ - public void train(URL aUrl, String aEncoding) - throws IOException - { - try (InputStream is = aUrl.openStream()) { - train(new InputStreamReader(is, aEncoding)); - } - } - - /** - * Get a list for all possible variants of the given word containing an insertion, deletion, - * replacement or transposition. - * - * @param word - * the word. - * @return the list of variants. - */ - protected List<String> edits(String word) - { - List<String> candidates = new ArrayList<String>(); - - for (int i = 0; i < word.length(); i++) { - // deletes - candidates.add(word.substring(0, i) + word.substring(i + 1)); - - for (char c = 'a'; c <= 'z'; c++) { - // replaces - candidates.add(word.substring(0, i) + c + word.substring(i + 1)); - // inserts - candidates.add(word.substring(0, i) + c + word.substring(i)); - } - } - - // inserts at the end - for (char c = 'a'; c <= 'z'; c++) { - candidates.add(word + c); - } - - // transposes - for (int i = 0; i < word.length() - 1; i++) { - candidates.add(word.substring(0, i) + word.substring(i + 1, i + 2) - + word.substring(i, i + 1) + word.substring(i + 2)); - } - - return candidates; - } - - /** - * Try to find a correction for the given word. The word may contain up to two edits. If no - * better alternative is found, the word is returned verbatim. For performance reasons - * corrections are cached. - * - * @param aWord - * the word to correct (has to be lower-case) - * @return the possible correction. - */ - public String correct(String aWord) - { - // Too short words and numbers cannot be corrected. - if ((aWord.length() < 2) || isNumber(aWord)) { - return aWord; - } - - // Remember case - boolean isUpper = isUpperCase(aWord.charAt(0)); - - // Correct if not cached - String word = aWord.toLowerCase(); - String correction = cachedCorrections.get(word); - if (correction == null) { - correction = getBestCandidate(word); - cachedCorrections.put(word, correction); - } - - // Restore case - char[] buffer = correction.toCharArray(); - if (isUpper) { - buffer[0] = toUpperCase(buffer[0]); - } - - return new String(buffer); - } - - protected boolean isNumber(String aWord) - { - try { - parseDouble(aWord); - return true; - } - catch (NumberFormatException nfe) { - return false; - } - } - - /** - * Try to find a correction for the given word. The word may contain up to two edits. If no - * better alternative is found, the word is returned verbatim. - * - * @param word - * the word to correct (has to be lower-case) - * @return the possible correction. - */ - protected String getBestCandidate(String word) - { - // If the word is in the dictionary, it is probably correct - if (nWords.containsKey(word)) { - return word; - } - - // Reset score - resetScore(); - - // Look up the potential correct words in the dictionary - List<String> candidates1 = edits(word); - for (String candidate : candidates1) { - consider(candidate); - } - - // Found possible correction for one mistake - if (bestScore != -1) { - return bestCandidate; - } - - // Repeat the process for a potential second mistake - for (String candidate1 : candidates1) { - List<String> candidates2 = edits(candidate1); - for (String candidate2 : candidates2) { - consider(candidate2); - } - } - - if (bestScore != -1) { - return bestCandidate; - } - else { - return word; - } - } - - /** - * Consider the given candidate. If it is better than a previously found candidate, then - * remember it, otherwise forget it. - * - * @param candidate - * the candidate to consider. - */ - protected void consider(String candidate) - { - AtomicInteger score = nWords.get(candidate); - if (score != null) { - if (score.get() > bestScore) { - bestScore = score.get(); - bestCandidate = candidate; - } - } - } -} diff --git a/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/NorvigSpellingCorrector.java b/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/NorvigSpellingCorrector.java deleted file mode 100644 index 33aac3a382..0000000000 --- a/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/NorvigSpellingCorrector.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.norvig; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; - -/** - * Creates {@link SofaChangeAnnotation}s containing corrections for previously identified spelling - * errors. - */ -@ResourceMetaData(name="Simple Spelling Corrector") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}) - -public class NorvigSpellingCorrector - extends JCasAnnotator_ImplBase -{ - public static final String PARAM_MODEL_FILE = ComponentParameters.PARAM_MODEL_LOCATION; - - private NorvigSpellingAlgorithm spellingCorrector; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - try { - String trainingFile = (String) context.getConfigParameterValue(PARAM_MODEL_FILE); - - spellingCorrector = new NorvigSpellingAlgorithm(); - spellingCorrector.train(getContext().getResourceURL(trainingFile), "UTF-8"); - } - catch (Exception e) { - throw new ResourceInitializationException(e); - } - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - for (Token t : select(jcas, Token.class)) { - String token = t.getCoveredText(); - - // If there is no spelling error in this token, then we do not - // have to correct it. - if (selectCovered(SpellingAnomaly.class, t).size() == 0) { - continue; // No mistake here - } - - String correction = spellingCorrector.correct(token); - - if (!correction.equals(token)) { - // Create change annotation - SofaChangeAnnotation change = new SofaChangeAnnotation(jcas, t.getBegin(), - t.getEnd()); - change.setValue(correction); - change.setReason("spelling error"); - change.setOperation("replace"); - change.addToIndexes(); - } - } - } -} diff --git a/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/package-info.java b/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/package-info.java deleted file mode 100644 index c6293515ee..0000000000 --- a/dkpro-core-norvig-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/norvig/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Spelling correction based on <a href="http://norvig.com/spell-correct.html">Norvig's algorithm</a>. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.norvig; diff --git a/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/NorvigSpellingAlgorithm.java b/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/NorvigSpellingAlgorithm.java new file mode 100644 index 0000000000..0bf065a911 --- /dev/null +++ b/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/NorvigSpellingAlgorithm.java @@ -0,0 +1,261 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.norvig; + +import static java.lang.Character.isUpperCase; +import static java.lang.Character.toUpperCase; +import static java.lang.Double.parseDouble; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Spelling corrector based on Norvig's algorithm. + * + * @see <a href="http://norvig.com/spell-correct.html">Norvig's algorithm</a> + */ +public class NorvigSpellingAlgorithm +{ + private final static Pattern WORD_PATTERN = Pattern.compile("\\w+"); + + private final Map<String, AtomicInteger> nWords = new HashMap<String, AtomicInteger>(); + + private Map<String, String> cachedCorrections = new HashMap<String, String>(); + private int bestScore = -1; + private String bestCandidate; + + protected void resetScore() + { + bestScore = -1; + bestCandidate = null; + } + + public void reset() + { + resetScore(); + cachedCorrections = new HashMap<String, String>(); + } + + /** + * Read words from the given reader and count their occurrences. + * + * @param aReader + * the reader. + * @throws IOException + * if the words cannot be read. + */ + public void train(Reader aReader) + throws IOException + { + BufferedReader in = new BufferedReader(aReader); + + String line = in.readLine(); + while (line != null) { + Matcher m = WORD_PATTERN.matcher(line.toLowerCase()); + + while (m.find()) { + String word = m.group(); + AtomicInteger count = nWords.get(word); + if (count == null) { + count = new AtomicInteger(0); + nWords.put(word, count); + } + count.incrementAndGet(); + } + + line = in.readLine(); + } + } + + /** + * Read words from the given URL and count their occurrences. + * + * @param aUrl + * the URL to load the words from. + * @param aEncoding + * the encoding. + * @throws IOException + * if the words cannot be read. + */ + public void train(URL aUrl, String aEncoding) + throws IOException + { + try (InputStream is = aUrl.openStream()) { + train(new InputStreamReader(is, aEncoding)); + } + } + + /** + * Get a list for all possible variants of the given word containing an insertion, deletion, + * replacement or transposition. + * + * @param word + * the word. + * @return the list of variants. + */ + protected List<String> edits(String word) + { + List<String> candidates = new ArrayList<String>(); + + for (int i = 0; i < word.length(); i++) { + // deletes + candidates.add(word.substring(0, i) + word.substring(i + 1)); + + for (char c = 'a'; c <= 'z'; c++) { + // replaces + candidates.add(word.substring(0, i) + c + word.substring(i + 1)); + // inserts + candidates.add(word.substring(0, i) + c + word.substring(i)); + } + } + + // inserts at the end + for (char c = 'a'; c <= 'z'; c++) { + candidates.add(word + c); + } + + // transposes + for (int i = 0; i < word.length() - 1; i++) { + candidates.add(word.substring(0, i) + word.substring(i + 1, i + 2) + + word.substring(i, i + 1) + word.substring(i + 2)); + } + + return candidates; + } + + /** + * Try to find a correction for the given word. The word may contain up to two edits. If no + * better alternative is found, the word is returned verbatim. For performance reasons + * corrections are cached. + * + * @param aWord + * the word to correct (has to be lower-case) + * @return the possible correction. + */ + public String correct(String aWord) + { + // Too short words and numbers cannot be corrected. + if ((aWord.length() < 2) || isNumber(aWord)) { + return aWord; + } + + // Remember case + boolean isUpper = isUpperCase(aWord.charAt(0)); + + // Correct if not cached + String word = aWord.toLowerCase(); + String correction = cachedCorrections.get(word); + if (correction == null) { + correction = getBestCandidate(word); + cachedCorrections.put(word, correction); + } + + // Restore case + char[] buffer = correction.toCharArray(); + if (isUpper) { + buffer[0] = toUpperCase(buffer[0]); + } + + return new String(buffer); + } + + protected boolean isNumber(String aWord) + { + try { + parseDouble(aWord); + return true; + } + catch (NumberFormatException nfe) { + return false; + } + } + + /** + * Try to find a correction for the given word. The word may contain up to two edits. If no + * better alternative is found, the word is returned verbatim. + * + * @param word + * the word to correct (has to be lower-case) + * @return the possible correction. + */ + protected String getBestCandidate(String word) + { + // If the word is in the dictionary, it is probably correct + if (nWords.containsKey(word)) { + return word; + } + + // Reset score + resetScore(); + + // Look up the potential correct words in the dictionary + List<String> candidates1 = edits(word); + for (String candidate : candidates1) { + consider(candidate); + } + + // Found possible correction for one mistake + if (bestScore != -1) { + return bestCandidate; + } + + // Repeat the process for a potential second mistake + for (String candidate1 : candidates1) { + List<String> candidates2 = edits(candidate1); + for (String candidate2 : candidates2) { + consider(candidate2); + } + } + + if (bestScore != -1) { + return bestCandidate; + } + else { + return word; + } + } + + /** + * Consider the given candidate. If it is better than a previously found candidate, then + * remember it, otherwise forget it. + * + * @param candidate + * the candidate to consider. + */ + protected void consider(String candidate) + { + AtomicInteger score = nWords.get(candidate); + if (score != null) { + if (score.get() > bestScore) { + bestScore = score.get(); + bestCandidate = candidate; + } + } + } +} diff --git a/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/NorvigSpellingCorrector.java b/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/NorvigSpellingCorrector.java new file mode 100644 index 0000000000..6fb79b2088 --- /dev/null +++ b/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/NorvigSpellingCorrector.java @@ -0,0 +1,106 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.norvig; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; + +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Identifies spelling errors using Norvig's algorithm. + */ +@Component(OperationType.SPELLING_CHECKER) +@ResourceMetaData(name = "Simple Spelling Corrector") +@Parameters( + exclude = { + NorvigSpellingCorrector.PARAM_MODEL_LOCATION }) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}) +public class NorvigSpellingCorrector + extends JCasAnnotator_ImplBase +{ + /** + * Location from which the model is read. This is either a local path or a classpath location. + * In the latter case, the model artifact (if any) is searched as well. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + private String modelLocation; + + private NorvigSpellingAlgorithm spellingCorrector; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + try { + spellingCorrector = new NorvigSpellingAlgorithm(); + spellingCorrector.train(getContext().getResourceURL(modelLocation), "UTF-8"); + } + catch (Exception e) { + throw new ResourceInitializationException(e); + } + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + for (Token t : select(jcas, Token.class)) { + String token = t.getCoveredText(); + + // If there is no spelling error in this token, then we do not + // have to correct it. + if (selectCovered(SpellingAnomaly.class, t).size() == 0) { + continue; // No mistake here + } + + String correction = spellingCorrector.correct(token); + + if (!correction.equals(token)) { + // Create change annotation + SofaChangeAnnotation change = new SofaChangeAnnotation(jcas, t.getBegin(), + t.getEnd()); + change.setValue(correction); + change.setReason("spelling error"); + change.setOperation("replace"); + change.addToIndexes(); + } + } + } +} diff --git a/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/package-info.java b/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/package-info.java new file mode 100644 index 0000000000..9256e77e40 --- /dev/null +++ b/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Spelling correction based on <a href="http://norvig.com/spell-correct.html">Norvig's algorithm</a>. + * + * @since 1.1.0 + */ +package org.dkpro.core.norvig; diff --git a/dkpro-core-opennlp-asl/pom.xml b/dkpro-core-opennlp-asl/pom.xml index a101512939..8f414ef667 100644 --- a/dkpro-core-opennlp-asl/pom.xml +++ b/dkpro-core-opennlp-asl/pom.xml @@ -15,20 +15,20 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> + <artifactId>dkpro-core-opennlp-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - OpenNLP (v ${opennlp.version}) (ASL)</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> - <opennlp.version>1.8.4</opennlp.version> + <opennlp.version>1.9.3</opennlp.version> </properties> <dependencies> <dependency> @@ -52,36 +52,48 @@ <artifactId>commons-io</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-featurepath-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.io-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-io-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.ner-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-ner-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -89,23 +101,23 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.eval-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-eval-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.conll-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-conll-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.datasets-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-datasets-asl</artifactId> <scope>test</scope> </dependency> <dependency> diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunker.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunker.java deleted file mode 100644 index a51c4212a4..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunker.java +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.InputStream; -import java.util.List; - -import opennlp.tools.chunker.Chunker; -import opennlp.tools.chunker.ChunkerME; -import opennlp.tools.chunker.ChunkerModel; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.io.IobDecoder; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.Tagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.OpenNlpChunkerTagsetDescriptionProvider; - -/** - * Chunk annotator using OpenNLP. - */ -@ResourceMetaData(name="OpenNLP Chunker") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk" }) -public class OpenNlpChunker - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_CHUNK) - protected String modelLocation; - - /** - * Load the chunk tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_CHUNK_MAPPING_LOCATION = ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false) - protected String chunkMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spamming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - private CasConfigurableProviderBase<Chunker> modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<Chunker>(this, "opennlp", "chunker") { - @Override - protected Chunker produceResource(InputStream aStream) - throws Exception - { - ChunkerModel model = new ChunkerModel(aStream); - - Tagset tsdp = new OpenNlpChunkerTagsetDescriptionProvider(getResourceMetaData() - .getProperty("chunk.tagset"), Chunk.class, model.getChunkerModel()); - addTagset(tsdp); - - if (printTagSet) { - getContext().getLogger().log(INFO, tsdp.toString()); - } - - return new ChunkerME(model); - } - }; - - mappingProvider = MappingProviderFactory.createChunkMappingProvider(chunkMappingLocation, - language, modelProvider); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - mappingProvider.configure(cas); - - Type chunkType = cas.getTypeSystem().getType(Chunk.class.getName()); - Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue"); - - IobDecoder decoder = new IobDecoder(cas, chunkValue, mappingProvider); - decoder.setInternTags(internTags); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List<Token> tokens = selectCovered(aJCas, Token.class, sentence); - String[] tokenTexts = new String[tokens.size()]; - String[] tokenTags = new String[tokens.size()]; - int i = 0; - for (Token t : tokens) { - tokenTexts[i] = t.getText(); - if (t.getPos() == null || t.getPos().getPosValue() == null) { - throw new IllegalStateException("Every token must have a POS tag."); - } - tokenTags[i] = t.getPos().getPosValue(); - i++; - } - - String[] chunkTags = modelProvider.getResource().chunk(tokenTexts, tokenTags); - decoder.decode(tokens, chunkTags); - } - } -} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizer.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizer.java deleted file mode 100644 index 1d380be6ab..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizer.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; - -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.select; -import java.io.InputStream; -import java.util.Collection; -import java.util.Map; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import opennlp.tools.lemmatizer.LemmatizerME; -import opennlp.tools.lemmatizer.LemmatizerModel; - -/** - * Lemmatizer using OpenNLP. - */ -@ResourceMetaData(name="OpenNLP Lemmatizer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) -public class OpenNlpLemmatizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_LEMMA) - protected String modelLocation; - - /** - * The character encoding used by the model. - */ - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) - private String modelEncoding; - - private CasConfigurableProviderBase<LemmatizerME> modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<LemmatizerME>(this, "lemma") - { - @Override - protected LemmatizerME produceResource(InputStream aStream) - throws Exception - { - // Load the lemmatizer model from the location the model provider offers - LemmatizerModel model = new LemmatizerModel(aStream); - - // Create a new POS tagger instance from the loaded model - return new LemmatizerME(model); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - // Document-specific configuration of model and mapping provider in process() - modelProvider.configure(cas); - - Map<Sentence, Collection<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); - for (Sentence sentence : select(aJCas, Sentence.class)) { - Collection<Token> tokens = index.get(sentence); - - String[] toks = new String[tokens.size()]; - String[] tags = new String[tokens.size()]; - - int i = 0; - for (Token t : tokens) { - toks[i] = t.getText(); - tags[i] = t.getPosValue(); - i++; - } - - // Fetch the OpenNLP lemmatizer instance configured with the right model and use it to - // tag the text - LemmatizerME lemmatizer = modelProvider.getResource(); - String[] lemmas = lemmatizer.lemmatize(toks, tags); - - int n = 0; - for (Token t : tokens) { - Lemma lemmaAnno = new Lemma(aJCas, t.getBegin(), t.getEnd()); - lemmaAnno.setValue(lemmas[n]); - lemmaAnno.addToIndexes(); - - // Connect the Lemma annotation to the respective token annotation - t.setLemma(lemmaAnno); - n++; - } - } - } -} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizer.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizer.java deleted file mode 100644 index e544a30ab3..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizer.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; - -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.toText; -import static org.apache.uima.util.Level.INFO; - -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Map; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableStreamProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.OpenNlpSequenceTagsetDescriptionProvider; -import opennlp.tools.namefind.NameFinderME; -import opennlp.tools.namefind.TokenNameFinder; -import opennlp.tools.namefind.TokenNameFinderModel; -import opennlp.tools.util.Span; - -/** - * OpenNLP name finder wrapper. - */ -@ResourceMetaData(name="OpenNLP Named Entity Recognizer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) -public class OpenNlpNamedEntityRecognizer - extends JCasAnnotator_ImplBase -{ - /** - * Log the tag set(s) when a model is loaded. - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Variant of a model the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = true, defaultValue="person") - protected String variant; - - /** - * Location from which the model is read. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_NER) - protected String modelLocation; - - /** - * Location of the mapping file for named entity tags to UIMA types. - */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) - protected String mappingLocation; - - private CasConfigurableProviderBase<TokenNameFinder> modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new CasConfigurableStreamProviderBase<TokenNameFinder>() - { - { - setContextObject(OpenNlpNamedEntityRecognizer.this); - - setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); - setDefault(ARTIFACT_ID, - "de.tudarmstadt.ukp.dkpro.core.opennlp-model-ner-${language}-${variant}"); - - setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/opennlp/lib/ner-default-variants.map"); - setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" - + "ner-${language}-${variant}.bin"); - - setOverride(LOCATION, modelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected TokenNameFinder produceResource(InputStream aStream) - throws Exception - { - TokenNameFinderModel model = new TokenNameFinderModel(aStream); - - if (printTagSet) { - OpenNlpSequenceTagsetDescriptionProvider tsdp = new OpenNlpSequenceTagsetDescriptionProvider( - null, NamedEntity.class, model.getNameFinderSequenceModel()); - tsdp.setTagSplitPattern("-(?=[^-]*$)"); - // FIXME addTagset(tsdp) - getContext().getLogger().log(INFO, tsdp.toString()); - } - - return new NameFinderME(model); - } - }; - - mappingProvider = new MappingProvider(); - mappingProvider - .setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/opennlp/lib/ner-default-variants.map"); - mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/opennlp/lib/ner-${language}-${variant}.map"); - mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName()); - mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation); - mappingProvider.setOverride(MappingProvider.LANGUAGE, language); - mappingProvider.setOverride(MappingProvider.VARIANT, variant); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - modelProvider.configure(cas); - mappingProvider.configure(cas); - - modelProvider.getResource().clearAdaptiveData(); - - Map<Sentence, Collection<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); - for (Sentence sentence : select(aJCas, Sentence.class)) { - // get the document text - List<Token> tokenList = new ArrayList<>(index.get(sentence)); - String[] tokens = toText(tokenList).toArray(new String[tokenList.size()]); - - // test the string - Span[] namedEntities = modelProvider.getResource().find(tokens); - - // get the named entities and their character offsets - for (Span namedEntity : namedEntities) { - int begin = tokenList.get(namedEntity.getStart()).getBegin(); - int end = tokenList.get(namedEntity.getEnd()-1).getEnd(); - - Type type = mappingProvider.getTagType(namedEntity.getType()); - NamedEntity neAnno = (NamedEntity) cas.createAnnotation(type, begin, end); - neAnno.setValue(namedEntity.getType()); - neAnno.addToIndexes(); - } - } - } -} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpParser.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpParser.java deleted file mode 100644 index 3a519c052f..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpParser.java +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -import opennlp.tools.parser.AbstractBottomUpParser; -import opennlp.tools.parser.Parse; -import opennlp.tools.parser.Parser; -import opennlp.tools.parser.ParserFactory; -import opennlp.tools.parser.ParserModel; -import opennlp.tools.util.Span; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.FSCollectionFactory; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.FSArray; -import org.apache.uima.jcas.tcas.Annotation; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.OpenNlpParserTagsetDescriptionProvider; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.OpenNlpTagsetDescriptionProvider; - -/** - * OpenNLP parser. The parser ignores existing POS tags and internally creates new ones. However, - * these tags are only added as annotation if explicitly requested via {@link #PARAM_WRITE_POS}. - */ -@ResourceMetaData(name="OpenNLP Parser") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree"}) -public class OpenNlpParser - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_PARSER) - protected String modelLocation; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Location of the mapping file for constituent tags to UIMA types. - */ - public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) - protected String constituentMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * <p>Default: {@code true}</p> - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Log the tag set(s) when a model is loaded. - * - * <p>Default: {@code false}</p> - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - /** - * Sets whether to create or not to create POS tags. The creation of - * constituent tags must be turned on for this to work. - * - * <p>Default: {@code true}</p> - */ - public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; - @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "false") - private boolean createPosTags; - - /** - * If this parameter is set to true, each sentence is annotated with a PennTree-Annotation, - * containing the whole parse tree in Penn Treebank style format. - * - * <p>Default: {@code false}</p> - */ - public static final String PARAM_WRITE_PENN_TREE = ComponentParameters.PARAM_WRITE_PENN_TREE; - @ConfigurationParameter(name = PARAM_WRITE_PENN_TREE, mandatory = true, defaultValue = "false") - private boolean createPennTreeString; - - private CasConfigurableProviderBase<Parser> modelProvider; - private MappingProvider posMappingProvider; - private MappingProvider constituentMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new OpenNlpParserModelProvider(); - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - - constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider( - constituentMappingLocation, language, modelProvider); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - posMappingProvider.configure(cas); - constituentMappingProvider.configure(cas); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List<Token> tokens = selectCovered(aJCas, Token.class, sentence); - - Parse parseInput = new Parse(cas.getDocumentText(), - new Span(sentence.getBegin(), sentence.getEnd()), - AbstractBottomUpParser.INC_NODE, 0, 0); - int i=0; - for (Token t : tokens) { - parseInput.insert(new Parse(cas.getDocumentText(), new Span(t.getBegin(), t.getEnd()), - AbstractBottomUpParser.TOK_NODE, 0, i)); - i++; - } - - Parse parseOutput = modelProvider.getResource().parse(parseInput); - - createConstituentAnnotationFromTree(aJCas, parseOutput, null, tokens); - - if (createPennTreeString) { - StringBuffer sb = new StringBuffer(); - parseOutput.setType("ROOT"); // in DKPro the root is ROOT, not TOP - parseOutput.show(sb); - - PennTree pTree = new PennTree(aJCas, sentence.getBegin(), sentence.getEnd()); - pTree.setPennTree(sb.toString()); - pTree.addToIndexes(); - } - } - } - - /** - * Creates linked constituent annotations + POS annotations - * - * @param aNode - * the source tree - * @return the child-structure (needed for recursive call only) - */ - private Annotation createConstituentAnnotationFromTree(JCas aJCas, Parse aNode, - Annotation aParentFS, List<Token> aTokens) - { - // If the node is a word-level constituent node (== POS): - // create parent link on token and (if not turned off) create POS tag - if (aNode.isPosTag()) { - Token token = getToken(aTokens, aNode.getSpan().getStart(), aNode.getSpan().getEnd()); - - // link token to its parent constituent - if (aParentFS != null) { - token.setParent(aParentFS); - } - - // only add POS to index if we want POS-tagging - if (createPosTags) { - Type posTag = posMappingProvider.getTagType(aNode.getType()); - POS posAnno = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); - posAnno.setPosValue(internTags ? aNode.getType().intern() : aNode.getType()); - POSUtils.assignCoarseValue(posAnno); - posAnno.addToIndexes(); - token.setPos(posAnno); - } - - return token; - } - // Check if node is a constituent node on sentence or phrase-level - else { - String typeName = aNode.getType(); - if (AbstractBottomUpParser.TOP_NODE.equals(typeName)) { - typeName = "ROOT"; // in DKPro the root is ROOT, not TOP - } - - // create the necessary objects and methods - Type constType = constituentMappingProvider.getTagType(typeName); - - Constituent constAnno = (Constituent) aJCas.getCas().createAnnotation(constType, - aNode.getSpan().getStart(), aNode.getSpan().getEnd()); - constAnno.setConstituentType(typeName); - - // link to parent - if (aParentFS != null) { - constAnno.setParent(aParentFS); - } - - // Do we have any children? - List<Annotation> childAnnotations = new ArrayList<Annotation>(); - for (Parse child : aNode.getChildren()) { - Annotation childAnnotation = createConstituentAnnotationFromTree(aJCas, child, - constAnno, aTokens); - if (childAnnotation != null) { - childAnnotations.add(childAnnotation); - } - } - - // Now that we know how many children we have, link annotation of - // current node with its children - FSArray childArray = FSCollectionFactory.createFSArray(aJCas, childAnnotations); - constAnno.setChildren(childArray); - - // write annotation for current node to index - aJCas.addFsToIndexes(constAnno); - - return constAnno; - } - } - - /** - * Given a list of tokens (e.g. those from a sentence) return the one at the specified position. - */ - private Token getToken(List<Token> aTokens, int aBegin, int aEnd) - { - for (Token t : aTokens) { - if (aBegin == t.getBegin() && aEnd == t.getEnd()) { - return t; - } - } - throw new IllegalStateException("Token not found"); - } - - private class OpenNlpParserModelProvider - extends ModelProviderBase<Parser> - { - { - setContextObject(OpenNlpParser.this); - - setDefault(ARTIFACT_ID, "${groupId}.opennlp-model-parser-${language}-${variant}"); - setDefault(LOCATION, "classpath:/${package}/lib/parser-${language}-${variant}.bin"); - setDefault(VARIANT, "chunking"); - - setOverride(LOCATION, modelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected Parser produceResource(InputStream aStream) - throws Exception - { - ParserModel model = new ParserModel(aStream); - Properties metadata = getResourceMetaData(); - - addTagset(new OpenNlpTagsetDescriptionProvider( - metadata.getProperty("pos.tagset"), POS.class, model.getParserTaggerModel() - .getPosModel())); - addTagset(new OpenNlpParserTagsetDescriptionProvider( - metadata.getProperty("constituent.tagset"), Constituent.class, model, metadata)); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return ParserFactory.create(model); - } - } -} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTagger.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTagger.java deleted file mode 100644 index 0f56fa1bfc..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTagger.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; - -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.toText; -import static org.apache.uima.util.Level.INFO; - -import java.io.InputStream; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.Collection; -import java.util.Map; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.OpenNlpTagsetDescriptionProvider; -import opennlp.tools.postag.POSModel; -import opennlp.tools.postag.POSTaggerME; - -/** - * Part-of-Speech annotator using OpenNLP. - */ -@ResourceMetaData(name="OpenNLP POS-Tagger") -//NOTE: This file contains Asciidoc markers for partial inclusion of this file in the documentation -//Do not remove these tags! -// tag::capabilities[] -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) -public class OpenNlpPosTagger - extends JCasAnnotator_ImplBase -{ -// end::capabilities[] - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_TAGGER) - protected String modelLocation; - - /** - * The character encoding used by the model. - */ - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) - private String modelEncoding; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - private CasConfigurableProviderBase<POSTaggerME> modelProvider; - private MappingProvider mappingProvider; - private Charset encoding; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - encoding = modelEncoding != null ? Charset.forName(modelEncoding) : null; - -// tag::model-provider-decl[] - // Use ModelProviderBase convenience constructor to set up a model provider that - // auto-detects most of its settings and is configured to use default variants. - // Auto-detection inspects the configuration parameter fields (@ConfigurationParameter) - // of the analysis engine class and looks for default parameters such as PARAM_LANGUAGE, - // PARAM_VARIANT, and PARAM_MODEL_LOCATION. - modelProvider = new ModelProviderBase<POSTaggerME>(this, "tagger") - { - @Override - protected POSTaggerME produceResource(InputStream aStream) - throws Exception - { - // Load the POS tagger model from the location the model provider offers - POSModel model = new POSModel(aStream); -// end::model-provider-decl[] - - // Extract tagset information from the model - OpenNlpTagsetDescriptionProvider tsdp = new OpenNlpTagsetDescriptionProvider( - getResourceMetaData().getProperty("pos.tagset"), POS.class, - model.getPosModel()); - if (getResourceMetaData().containsKey("pos.tagset.tagSplitPattern")) { - tsdp.setTagSplitPattern(getResourceMetaData().getProperty( - "pos.tagset.tagSplitPattern")); - } - addTagset(tsdp); - - if (printTagSet) { - getContext().getLogger().log(INFO, tsdp.toString()); - } - -// tag::model-provider-decl[] - // Create a new POS tagger instance from the loaded model - return new POSTaggerME(model); - } - }; -// end::model-provider-decl[] - -// tag::mapping-provider-decl[] - // General setup of the mapping provider in initialize() - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); -// end::mapping-provider-decl[] - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { -// tag::model-provider-use-1[] - CAS cas = aJCas.getCas(); - - // Document-specific configuration of model and mapping provider in process() - modelProvider.configure(cas); -// end::model-provider-use-1[] - -// tag::mapping-provider-use-1[] - // Mind the mapping provider must be configured after the model provider as it uses the - // model metadata - mappingProvider.configure(cas); -// end::mapping-provider-use-1[] - - // When packaging a model, it is possible to store additional metadata. Here we fetch such a - // model metadata property that we use to determine if the tag produced by the tagger needs - // to be post-processed. This property is specific to the DKPro Core OpenNLP models - String tagSplitPattern = modelProvider.getResourceMetaData().getProperty( - "pos.tagset.tagSplitPattern"); - - Map<Sentence, Collection<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); - for (Sentence sentence : select(aJCas, Sentence.class)) { -// tag::model-provider-use-2[] - Collection<Token> tokens = index.get(sentence); - String[] tokenTexts = toText(tokens).toArray(new String[tokens.size()]); - fixEncoding(tokenTexts); - - // Fetch the OpenNLP pos tagger instance configured with the right model and use it to - // tag the text - String[] tags = modelProvider.getResource().tag(tokenTexts); -// end::model-provider-use-2[] - - int i = 0; - for (Token t : tokens) { - String tag = tags[i]; - - // Post-process the tag if necessary - if (tagSplitPattern != null) { - tag = tag.split(tagSplitPattern)[0]; - } - -// tag::mapping-provider-use-2[] - // Convert the tag produced by the tagger to an UIMA type, create an annotation - // of this type, and add it to the document. - Type posTag = mappingProvider.getTagType(tag); - POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); - // To save memory, we typically intern() tag strings - posAnno.setPosValue(internTags ? tag.intern() : tag); - POSUtils.assignCoarseValue(posAnno); - posAnno.addToIndexes(); -// end::mapping-provider-use-2[] - - // Connect the POS annotation to the respective token annotation - t.setPos(posAnno); - i++; - } - } - } - - private void fixEncoding(String[] aTokenTexts) - throws AnalysisEngineProcessException - { - // "Fix" encoding before passing to a model which was trained with encoding problems - if (encoding != null && !"UTF-8".equals(encoding.name())) { - for (int i = 0; i < aTokenTexts.length; i++) { - aTokenTexts[i] = new String(aTokenTexts[i].getBytes(StandardCharsets.UTF_8), - encoding); - } - } - } -} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTrainer.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTrainer.java deleted file mode 100644 index a4f8b5a29c..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTrainer.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; - -import java.util.concurrent.Callable; - -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.CasPosSampleStream; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.OpenNlpTrainerBase; -import opennlp.tools.ml.BeamSearch; -import opennlp.tools.ml.EventTrainer; -import opennlp.tools.ml.maxent.GISTrainer; -import opennlp.tools.postag.POSModel; -import opennlp.tools.postag.POSTaggerFactory; -import opennlp.tools.postag.POSTaggerME; -import opennlp.tools.util.TrainingParameters; -import opennlp.tools.util.model.BaseModel; - -/** - * Train a POS tagging model for OpenNLP. - */ -@MimeTypeCapability(MimeTypes.APPLICATION_X_OPENNLP_TAGGER) -@ResourceMetaData(name="OpenNLP POS-Tagger Trainer") -public class OpenNlpPosTaggerTrainer - extends OpenNlpTrainerBase<CasPosSampleStream> -{ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) - private String language; - - public static final String PARAM_ALGORITHM = "algorithm"; - @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, defaultValue = GISTrainer.MAXENT_VALUE) - private String algorithm; - - public static final String PARAM_TRAINER_TYPE = "trainerType"; - @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, defaultValue = EventTrainer.EVENT_VALUE) - private String trainerType; - - public static final String PARAM_ITERATIONS = "iterations"; - @ConfigurationParameter(name = PARAM_ITERATIONS, mandatory = true, defaultValue = "100") - private int iterations; - - public static final String PARAM_CUTOFF = "cutoff"; - @ConfigurationParameter(name = PARAM_CUTOFF, mandatory = true, defaultValue = "5") - private int cutoff; - - /** - * @see POSTaggerME#DEFAULT_BEAM_SIZE - */ - public static final String PARAM_BEAMSIZE = "beamSize"; - @ConfigurationParameter(name = PARAM_BEAMSIZE, mandatory = true, defaultValue = "3") - private int beamSize; - - public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; - @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") - private int numThreads; - - @Override - public CasPosSampleStream makeSampleStream() - { - return new CasPosSampleStream(); - } - - @Override - public Callable<? extends BaseModel> makeTrainer() - { - TrainingParameters params = new TrainingParameters(); - params.put(TrainingParameters.ALGORITHM_PARAM, algorithm); - params.put(TrainingParameters.TRAINER_TYPE_PARAM, trainerType); - params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations)); - params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff)); - params.put(TrainingParameters.THREADS_PARAM, Integer.toString(numThreads)); - params.put(BeamSearch.BEAM_SIZE_PARAMETER, Integer.toString(beamSize)); - - Callable<POSModel> trainTask = () -> { - try { - return POSTaggerME.train(language, getStream(), params, new POSTaggerFactory()); - } - catch (Throwable e) { - getStream().close(); - throw e; - } - }; - - return trainTask; - } -} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSegmenter.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSegmenter.java deleted file mode 100644 index 62be4a1668..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSegmenter.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; - -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.InputStream; - -import opennlp.tools.sentdetect.SentenceDetectorME; -import opennlp.tools.sentdetect.SentenceModel; -import opennlp.tools.tokenize.TokenizerME; -import opennlp.tools.tokenize.TokenizerModel; -import opennlp.tools.util.Span; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableStreamProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; - -/** - * Tokenizer and sentence splitter using OpenNLP. - */ -@ResourceMetaData(name="OpenNLP Segmenter") -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class OpenNlpSegmenter - extends SegmenterBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Load the segmentation model from this location instead of locating the model automatically. - */ - public static final String PARAM_SEGMENTATION_MODEL_LOCATION = ComponentParameters.PARAM_SEGMENTATION_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_SEGMENTATION_MODEL_LOCATION, mandatory = false) - @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_SENT) - protected String segmentationModelLocation; - - /** - * Load the tokenization model from this location instead of locating the model automatically. - */ - public static final String PARAM_TOKENIZATION_MODEL_LOCATION = ComponentParameters.PARAM_TOKENIZATION_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_TOKENIZATION_MODEL_LOCATION, mandatory = false) - @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_TOKEN) - protected String tokenizationModelLocation; - - private CasConfigurableProviderBase<SentenceDetectorME> sentenceModelProvider; - private CasConfigurableProviderBase<TokenizerME> tokenModelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - sentenceModelProvider = new CasConfigurableStreamProviderBase<SentenceDetectorME>() { - { - setContextObject(OpenNlpSegmenter.this); - - setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); - setDefault(ARTIFACT_ID, - "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-${language}-${variant}"); - - setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" + - "sentence-${language}-${variant}.properties"); - setDefault(VARIANT, "maxent"); - - setOverride(LOCATION, segmentationModelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected SentenceDetectorME produceResource(InputStream aStream) - throws Exception - { - SentenceModel model = new SentenceModel(aStream); - return new SentenceDetectorME(model); - } - }; - - tokenModelProvider = new CasConfigurableStreamProviderBase<TokenizerME>() { - { - setContextObject(OpenNlpSegmenter.this); - - setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); - setDefault(ARTIFACT_ID, - "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-${language}-${variant}"); - - setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" + - "token-${language}-${variant}.properties"); - setDefault(VARIANT, "maxent"); - - setOverride(LOCATION, tokenizationModelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - } - - @Override - protected TokenizerME produceResource(InputStream aStream) - throws Exception - { - TokenizerModel model = new TokenizerModel(aStream); - return new TokenizerME(model); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - if (isWriteSentence()) { - sentenceModelProvider.configure(cas); - } - - if (isWriteToken()) { - tokenModelProvider.configure(cas); - } - - super.process(aJCas); - } - - @Override - protected void process(JCas aJCas, String aText, int aZoneBegin) - throws AnalysisEngineProcessException - { - if (isWriteSentence()) { - Span[] sentences = sentenceModelProvider.getResource().sentPosDetect(aText); - for (Span sSpan : sentences) { - createSentence(aJCas, sSpan.getStart() + aZoneBegin, sSpan.getEnd() + aZoneBegin); - } - } - - if (isWriteToken()) { - for (Sentence sent : selectCovered(aJCas, Sentence.class, aZoneBegin, aZoneBegin + aText.length())) { - Span[] tokens = tokenModelProvider.getResource().tokenizePos(sent.getCoveredText()); - for (Span tSpan : tokens) { - createToken(aJCas, tSpan.getStart() + sent.getBegin(), - tSpan.getEnd() + sent.getBegin()); - } - } - } - } -} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/package-info.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/package-info.java deleted file mode 100644 index f51f13f0ac..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * INTERNAL - * - * @since 1.5.0 - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; \ No newline at end of file diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/package-info.java b/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/package-info.java deleted file mode 100644 index 14170e135b..0000000000 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Integration of the <a href="http://opennlp.apache.org/">Apache OpenNLP</a> tools. - * - * @since 1.4.0 - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpChunker.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpChunker.java new file mode 100644 index 0000000000..1531a7a92a --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpChunker.java @@ -0,0 +1,205 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; + +import java.io.InputStream; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.IobDecoder; +import org.dkpro.core.api.metadata.Tagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.opennlp.internal.OpenNlpChunkerTagsetDescriptionProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import opennlp.tools.chunker.Chunker; +import opennlp.tools.chunker.ChunkerME; +import opennlp.tools.chunker.ChunkerModel; + +/** + * Chunk annotator using OpenNLP. + */ +@Component(OperationType.CHUNKER) +@ResourceMetaData(name = "OpenNLP Chunker") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk" }) +public class OpenNlpChunker + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_CHUNK) + protected String modelLocation; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the chunk tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_CHUNK_MAPPING_LOCATION = + ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false) + protected String chunkMappingLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + private CasConfigurableProviderBase<Chunker> modelProvider; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<Chunker>(this, "opennlp", "chunker") { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/chunker-${language}-${variant}.properties"); + } + + @Override + protected Chunker produceResource(InputStream aStream) + throws Exception + { + ChunkerModel model = new ChunkerModel(aStream); + + Tagset tsdp = new OpenNlpChunkerTagsetDescriptionProvider(getResourceMetaData() + .getProperty("chunk.tagset"), Chunk.class, model.getChunkerModel()); + addTagset(tsdp); + + if (printTagSet) { + getContext().getLogger().log(INFO, tsdp.toString()); + } + + return new ChunkerME(model); + } + }; + + mappingProvider = MappingProviderFactory.createChunkMappingProvider(this, + chunkMappingLocation, language, modelProvider); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + modelProvider.configure(cas); + mappingProvider.configure(cas); + + Type chunkType = cas.getTypeSystem().getType(Chunk.class.getName()); + Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue"); + + IobDecoder decoder = new IobDecoder(cas, chunkValue, mappingProvider); + + for (Sentence sentence : select(aJCas, Sentence.class)) { + List<Token> tokens = selectCovered(aJCas, Token.class, sentence); + String[] tokenTexts = new String[tokens.size()]; + String[] tokenTags = new String[tokens.size()]; + int i = 0; + for (Token t : tokens) { + tokenTexts[i] = t.getText(); + if (t.getPos() == null || t.getPos().getPosValue() == null) { + throw new IllegalStateException("Every token must have a POS tag."); + } + tokenTags[i] = t.getPos().getPosValue(); + i++; + } + + String[] chunkTags = modelProvider.getResource().chunk(tokenTexts, tokenTags); + decoder.decode(tokens, chunkTags); + } + } +} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTrainer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpChunkerTrainer.java similarity index 77% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTrainer.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpChunkerTrainer.java index bd8a9bd138..800dc0733c 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTrainer.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpChunkerTrainer.java @@ -15,18 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import java.util.concurrent.Callable; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.opennlp.internal.CasChunkSampleStream; +import org.dkpro.core.opennlp.internal.OpenNlpTrainerBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.CasChunkSampleStream; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.OpenNlpTrainerBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; import opennlp.tools.chunker.ChunkerFactory; import opennlp.tools.chunker.ChunkerME; import opennlp.tools.chunker.ChunkerModel; @@ -39,38 +42,62 @@ /** * Train a chunker model for OpenNLP. */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) @MimeTypeCapability(MimeTypes.APPLICATION_X_OPENNLP_CHUNK) -@ResourceMetaData(name="OpenNLP Chunker Trainer") +@ResourceMetaData(name = "OpenNLP Chunker Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class OpenNlpChunkerTrainer extends OpenNlpTrainerBase<CasChunkSampleStream> { + /** + * Store this language to the model instead of the document language. + */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) private String language; + /** + * Training algorithm. + */ public static final String PARAM_ALGORITHM = "algorithm"; - @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, defaultValue = GISTrainer.MAXENT_VALUE) + @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, + defaultValue = GISTrainer.MAXENT_VALUE) private String algorithm; + /** + * Trainer type. + */ public static final String PARAM_TRAINER_TYPE = "trainerType"; - @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, defaultValue = EventTrainer.EVENT_VALUE) + @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, + defaultValue = EventTrainer.EVENT_VALUE) private String trainerType; + /** + * Number of training iterations. + */ public static final String PARAM_ITERATIONS = "iterations"; @ConfigurationParameter(name = PARAM_ITERATIONS, mandatory = true, defaultValue = "100") private int iterations; + /** + * Frequency cut-off. + */ public static final String PARAM_CUTOFF = "cutoff"; @ConfigurationParameter(name = PARAM_CUTOFF, mandatory = true, defaultValue = "5") private int cutoff; /** + * Beam size. + * * @see ChunkerME#DEFAULT_BEAM_SIZE */ public static final String PARAM_BEAMSIZE = "beamSize"; @ConfigurationParameter(name = PARAM_BEAMSIZE, mandatory = true, defaultValue = "3") private int beamSize; + /** + * Number of parallel threads. + */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") private int numThreads; diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpLemmatizer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpLemmatizer.java new file mode 100644 index 0000000000..9f0da99bb4 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpLemmatizer.java @@ -0,0 +1,180 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.io.InputStream; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import opennlp.tools.lemmatizer.LemmatizerME; +import opennlp.tools.lemmatizer.LemmatizerModel; + +/** + * Lemmatizer using OpenNLP. + */ +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "OpenNLP Lemmatizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) +public class OpenNlpLemmatizer + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_LEMMA) + protected String modelLocation; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) + private String modelEncoding; + + private CasConfigurableProviderBase<LemmatizerME> modelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<LemmatizerME>(this, "lemma") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/lemma-${language}-${variant}.properties"); + } + + @Override + protected LemmatizerME produceResource(InputStream aStream) + throws Exception + { + // Load the lemmatizer model from the location the model provider offers + LemmatizerModel model = new LemmatizerModel(aStream); + + // Create a new POS tagger instance from the loaded model + return new LemmatizerME(model); + } + }; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + // Document-specific configuration of model and mapping provider in process() + modelProvider.configure(cas); + + Map<Sentence, List<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); + for (Sentence sentence : select(aJCas, Sentence.class)) { + Collection<Token> tokens = index.get(sentence); + + String[] toks = new String[tokens.size()]; + String[] tags = new String[tokens.size()]; + + int i = 0; + for (Token t : tokens) { + toks[i] = t.getText(); + tags[i] = t.getPosValue(); + i++; + } + + // Fetch the OpenNLP lemmatizer instance configured with the right model and use it to + // tag the text + LemmatizerME lemmatizer = modelProvider.getResource(); + String[] lemmas = lemmatizer.lemmatize(toks, tags); + + int n = 0; + for (Token t : tokens) { + Lemma lemmaAnno = new Lemma(aJCas, t.getBegin(), t.getEnd()); + lemmaAnno.setValue(lemmas[n]); + lemmaAnno.addToIndexes(); + + // Connect the Lemma annotation to the respective token annotation + t.setLemma(lemmaAnno); + n++; + } + } + } +} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizerTrainer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpLemmatizerTrainer.java similarity index 77% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizerTrainer.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpLemmatizerTrainer.java index 7e88861802..1058c8e857 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizerTrainer.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpLemmatizerTrainer.java @@ -15,18 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import java.util.concurrent.Callable; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.opennlp.internal.CasLemmaSampleStream; +import org.dkpro.core.opennlp.internal.OpenNlpTrainerBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.CasLemmaSampleStream; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.OpenNlpTrainerBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; import opennlp.tools.lemmatizer.LemmatizerFactory; import opennlp.tools.lemmatizer.LemmatizerME; import opennlp.tools.lemmatizer.LemmatizerModel; @@ -38,36 +41,60 @@ /** * Train a lemmatizer model for OpenNLP. */ +@Component(OperationType.LEMMATIZER) @MimeTypeCapability(MimeTypes.APPLICATION_X_OPENNLP_LEMMA) -@ResourceMetaData(name="OpenNLP Lemmatizer Trainer") +@ResourceMetaData(name = "OpenNLP Lemmatizer Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class OpenNlpLemmatizerTrainer extends OpenNlpTrainerBase<CasLemmaSampleStream> { + /** + * Store this language to the model instead of the document language. + */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) private String language; + /** + * Training algorithm. + */ public static final String PARAM_ALGORITHM = "algorithm"; - @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, defaultValue = GISTrainer.MAXENT_VALUE) + @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, + defaultValue = GISTrainer.MAXENT_VALUE) private String algorithm; + /** + * Trainer type. + */ public static final String PARAM_TRAINER_TYPE = "trainerType"; - @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, defaultValue = EventTrainer.EVENT_VALUE) + @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, + defaultValue = EventTrainer.EVENT_VALUE) private String trainerType; + /** + * Number of training iterations. + */ public static final String PARAM_ITERATIONS = "iterations"; @ConfigurationParameter(name = PARAM_ITERATIONS, mandatory = true, defaultValue = "100") private int iterations; + /** + * Frequency cut-off. + */ public static final String PARAM_CUTOFF = "cutoff"; @ConfigurationParameter(name = PARAM_CUTOFF, mandatory = true, defaultValue = "5") private int cutoff; + /** + * Number of parallel threads. + */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") private int numThreads; /** + * Beam size. + * * @see LemmatizerME#DEFAULT_BEAM_SIZE */ public static final String PARAM_BEAMSIZE = "beamSize"; diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizer.java new file mode 100644 index 0000000000..821f6ebd6a --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizer.java @@ -0,0 +1,216 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createNerMappingProvider; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.CasConfigurableStreamProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.opennlp.internal.OpenNlpSequenceTagsetDescriptionProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.namefind.TokenNameFinder; +import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.util.Span; + +/** + * OpenNLP name finder wrapper. + */ +@Component(OperationType.NAMED_ENTITITY_RECOGNIZER) +@ResourceMetaData(name = "OpenNLP Named Entity Recognizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) +public class OpenNlpNamedEntityRecognizer + extends JCasAnnotator_ImplBase +{ + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Variant of a model the model. Used to address a specific model if here are multiple models + * for one language. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = true, defaultValue = "person") + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Location from which the model is read. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_NER) + protected String modelLocation; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Location of the mapping file for named entity tags to UIMA types. + */ + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = + ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) + protected String mappingLocation; + + private CasConfigurableProviderBase<TokenNameFinder> modelProvider; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new CasConfigurableStreamProviderBase<TokenNameFinder>() + { + { + setContextObject(OpenNlpNamedEntityRecognizer.this); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(ARTIFACT_ID, + "de.tudarmstadt.ukp.dkpro.core.opennlp-model-ner-${language}-${variant}"); + + setDefaultVariantsLocation("${package}/lib/ner-default-variants.map"); + setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" + + "ner-${language}-${variant}.bin"); + + setOverride(ARTIFACT_URI, modelArtifactUri); + setOverride(LOCATION, modelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected TokenNameFinder produceResource(InputStream aStream) + throws Exception + { + TokenNameFinderModel model = new TokenNameFinderModel(aStream); + + if (printTagSet) { + OpenNlpSequenceTagsetDescriptionProvider tsdp = + new OpenNlpSequenceTagsetDescriptionProvider( + null, NamedEntity.class, model.getNameFinderSequenceModel()); + tsdp.setTagSplitPattern("-(?=[^-]*$)"); + // FIXME addTagset(tsdp) + getContext().getLogger().log(INFO, tsdp.toString()); + } + + return new NameFinderME(model); + } + }; + + mappingProvider = createNerMappingProvider(this, mappingLocation, language, variant, + modelProvider); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + modelProvider.configure(cas); + mappingProvider.configure(cas); + + modelProvider.getResource().clearAdaptiveData(); + + Map<Sentence, List<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); + for (Sentence sentence : select(aJCas, Sentence.class)) { + // get the document text + List<Token> tokenList = new ArrayList<>(index.get(sentence)); + String[] tokens = toText(tokenList).toArray(new String[tokenList.size()]); + + // test the string + Span[] namedEntities = modelProvider.getResource().find(tokens); + + // get the named entities and their character offsets + for (Span namedEntity : namedEntities) { + int begin = tokenList.get(namedEntity.getStart()).getBegin(); + int end = tokenList.get(namedEntity.getEnd() - 1).getEnd(); + + Type type = mappingProvider.getTagType(namedEntity.getType()); + NamedEntity neAnno = (NamedEntity) cas.createAnnotation(type, begin, end); + neAnno.setValue(namedEntity.getType()); + neAnno.addToIndexes(); + } + } + } +} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainer.java similarity index 77% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainer.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainer.java index 087aa67f8e..db6bfec33b 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainer.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import java.io.File; import java.io.FileInputStream; @@ -38,12 +38,17 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.opennlp.internal.CasNameSampleStream; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.CasNameSampleStream; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; import opennlp.tools.ml.BeamSearch; import opennlp.tools.ml.EventTrainer; import opennlp.tools.ml.maxent.GISTrainer; @@ -61,8 +66,18 @@ /** * Train a named entity recognizer model for OpenNLP. */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) @MimeTypeCapability(MimeTypes.APPLICATION_X_OPENNLP_NER) -@ResourceMetaData(name="OpenNLP Named Entity Recognizer Trainer") +@Parameters( + exclude = { + OpenNlpNamedEntityRecognizerTrainer.PARAM_TARGET_LOCATION }) +@ResourceMetaData(name = "OpenNLP Named Entity Recognizer Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) public class OpenNlpNamedEntityRecognizerTrainer extends JCasConsumer_ImplBase { @@ -87,19 +102,26 @@ private SequenceCodec<String> getCodec() } } + /** + * Store this language to the model instead of the document language. + */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) private String language; + /** + * Location to which the output is written. + */ public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) private File targetLocation; /** - * Regex to filter the {@link de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity#getValue() named entity} by - * type. + * Regex to filter the {@link de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity#getValue() + * named entity} by type. */ - public static final String PARAM_ACCEPTED_TAGS_REGEX = ComponentParameters.PARAM_ACCEPTED_TAGS_REGEX; + public static final String PARAM_ACCEPTED_TAGS_REGEX = + ComponentParameters.PARAM_ACCEPTED_TAGS_REGEX; @ConfigurationParameter(name = PARAM_ACCEPTED_TAGS_REGEX, mandatory = false) protected String acceptedTagsRegex; @@ -110,17 +132,28 @@ private SequenceCodec<String> getCodec() * @see SimplePerceptronSequenceTrainer#PERCEPTRON_SEQUENCE_VALUE */ public static final String PARAM_ALGORITHM = "algorithm"; - @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, defaultValue = PerceptronTrainer.PERCEPTRON_VALUE) + @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, + defaultValue = PerceptronTrainer.PERCEPTRON_VALUE) private String algorithm; + /** + * Training algorithm. + */ public static final String PARAM_TRAINER_TYPE = "trainerType"; - @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, defaultValue = EventTrainer.EVENT_VALUE) + @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, + defaultValue = EventTrainer.EVENT_VALUE) private String trainerType; + /** + * Number of training iterations. + */ public static final String PARAM_ITERATIONS = "iterations"; @ConfigurationParameter(name = PARAM_ITERATIONS, mandatory = true, defaultValue = "300") private int iterations; + /** + * Frequency cut-off. + */ public static final String PARAM_CUTOFF = "cutoff"; @ConfigurationParameter(name = PARAM_CUTOFF, mandatory = true, defaultValue = "0") private int cutoff; @@ -132,14 +165,23 @@ private SequenceCodec<String> getCodec() @ConfigurationParameter(name = PARAM_BEAMSIZE, mandatory = true, defaultValue = "3") private int beamSize; - public static final String PARAM_FEATURE_GEN = "featureGen"; - @ConfigurationParameter(name = PARAM_FEATURE_GEN, mandatory = false) + /** + * File containing the feature generation specification. + */ + public static final String PARAM_FEATURE_GEN_LOCATION = "featureGen"; + @ConfigurationParameter(name = PARAM_FEATURE_GEN_LOCATION, mandatory = false) private File featureGen; + /** + * Type of sequence encoding to use. + */ public static final String PARAM_SEQUENCE_ENCODING = "sequenceEncoding"; - @ConfigurationParameter(name = PARAM_SEQUENCE_ENCODING, mandatory = true, defaultValue="BILOU") + @ConfigurationParameter(name = PARAM_SEQUENCE_ENCODING, mandatory = true, defaultValue = "BILOU") private SequenceEncoding sequenceEncoding; + /** + * Number of parallel threads. + */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") private int numThreads; @@ -157,7 +199,8 @@ public void initialize(UimaContext aContext) if (acceptedTagsRegex != null) { Pattern filterPattern = Pattern.compile(acceptedTagsRegex); - stream.setNamedEntityFilter(namedEntity -> filterPattern.matcher(namedEntity.getValue()).matches()); + stream.setNamedEntityFilter(namedEntity -> + filterPattern.matcher(namedEntity.getValue()).matches()); } TrainingParameters params = new TrainingParameters(); @@ -169,13 +212,13 @@ public void initialize(UimaContext aContext) params.put(TrainingParameters.THREADS_PARAM, Integer.toString(numThreads)); params.put(BeamSearch.BEAM_SIZE_PARAMETER, Integer.toString(beamSize)); - byte featureGenCfg[] = loadFeatureGen(featureGen); + byte[] featureGenCfg = loadFeatureGen(featureGen); Callable<TokenNameFinderModel> trainTask = () -> { try { return NameFinderME.train(language, null, stream, params, new TokenNameFinderFactory(featureGenCfg, - Collections.<String, Object> emptyMap(), + Collections.<String, Object>emptyMap(), sequenceEncoding.getCodec())); } catch (Throwable e) { @@ -226,7 +269,7 @@ public void collectionProcessComplete() private byte[] loadFeatureGen(File aFile) throws ResourceInitializationException { - byte featureGenCfg[] = null; + byte[] featureGenCfg = null; if (aFile != null) { try (InputStream in = new FileInputStream(aFile)) { featureGenCfg = IOUtils.toByteArray(in); diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpParser.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpParser.java new file mode 100644 index 0000000000..b73c71987d --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpParser.java @@ -0,0 +1,353 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createConstituentMappingProvider; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.FSCollectionFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.opennlp.internal.OpenNlpParserTagsetDescriptionProvider; +import org.dkpro.core.opennlp.internal.OpenNlpTagsetDescriptionProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import opennlp.tools.parser.AbstractBottomUpParser; +import opennlp.tools.parser.Parse; +import opennlp.tools.parser.Parser; +import opennlp.tools.parser.ParserFactory; +import opennlp.tools.parser.ParserModel; +import opennlp.tools.util.Span; + +/** + * OpenNLP parser. The parser ignores existing POS tags and internally creates new ones. However, + * these tags are only added as annotation if explicitly requested via {@link #PARAM_WRITE_POS}. + */ +@Component(OperationType.CONSTITUENCY_PARSER) +@ResourceMetaData(name = "OpenNLP Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree"}) +public class OpenNlpParser + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_PARSER) + protected String modelLocation; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Location of the mapping file for constituent tags to UIMA types. + */ + public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = + ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) + protected String constituentMappingLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Sets whether to create or not to create POS tags. The creation of + * constituent tags must be turned on for this to work. + */ + public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; + @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "false") + private boolean createPosTags; + + /** + * If this parameter is set to true, each sentence is annotated with a PennTree-Annotation, + * containing the whole parse tree in Penn Treebank style format. + */ + public static final String PARAM_WRITE_PENN_TREE = ComponentParameters.PARAM_WRITE_PENN_TREE; + @ConfigurationParameter(name = PARAM_WRITE_PENN_TREE, mandatory = true, defaultValue = "false") + private boolean createPennTreeString; + + private CasConfigurableProviderBase<Parser> modelProvider; + private MappingProvider posMappingProvider; + private MappingProvider constituentMappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new OpenNlpParserModelProvider(); + + posMappingProvider = MappingProviderFactory.createPosMappingProvider(this, + posMappingLocation, language, modelProvider); + + constituentMappingProvider = createConstituentMappingProvider(this, + constituentMappingLocation, language, modelProvider); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + modelProvider.configure(cas); + posMappingProvider.configure(cas); + constituentMappingProvider.configure(cas); + + for (Sentence sentence : select(aJCas, Sentence.class)) { + List<Token> tokens = selectCovered(aJCas, Token.class, sentence); + + Parse parseInput = new Parse(cas.getDocumentText(), + new Span(sentence.getBegin(), sentence.getEnd()), + AbstractBottomUpParser.INC_NODE, 0, 0); + int i = 0; + for (Token t : tokens) { + parseInput.insert(new Parse(cas.getDocumentText(), + new Span(t.getBegin(), t.getEnd()), AbstractBottomUpParser.TOK_NODE, 0, i)); + i++; + } + + Parse parseOutput = modelProvider.getResource().parse(parseInput); + + createConstituentAnnotationFromTree(aJCas, parseOutput, null, tokens); + + if (createPennTreeString) { + StringBuffer sb = new StringBuffer(); + parseOutput.setType("ROOT"); // in DKPro the root is ROOT, not TOP + parseOutput.show(sb); + + PennTree pTree = new PennTree(aJCas, sentence.getBegin(), sentence.getEnd()); + pTree.setPennTree(sb.toString()); + pTree.addToIndexes(); + } + } + } + + /** + * Creates linked constituent annotations + POS annotations + * + * @param aNode + * the source tree + * @return the child-structure (needed for recursive call only) + */ + private Annotation createConstituentAnnotationFromTree(JCas aJCas, Parse aNode, + Annotation aParentFS, List<Token> aTokens) + { + // If the node is a word-level constituent node (== POS): + // create parent link on token and (if not turned off) create POS tag + if (aNode.isPosTag()) { + Token token = getToken(aTokens, aNode.getSpan().getStart(), aNode.getSpan().getEnd()); + + // link token to its parent constituent + if (aParentFS != null) { + token.setParent(aParentFS); + } + + // only add POS to index if we want POS-tagging + if (createPosTags) { + Type posTag = posMappingProvider.getTagType(aNode.getType()); + POS posAnno = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), + token.getEnd()); + posAnno.setPosValue(aNode.getType() != null ? aNode.getType().intern() : null); + POSUtils.assignCoarseValue(posAnno); + posAnno.addToIndexes(); + token.setPos(posAnno); + } + + return token; + } + // Check if node is a constituent node on sentence or phrase-level + else { + String typeName = aNode.getType(); + if (AbstractBottomUpParser.TOP_NODE.equals(typeName)) { + typeName = "ROOT"; // in DKPro the root is ROOT, not TOP + } + + // create the necessary objects and methods + Type constType = constituentMappingProvider.getTagType(typeName); + + Constituent constAnno = (Constituent) aJCas.getCas().createAnnotation(constType, + aNode.getSpan().getStart(), aNode.getSpan().getEnd()); + constAnno.setConstituentType(typeName); + + // link to parent + if (aParentFS != null) { + constAnno.setParent(aParentFS); + } + + // Do we have any children? + List<Annotation> childAnnotations = new ArrayList<Annotation>(); + for (Parse child : aNode.getChildren()) { + Annotation childAnnotation = createConstituentAnnotationFromTree(aJCas, child, + constAnno, aTokens); + if (childAnnotation != null) { + childAnnotations.add(childAnnotation); + } + } + + // Now that we know how many children we have, link annotation of + // current node with its children + FSArray childArray = FSCollectionFactory.createFSArray(aJCas, childAnnotations); + constAnno.setChildren(childArray); + + // write annotation for current node to index + aJCas.addFsToIndexes(constAnno); + + return constAnno; + } + } + + /** + * Given a list of tokens (e.g. those from a sentence) return the one at the specified position. + */ + private Token getToken(List<Token> aTokens, int aBegin, int aEnd) + { + for (Token t : aTokens) { + if (aBegin == t.getBegin() && aEnd == t.getEnd()) { + return t; + } + } + throw new IllegalStateException("Token not found"); + } + + private class OpenNlpParserModelProvider + extends ModelProviderBase<Parser> + { + { + setContextObject(OpenNlpParser.this); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(ARTIFACT_ID, "${groupId}.opennlp-model-parser-${language}-${variant}"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/parser-${language}-${variant}.properties"); + setDefault(VARIANT, "chunking"); + + setOverride(LOCATION, modelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected Parser produceResource(InputStream aStream) + throws Exception + { + ParserModel model = new ParserModel(aStream); + Properties metadata = getResourceMetaData(); + + addTagset(new OpenNlpTagsetDescriptionProvider( + metadata.getProperty("pos.tagset"), POS.class, model.getParserTaggerModel() + .getPosModel())); + addTagset(new OpenNlpParserTagsetDescriptionProvider( + metadata.getProperty("constituent.tagset"), Constituent.class, model, + metadata)); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return ParserFactory.create(model); + } + } +} diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpPosTagger.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpPosTagger.java new file mode 100644 index 0000000000..25d4768406 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpPosTagger.java @@ -0,0 +1,283 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.apache.uima.util.Level.INFO; + +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.opennlp.internal.OpenNlpTagsetDescriptionProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTaggerME; + +/** + * Part-of-Speech annotator using OpenNLP. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "OpenNLP POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +//NOTE: This file contains Asciidoc markers for partial inclusion of this file in the documentation +//Do not remove these tags! +// tag::capabilities[] +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) +public class OpenNlpPosTagger + extends JCasAnnotator_ImplBase +{ +// end::capabilities[] + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_TAGGER) + protected String modelLocation; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) + private String modelEncoding; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + protected CasConfigurableProviderBase<POSTaggerME> modelProvider; + private MappingProvider mappingProvider; + private Charset encoding; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + encoding = modelEncoding != null ? Charset.forName(modelEncoding) : null; + +// tag::model-provider-decl[] + // Use ModelProviderBase convenience constructor to set up a model provider that + // auto-detects most of its settings and is configured to use default variants. + // Auto-detection inspects the configuration parameter fields (@ConfigurationParameter) + // of the analysis engine class and looks for default parameters such as PARAM_LANGUAGE, + // PARAM_VARIANT, and PARAM_MODEL_LOCATION. + modelProvider = new ModelProviderBase<POSTaggerME>(this, "tagger") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/tagger-${language}-${variant}.properties"); + } + + @Override + protected POSTaggerME produceResource(InputStream aStream) + throws Exception + { + // Load the POS tagger model from the location the model provider offers + POSModel model = new POSModel(aStream); +// end::model-provider-decl[] + + // Extract tagset information from the model + OpenNlpTagsetDescriptionProvider tsdp = new OpenNlpTagsetDescriptionProvider( + getResourceMetaData().getProperty("pos.tagset"), POS.class, + model.getPosModel()); + if (getResourceMetaData().containsKey("pos.tagset.tagSplitPattern")) { + tsdp.setTagSplitPattern(getResourceMetaData().getProperty( + "pos.tagset.tagSplitPattern")); + } + addTagset(tsdp); + + if (printTagSet) { + getContext().getLogger().log(INFO, tsdp.toString()); + } + +// tag::model-provider-decl[] + // Create a new POS tagger instance from the loaded model + return new POSTaggerME(model); + } + }; +// end::model-provider-decl[] + +// tag::mapping-provider-decl[] + // General setup of the mapping provider in initialize() + mappingProvider = MappingProviderFactory.createPosMappingProvider(this, posMappingLocation, + language, modelProvider); +// end::mapping-provider-decl[] + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { +// tag::model-provider-use-1[] + CAS cas = aJCas.getCas(); + + // Document-specific configuration of model and mapping provider in process() + modelProvider.configure(cas); +// end::model-provider-use-1[] + +// tag::mapping-provider-use-1[] + // Mind the mapping provider must be configured after the model provider as it uses the + // model metadata + mappingProvider.configure(cas); +// end::mapping-provider-use-1[] + + // When packaging a model, it is possible to store additional metadata. Here we fetch such a + // model metadata property that we use to determine if the tag produced by the tagger needs + // to be post-processed. This property is specific to the DKPro Core OpenNLP models + String tagSplitPattern = modelProvider.getResourceMetaData().getProperty( + "pos.tagset.tagSplitPattern"); + + Map<Sentence, List<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); + for (Sentence sentence : select(aJCas, Sentence.class)) { +// tag::model-provider-use-2[] + Collection<Token> tokens = index.get(sentence); + String[] tokenTexts = toText(tokens).toArray(new String[tokens.size()]); + fixEncoding(tokenTexts); + + // Fetch the OpenNLP pos tagger instance configured with the right model and use it to + // tag the text + String[] tags = modelProvider.getResource().tag(tokenTexts); +// end::model-provider-use-2[] + + int i = 0; + for (Token t : tokens) { + String tag = tags[i]; + + // Post-process the tag if necessary + if (tagSplitPattern != null) { + tag = tag.split(tagSplitPattern)[0]; + } + +// tag::mapping-provider-use-2[] + // Convert the tag produced by the tagger to an UIMA type, create an annotation + // of this type, and add it to the document. + Type posTag = mappingProvider.getTagType(tag); + POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); + // To save memory, we typically intern() tag strings + posAnno.setPosValue(tag != null ? tag.intern() : null); + POSUtils.assignCoarseValue(posAnno); + posAnno.addToIndexes(); +// end::mapping-provider-use-2[] + + // Connect the POS annotation to the respective token annotation + t.setPos(posAnno); + i++; + } + } + } + + private void fixEncoding(String[] aTokenTexts) + throws AnalysisEngineProcessException + { + // "Fix" encoding before passing to a model which was trained with encoding problems + if (encoding != null && !"UTF-8".equals(encoding.name())) { + for (int i = 0; i < aTokenTexts.length; i++) { + aTokenTexts[i] = new String(aTokenTexts[i].getBytes(StandardCharsets.UTF_8), + encoding); + } + } + } +} diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTrainer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTrainer.java new file mode 100644 index 0000000000..17a1c5ddb8 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTrainer.java @@ -0,0 +1,138 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import java.util.concurrent.Callable; + +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.opennlp.internal.CasPosSampleStream; +import org.dkpro.core.opennlp.internal.OpenNlpTrainerBase; + +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import opennlp.tools.ml.BeamSearch; +import opennlp.tools.ml.EventTrainer; +import opennlp.tools.ml.maxent.GISTrainer; +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTaggerFactory; +import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.model.BaseModel; + +/** + * Train a POS tagging model for OpenNLP. + */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) +@MimeTypeCapability(MimeTypes.APPLICATION_X_OPENNLP_TAGGER) +@ResourceMetaData(name = "OpenNLP POS-Tagger Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) +public class OpenNlpPosTaggerTrainer + extends OpenNlpTrainerBase<CasPosSampleStream> +{ + /** + * Store this language to the model instead of the document language. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) + private String language; + + /** + * Training algorithm. + */ + public static final String PARAM_ALGORITHM = "algorithm"; + @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, + defaultValue = GISTrainer.MAXENT_VALUE) + private String algorithm; + + /** + * Trainer type. + */ + public static final String PARAM_TRAINER_TYPE = "trainerType"; + @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, + defaultValue = EventTrainer.EVENT_VALUE) + private String trainerType; + + /** + * Number of training iterations. + */ + public static final String PARAM_ITERATIONS = "iterations"; + @ConfigurationParameter(name = PARAM_ITERATIONS, mandatory = true, defaultValue = "100") + private int iterations; + + /** + * Frequency cut-off. + */ + public static final String PARAM_CUTOFF = "cutoff"; + @ConfigurationParameter(name = PARAM_CUTOFF, mandatory = true, defaultValue = "5") + private int cutoff; + + /** + * @see POSTaggerME#DEFAULT_BEAM_SIZE + */ + public static final String PARAM_BEAMSIZE = "beamSize"; + @ConfigurationParameter(name = PARAM_BEAMSIZE, mandatory = true, defaultValue = "3") + private int beamSize; + + /** + * Number of parallel threads. + */ + public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; + @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") + private int numThreads; + + @Override + public CasPosSampleStream makeSampleStream() + { + return new CasPosSampleStream(); + } + + @Override + public Callable<? extends BaseModel> makeTrainer() + { + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ALGORITHM_PARAM, algorithm); + params.put(TrainingParameters.TRAINER_TYPE_PARAM, trainerType); + params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations)); + params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff)); + params.put(TrainingParameters.THREADS_PARAM, Integer.toString(numThreads)); + params.put(BeamSearch.BEAM_SIZE_PARAMETER, Integer.toString(beamSize)); + + Callable<POSModel> trainTask = () -> { + try { + return POSTaggerME.train(language, getStream(), params, new POSTaggerFactory()); + } + catch (Throwable e) { + getStream().close(); + throw e; + } + }; + + return trainTask; + } +} diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSegmenter.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSegmenter.java new file mode 100644 index 0000000000..d0742e054c --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSegmenter.java @@ -0,0 +1,194 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.api.resources.ResourceObjectProviderBase.PACKAGE; + +import java.io.InputStream; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.CasConfigurableStreamProviderBase; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import eu.openminted.share.annotations.api.DocumentationResource; +import opennlp.tools.sentdetect.SentenceDetectorME; +import opennlp.tools.sentdetect.SentenceModel; +import opennlp.tools.tokenize.TokenizerME; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.Span; + +/** + * Tokenizer and sentence splitter using OpenNLP. + */ +@ResourceMetaData(name = "OpenNLP Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) +public class OpenNlpSegmenter + extends SegmenterBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * Load the segmentation model from this location instead of locating the model automatically. + */ + public static final String PARAM_SEGMENTATION_MODEL_LOCATION = + ComponentParameters.PARAM_SEGMENTATION_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_SEGMENTATION_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_SENT) + protected String segmentationModelLocation; + + /** + * Load the tokenization model from this location instead of locating the model automatically. + */ + public static final String PARAM_TOKENIZATION_MODEL_LOCATION = + ComponentParameters.PARAM_TOKENIZATION_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_TOKENIZATION_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_OPENNLP_TOKEN) + protected String tokenizationModelLocation; + + private CasConfigurableProviderBase<SentenceDetectorME> sentenceModelProvider; + private CasConfigurableProviderBase<TokenizerME> tokenModelProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + sentenceModelProvider = new CasConfigurableStreamProviderBase<SentenceDetectorME>() { + { + setContextObject(OpenNlpSegmenter.this); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(ARTIFACT_ID, + "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-${language}-${variant}"); + + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/sentence-${language}-${variant}.properties"); + setDefault(VARIANT, "maxent"); + + setOverride(LOCATION, segmentationModelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected SentenceDetectorME produceResource(InputStream aStream) + throws Exception + { + SentenceModel model = new SentenceModel(aStream); + return new SentenceDetectorME(model); + } + }; + + tokenModelProvider = new CasConfigurableStreamProviderBase<TokenizerME>() { + { + setContextObject(OpenNlpSegmenter.this); + + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(ARTIFACT_ID, + "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-${language}-${variant}"); + + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/token-${language}-${variant}.properties"); + setDefault(VARIANT, "maxent"); + + setOverride(LOCATION, tokenizationModelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + } + + @Override + protected TokenizerME produceResource(InputStream aStream) + throws Exception + { + TokenizerModel model = new TokenizerModel(aStream); + return new TokenizerME(model); + } + }; + tokenModelProvider.setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/opennlp"); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + if (isWriteSentence()) { + sentenceModelProvider.configure(cas); + } + + if (isWriteToken()) { + tokenModelProvider.configure(cas); + } + + super.process(aJCas); + } + + @Override + protected void process(JCas aJCas, String aText, int aZoneBegin) + throws AnalysisEngineProcessException + { + if (isWriteSentence()) { + Span[] sentences = sentenceModelProvider.getResource().sentPosDetect(aText); + for (Span sSpan : sentences) { + createSentence(aJCas, sSpan.getStart() + aZoneBegin, sSpan.getEnd() + aZoneBegin); + } + } + + if (isWriteToken()) { + for (Sentence sent : selectCovered(aJCas, Sentence.class, aZoneBegin, + aZoneBegin + aText.length())) { + Span[] tokens = tokenModelProvider.getResource().tokenizePos(sent.getCoveredText()); + for (Span tSpan : tokens) { + createToken(aJCas, tSpan.getStart() + sent.getBegin(), + tSpan.getEnd() + sent.getBegin()); + } + } + } + } +} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSentenceTrainer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSentenceTrainer.java similarity index 80% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSentenceTrainer.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSentenceTrainer.java index 77fd7cf398..2ca1c0038a 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSentenceTrainer.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSentenceTrainer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import java.io.File; import java.io.FileOutputStream; @@ -36,13 +36,18 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.opennlp.internal.CasSentenceSampleStream; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.CasSentenceSampleStream; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.ml.EventTrainer; import opennlp.tools.ml.maxent.GISTrainer; @@ -54,47 +59,87 @@ /** * Train a sentence splitter model for OpenNLP. */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) @MimeTypeCapability(MimeTypes.APPLICATION_X_OPENNLP_SENT) -@ResourceMetaData(name="OpenNLP Sentence Splitter Trainer") +@Parameters( + exclude = { + OpenNlpSentenceTrainer.PARAM_TARGET_LOCATION }) +@ResourceMetaData(name = "OpenNLP Sentence Splitter Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) public class OpenNlpSentenceTrainer extends JCasConsumer_ImplBase { + /** + * Store this language to the model instead of the document language. + */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) private String language; + /** + * Location to which the output is written. + */ public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) private File targetLocation; + /** + * Training algorithm. + */ public static final String PARAM_ALGORITHM = "algorithm"; - @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, defaultValue = GISTrainer.MAXENT_VALUE) + @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, + defaultValue = GISTrainer.MAXENT_VALUE) private String algorithm; + /** + * Trainer type. + */ public static final String PARAM_TRAINER_TYPE = "trainerType"; - @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, defaultValue = EventTrainer.EVENT_VALUE) + @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, + defaultValue = EventTrainer.EVENT_VALUE) private String trainerType; + /** + * Number of training iterations. + */ public static final String PARAM_ITERATIONS = "iterations"; @ConfigurationParameter(name = PARAM_ITERATIONS, mandatory = true, defaultValue = "100") private int iterations; + /** + * Frequency cut-off. + */ public static final String PARAM_CUTOFF = "cutoff"; @ConfigurationParameter(name = PARAM_CUTOFF, mandatory = true, defaultValue = "5") private int cutoff; + /** + * End-of-sentence characters. + */ public static final String PARAM_EOS_CHARACTERS = "eosCharacters"; @ConfigurationParameter(name = PARAM_EOS_CHARACTERS, mandatory = false) private char[] eosCharacters; + /** + * Location of the abbreviation dictionary. + */ public static final String PARAM_ABBREVIATION_DICTIONARY_LOCATION = "abbreviationDictionaryLocation"; @ConfigurationParameter(name = PARAM_ABBREVIATION_DICTIONARY_LOCATION, mandatory = false) private String abbreviationDictionaryLocation; + /** + * Encoding of the abbreviation dictionary. + */ public static final String PARAM_ABBREVIATION_DICTIONARY_ENCODING = "abbreviationDictionaryEncoding"; @ConfigurationParameter(name = PARAM_ABBREVIATION_DICTIONARY_ENCODING, mandatory = true, defaultValue = "UTF-8") private String abbreviationDictionaryEncoding; + /** + * Number of parallel threads. + */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") private int numThreads; diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSnowballStemmer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSnowballStemmer.java new file mode 100644 index 0000000000..fb50cecbf6 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpSnowballStemmer.java @@ -0,0 +1,225 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.commons.lang3.StringUtils.isBlank; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeaturePath; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.parameter.ComponentParameters; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; +import opennlp.tools.stemmer.Stemmer; +import opennlp.tools.stemmer.snowball.SnowballStemmer; +import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM; + +/** + * <p>UIMA wrapper for the Snowball stemmer included with OpenNLP. Annotation types to be stemmed + * can be configured by a {@link FeaturePath}.</p> + * <p>If you use this component in a pipeline which uses stop word removal, make sure that it + * runs after the stop word removal step, so only words that are no stop words are stemmed.</p> + * + * @see FeaturePathAnnotatorBase + */ +@Component(OperationType.STEMMER) +@ResourceMetaData(name = "OpenNLP Snowball Stemmer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability({ "ar", "da", "nl", "en", "fi", "fr", "de", "el", "hu", "ga", "it", "no", "pt", + "ro", "ru", "es", "sv", "tr" }) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem"}) +public class OpenNlpSnowballStemmer + extends FeaturePathAnnotatorBase +{ + private static final String MESSAGE_DIGEST = OpenNlpSnowballStemmer.class.getName() + "_Messages"; + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Per default the stemmer runs in case-sensitive mode. If this parameter is enabled, tokens + * are lower-cased before being passed to the stemmer. + * + * <table border="1"> + * <caption>Examples</caption> + * <tr><th></th><th>false (default)</th><th>true</th></tr> + * <tr><td>EDUCATIONAL</td><td>EDUCATIONAL</td><td>educ</td></tr> + * <tr><td>Educational</td><td>Educat</td><td>educ</td></tr> + * <tr><td>educational</td><td>educ</td><td>educ</td></tr> + * </table> + */ + public static final String PARAM_LOWER_CASE = "lowerCase"; + @ConfigurationParameter(name = PARAM_LOWER_CASE, mandatory = false, defaultValue = "false") + protected boolean lowerCase; + + public static final Map<String, ALGORITHM> languages = new HashMap<>(); + + static { + languages.put("ar", ALGORITHM.ARABIC); + languages.put("da", ALGORITHM.DANISH); + languages.put("nl", ALGORITHM.DUTCH); + languages.put("en", ALGORITHM.ENGLISH); + languages.put("fi", ALGORITHM.FINNISH); + languages.put("fr", ALGORITHM.FRENCH); + languages.put("de", ALGORITHM.GERMAN); + languages.put("el", ALGORITHM.GREEK); + languages.put("hu", ALGORITHM.HUNGARIAN); + languages.put("ga", ALGORITHM.IRISH); + languages.put("it", ALGORITHM.ITALIAN); + languages.put("no", ALGORITHM.NORWEGIAN); + languages.put("pt", ALGORITHM.PORTUGUESE); + languages.put("ro", ALGORITHM.ROMANIAN); + languages.put("ru", ALGORITHM.RUSSIAN); + languages.put("es", ALGORITHM.SPANISH); + languages.put("sv", ALGORITHM.SWEDISH); + languages.put("tr", ALGORITHM.TURKISH); + } + + @Override + protected Set<String> getDefaultPaths() + { + return Collections.singleton(Token.class.getName()); + } + + @Override + protected void generateAnnotations(JCas jcas) + throws AnalysisEngineProcessException, FeaturePathException + { + // CAS is necessary to retrieve values + CAS currCAS = jcas.getCas(); + + String lang = language != null ? language : jcas.getDocumentLanguage(); + if (isBlank(lang)) { + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null); + } + + ALGORITHM algorithm = languages.get(jcas.getDocumentLanguage()); + if (algorithm == null) { + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, + "unsupported_language_error", new Object[] { lang }); + } + + Stemmer stemmer = new SnowballStemmer(algorithm); + + for (String path : paths) { + + // Separate Typename and featurepath + String[] segments = path.split("/", 2); + String typeName = segments[0]; + + // Try to get the type from the typesystem of the CAS + Type t = currCAS.getTypeSystem().getType(typeName); + if (t == null) { + throw new IllegalStateException("Type [" + typeName + "] not found in type system"); + } + + // get an fpi object and initialize it + // initialize the FeaturePathInfo with the corresponding part + initializeFeaturePathInfoFrom(fp, segments); + + // get the annotations + AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t); + FSIterator<?> iterator = idx.iterator(); + + while (iterator.hasNext()) { + AnnotationFS fs = (AnnotationFS) iterator.next(); + + try { + if (this.filterFeaturePath != null) { + // check annotation filter condition + if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { + createStemAnnotation(stemmer, jcas, fs); + } + } + else { // no annotation filter specified + createStemAnnotation(stemmer, jcas, fs); + } + } + catch (AnalysisEngineProcessException e) { + // TODO Auto-generated catch block + throw new IllegalStateException( + "error occured while creating a stem annotation", e); + } + } + } + } + + /** + * Creates a Stem annotation with same begin and end as the AnnotationFS fs, the value is the + * stemmed value derived by applying the featurepath. + * + * @param jcas + * the JCas + * @param fs + * the AnnotationFS where the Stem annotation is created + * @throws AnalysisEngineProcessException + * if the {@code stem} method from the snowball stemmer cannot be invoked. + */ + private void createStemAnnotation(Stemmer aStemmer, JCas jcas, AnnotationFS fs) + throws AnalysisEngineProcessException + { + // Check for blank text, it makes no sense to add a stem then (and raised an exception) + String value = fp.getValue(fs); + if (!StringUtils.isBlank(value)) { + if (lowerCase) { + // Fixme - should use locale/language defined in CAS. + value = value.toLowerCase(Locale.US); + } + + Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); + stemAnnot.setValue(aStemmer.stem(value).toString()); + stemAnnot.addToIndexes(jcas); + + // Try setting the "stem" feature on Tokens. + Feature feat = fs.getType().getFeatureByBaseName("stem"); + if (feat != null && feat.getRange() != null + && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { + fs.setFeatureValue(feat, stemAnnot); + } + } + } +} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpTokenTrainer.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpTokenTrainer.java similarity index 80% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpTokenTrainer.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpTokenTrainer.java index d4c6496836..213a37330e 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpTokenTrainer.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/OpenNlpTokenTrainer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import java.io.File; import java.io.FileOutputStream; @@ -37,13 +37,18 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.opennlp.internal.CasTokenSampleStream; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.opennlp.internal.CasTokenSampleStream; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.ml.EventTrainer; import opennlp.tools.ml.maxent.GISTrainer; @@ -56,51 +61,95 @@ /** * Train a tokenizer model for OpenNLP. */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) @MimeTypeCapability(MimeTypes.APPLICATION_X_OPENNLP_TOKEN) -@ResourceMetaData(name="OpenNLP Tokenizer Trainer") +@Parameters( + exclude = { + OpenNlpTokenTrainer.PARAM_TARGET_LOCATION }) +@ResourceMetaData(name = "OpenNLP Tokenizer Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) public class OpenNlpTokenTrainer extends JCasConsumer_ImplBase { + /** + * Store this language to the model instead of the document language. + */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true) private String language; + /** + * Location to which the output is written. + */ public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) private File targetLocation; + /** + * Training algorithm. + */ public static final String PARAM_ALGORITHM = "algorithm"; - @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, defaultValue = GISTrainer.MAXENT_VALUE) + @ConfigurationParameter(name = PARAM_ALGORITHM, mandatory = true, + defaultValue = GISTrainer.MAXENT_VALUE) private String algorithm; + /** + * Trainer type. + */ public static final String PARAM_TRAINER_TYPE = "trainerType"; - @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, defaultValue = EventTrainer.EVENT_VALUE) + @ConfigurationParameter(name = PARAM_TRAINER_TYPE, mandatory = true, + defaultValue = EventTrainer.EVENT_VALUE) private String trainerType; + /** + * Number of training iterations. + */ public static final String PARAM_ITERATIONS = "iterations"; @ConfigurationParameter(name = PARAM_ITERATIONS, mandatory = true, defaultValue = "100") private int iterations; + /** + * Frequency cut-off. + */ public static final String PARAM_CUTOFF = "cutoff"; @ConfigurationParameter(name = PARAM_CUTOFF, mandatory = true, defaultValue = "5") private int cutoff; + /** + * If true alpha numerics are skipped. + */ public static final String PARAM_USE_ALPHANUMERIC_OPTIMIZATION = "useAlphaNumericOptimization"; @ConfigurationParameter(name = PARAM_USE_ALPHANUMERIC_OPTIMIZATION, mandatory = true, defaultValue = "true") private boolean useAlphaNumericOptimization; + /** + * Regular expression to detect alpha numerics. + */ public static final String PARAM_ALPHA_NUMERIC_PATTERN = "alphaNumericPattern"; - @ConfigurationParameter(name = PARAM_ALPHA_NUMERIC_PATTERN, mandatory = false, defaultValue = Factory.DEFAULT_ALPHANUMERIC) + @ConfigurationParameter(name = PARAM_ALPHA_NUMERIC_PATTERN, mandatory = false, + defaultValue = Factory.DEFAULT_ALPHANUMERIC) private Pattern alphaNumericPattern; + /** + * Location of the abbreviation dictionary. + */ public static final String PARAM_ABBREVIATION_DICTIONARY_LOCATION = "abbreviationDictionaryLocation"; @ConfigurationParameter(name = PARAM_ABBREVIATION_DICTIONARY_LOCATION, mandatory = false) private String abbreviationDictionaryLocation; + /** + * Encoding of the abbreviation dictionary. + */ public static final String PARAM_ABBREVIATION_DICTIONARY_ENCODING = "abbreviationDictionaryEncoding"; @ConfigurationParameter(name = PARAM_ABBREVIATION_DICTIONARY_ENCODING, mandatory = true, defaultValue = "UTF-8") private String abbreviationDictionaryEncoding; + /** + * Number of parallel threads. + */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = "1") private int numThreads; diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasChunkSampleStream.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasChunkSampleStream.java similarity index 93% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasChunkSampleStream.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasChunkSampleStream.java index d3185db9c8..52418682a7 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasChunkSampleStream.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasChunkSampleStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -29,8 +29,8 @@ import org.apache.uima.cas.Type; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.IobEncoder; -import de.tudarmstadt.ukp.dkpro.core.api.io.IobEncoder; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; @@ -77,7 +77,7 @@ public ChunkSample produce(JCas aJCas) for (Token t : selectCovered(Token.class, sentence)) { words.add(t.getText()); if (t.getPos() == null) { - throw new IllegalStateException("Token ["+t.getText()+"] has no POS"); + throw new IllegalStateException("Token [" + t.getText() + "] has no POS"); } tags.add(t.getPos().getPosValue()); preds.add(chunkEncoder.encode(t)); diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasLemmaSampleStream.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasLemmaSampleStream.java similarity index 90% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasLemmaSampleStream.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasLemmaSampleStream.java index 1a99fe70dc..ab2b3ebc18 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasLemmaSampleStream.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasLemmaSampleStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -23,6 +23,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; + import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; @@ -65,12 +66,12 @@ public LemmaSample produce(JCas aJCas) words.add(t.getText()); if (t.getPos() == null) { - throw new IllegalStateException("Token ["+t.getText()+"] has no POS"); + throw new IllegalStateException("Token [" + t.getText() + "] has no POS"); } tags.add(t.getPos().getPosValue()); - + if (t.getLemma() == null) { - throw new IllegalStateException("Token ["+t.getText()+"] has no lemma"); + throw new IllegalStateException("Token [" + t.getText() + "] has no lemma"); } lemmas.add(t.getLemma().getValue()); } diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasNameSampleStream.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasNameSampleStream.java similarity index 98% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasNameSampleStream.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasNameSampleStream.java index e4f9d0f2ec..2a3b4bce96 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasNameSampleStream.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasNameSampleStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -97,7 +97,7 @@ public NameSample produce(JCas aJCas) int begin = idxToken.get(idxTokenOffset.get(ne.getBegin())); int end = begin; if (ne.getEnd() > ne.getBegin()) { - end = idxToken.get(idxTokenOffset.get(ne.getEnd()-1)); + end = idxToken.get(idxTokenOffset.get(ne.getEnd() - 1)); } names.add(new Span(begin, end + 1, ne.getValue())); } diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasPosSampleStream.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasPosSampleStream.java similarity index 93% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasPosSampleStream.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasPosSampleStream.java index d3d47dfdad..362ad70308 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasPosSampleStream.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasPosSampleStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -23,6 +23,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; + import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; @@ -63,7 +64,7 @@ public POSSample produce(JCas aJCas) for (Token t : selectCovered(Token.class, sentence)) { words.add(t.getText()); if (t.getPos() == null) { - throw new IllegalStateException("Token ["+t.getText()+"] has no POS"); + throw new IllegalStateException("Token [" + t.getText() + "] has no POS"); } tags.add(t.getPos().getPosValue()); } diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasSampleStreamBase.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasSampleStreamBase.java similarity index 98% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasSampleStreamBase.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasSampleStreamBase.java index 1f047fb71a..93842957e1 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasSampleStreamBase.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasSampleStreamBase.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import java.io.IOException; import java.util.concurrent.TimeUnit; diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasSentenceSampleStream.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasSentenceSampleStream.java similarity index 96% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasSentenceSampleStream.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasSentenceSampleStream.java index 144f9731a5..29af0f3db9 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasSentenceSampleStream.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasSentenceSampleStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static org.apache.uima.fit.util.JCasUtil.select; @@ -63,4 +63,4 @@ public SentenceSample produce(JCas aJCas) return sample; } -} \ No newline at end of file +} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasTokenSampleStream.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasTokenSampleStream.java similarity index 96% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasTokenSampleStream.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasTokenSampleStream.java index 3269dff696..1d19dd3edb 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/CasTokenSampleStream.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/CasTokenSampleStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static org.apache.uima.fit.util.JCasUtil.select; @@ -62,4 +62,4 @@ public TokenSample produce(JCas aJCas) return sample; } -} \ No newline at end of file +} diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpChunkerTagsetDescriptionProvider.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpChunkerTagsetDescriptionProvider.java similarity index 94% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpChunkerTagsetDescriptionProvider.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpChunkerTagsetDescriptionProvider.java index 4310ce010b..a00e07521b 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpChunkerTagsetDescriptionProvider.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpChunkerTagsetDescriptionProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import java.util.Set; import java.util.TreeSet; @@ -25,7 +25,8 @@ public class OpenNlpChunkerTagsetDescriptionProvider extends OpenNlpTagsetDescriptionProvider { - public OpenNlpChunkerTagsetDescriptionProvider(String aName, Class<?> aLayer, MaxentModel aModel) + public OpenNlpChunkerTagsetDescriptionProvider(String aName, Class<?> aLayer, + MaxentModel aModel) { super(aName, aLayer, aModel); } diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpParserTagsetDescriptionProvider.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpParserTagsetDescriptionProvider.java similarity index 95% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpParserTagsetDescriptionProvider.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpParserTagsetDescriptionProvider.java index f796b67240..a08c895c5d 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpParserTagsetDescriptionProvider.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpParserTagsetDescriptionProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static java.util.Collections.singletonMap; @@ -24,14 +24,15 @@ import java.util.Set; import java.util.TreeSet; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.TagsetBase; +import org.dkpro.core.api.metadata.TagsetBase; + import opennlp.tools.ml.model.MaxentModel; import opennlp.tools.ml.model.SequenceClassificationModel; import opennlp.tools.parser.ParserModel; import opennlp.tools.util.TokenTag; public class OpenNlpParserTagsetDescriptionProvider -extends TagsetBase + extends TagsetBase { private String name; private String layer; diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpSequenceTagsetDescriptionProvider.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpSequenceTagsetDescriptionProvider.java similarity index 94% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpSequenceTagsetDescriptionProvider.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpSequenceTagsetDescriptionProvider.java index 6670062ddd..fde19d3ed9 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpSequenceTagsetDescriptionProvider.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpSequenceTagsetDescriptionProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static java.util.Collections.singletonMap; @@ -23,8 +23,9 @@ import java.util.Set; import java.util.TreeSet; +import org.dkpro.core.api.metadata.TagsetBase; + import opennlp.tools.ml.model.SequenceClassificationModel; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.TagsetBase; public class OpenNlpSequenceTagsetDescriptionProvider extends TagsetBase diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpTagsetDescriptionProvider.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpTagsetDescriptionProvider.java similarity index 94% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpTagsetDescriptionProvider.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpTagsetDescriptionProvider.java index c4744edda1..a69483086c 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpTagsetDescriptionProvider.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpTagsetDescriptionProvider.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import static java.util.Collections.singletonMap; @@ -23,8 +23,9 @@ import java.util.Set; import java.util.TreeSet; +import org.dkpro.core.api.metadata.TagsetBase; + import opennlp.tools.ml.model.MaxentModel; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.TagsetBase; public class OpenNlpTagsetDescriptionProvider extends TagsetBase diff --git a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpTrainerBase.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpTrainerBase.java similarity index 91% rename from dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpTrainerBase.java rename to dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpTrainerBase.java index 74ca275c78..0bc424e35d 100644 --- a/dkpro-core-opennlp-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/opennlp/internal/OpenNlpTrainerBase.java +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/OpenNlpTrainerBase.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp.internal; +package org.dkpro.core.opennlp.internal; import java.io.File; import java.io.FileOutputStream; @@ -33,16 +33,23 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; +import eu.openminted.share.annotations.api.Parameters; import opennlp.tools.util.model.BaseModel; /** * Train a model for OpenNLP. */ +@Parameters( + exclude = { + OpenNlpTrainerBase.PARAM_TARGET_LOCATION }) public abstract class OpenNlpTrainerBase<T extends CasSampleStreamBase> extends JCasConsumer_ImplBase { + /** + * Location to which the output is written. + */ public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) private File targetLocation; diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/package-info.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/package-info.java new file mode 100644 index 0000000000..ba66f331a6 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/internal/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * INTERNAL + * + * @since 1.5.0 + */ +package org.dkpro.core.opennlp.internal; diff --git a/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/package-info.java b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/package-info.java new file mode 100644 index 0000000000..3d19bd93e9 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/java/org/dkpro/core/opennlp/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Integration of the <a href="http://opennlp.apache.org/">Apache OpenNLP</a> tools. + * + * @since 1.4.0 + */ +package org.dkpro.core.opennlp; diff --git a/dkpro-core-opennlp-asl/src/main/resources/org/dkpro/core/opennlp/OpenNlpSnowballStemmer_Messages.properties b/dkpro-core-opennlp-asl/src/main/resources/org/dkpro/core/opennlp/OpenNlpSnowballStemmer_Messages.properties new file mode 100644 index 0000000000..8a65382c25 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/main/resources/org/dkpro/core/opennlp/OpenNlpSnowballStemmer_Messages.properties @@ -0,0 +1,6 @@ +#---------------------------------------------------- +#Catalog of Exception Messages for the OpenNlpSnowballStemmer +#---------------------------------------------------- + +no_language_error = language is not specified +unsupported_language_error = language {0} is not supported \ No newline at end of file diff --git a/dkpro-core-opennlp-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/tagger-default-variants.map b/dkpro-core-opennlp-asl/src/main/resources/org/dkpro/core/opennlp/lib/tagger-default-variants.map similarity index 100% rename from dkpro-core-opennlp-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/tagger-default-variants.map rename to dkpro-core-opennlp-asl/src/main/resources/org/dkpro/core/opennlp/lib/tagger-default-variants.map diff --git a/dkpro-core-opennlp-asl/src/scripts/build.xml b/dkpro-core-opennlp-asl/src/scripts/build.xml index c6aa47ec59..22f7a1c2b8 100644 --- a/dkpro-core-opennlp-asl/src/scripts/build.xml +++ b/dkpro-core-opennlp-asl/src/scripts/build.xml @@ -17,7 +17,7 @@ --> <project basedir="../.." default="separate-jars"> <import> - <url url="https://raw.githubusercontent.com/dkpro/resource-packager/0.6.0/ant-macros.xml"/> + <url url="https://raw.githubusercontent.com/dkpro/resource-packager/0.8.0/ant-macros.xml"/> </import> <!-- @@ -146,7 +146,7 @@ <antcall target="de-tagger-maxent"/> <antcall target="de-tagger-perceptron"/> <antcall target="de-chunker-default"/> - <antcall target="de-ner-nemgp"/> + <antcall target="de-ner-nemgp"/> </target> <target name="de-sentence-maxent"> @@ -299,7 +299,7 @@ <target name="en"> <antcall target="en-chunker-default"/> - <antcall target="en-chunker-perceptron-ixa"/> + <antcall target="en-chunker-perceptron-ixa"/> <antcall target="en-ner-date"/> <antcall target="en-ner-location"/> <antcall target="en-ner-money"/> @@ -308,11 +308,10 @@ <antcall target="en-ner-person"/> <antcall target="en-ner-time"/> <antcall target="en-parser-chunking"/> - <antcall target="en-parser-chunking-ixa"/> + <antcall target="en-parser-chunking-ixa"/> <antcall target="en-sentence-maxent"/> <antcall target="en-tagger-maxent"/> <antcall target="en-tagger-perceptron"/> - <antcall target="en-tagger-perceptron-ixa"/> <antcall target="en-token-maxent"/> </target> @@ -387,7 +386,7 @@ </metadata> </install-stub-and-upstream-file> </target> - + <target name="en-tagger-perceptron"> <mkdir dir="target/download"/> <!-- FILE: models-1.5/en-pos-perceptron.bin - - - - - - - - - - - - - - - - - - - - - - - - @@ -653,37 +652,6 @@ </install-stub-and-upstream-file> </target> - <target name="en-tagger-perceptron-ixa" depends="download-ixa-pos-resources"> - <mkdir dir="target/download"/> - <!-- FILE: en-pos-perceptron-c0-b3-dev.bin - - - - - - - - - - - - - - - - - - - - - - - - - - - 2013-11-15 | now | bf0545ff008f211bb282ce08f6977872 - --> - <install-stub-and-upstream-file - file="target/download/ixa-pos-resources/en-pos-perceptron-c0-b3-dev.bin" - md5="bf0545ff008f211bb282ce08f6977872" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.opennlp" - upstreamVersion="20131115" - metaDataVersion="1" - tool="tagger" - language="en" - variant="perceptron-ixa" - extension="bin"> - <metadata> - <entry key="DC.title" value="en-pos-perceptron-c0-b3-dev.bin"/> - <entry key="DC.creator" value="Rodrigo Agerri"/> - <entry key="DC.identifier" value="http://ixa2.si.ehu.es/ixa-pipes/models/pos-resources.tgz#en-pos-perceptron-c0-b3-dev.bin"/> - <entry key="pos.tagset" value="ptb"/> - <!-- - - Contains 78 tags because it includes multitags such as "IN|RB" or "RB|IN", looks - - otherwise like PTB tagset. We tell OpenNlpPosTagger to keep only the first - - tag. - --> - <entry key="pos.tagset.tagSplitPattern" value="\|"/> - </metadata> - </install-stub-and-upstream-file> - </target> - <target name="en-parser-chunking-ixa" depends="download-ixa-parser-resources"> <!-- FILE: en-parser-chunking.bin - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2014-04-26 | now | 47c1b3f4dd7d08e992abdca4e16638ce @@ -716,12 +684,10 @@ <antcall target="es-ner-organization"/> <antcall target="es-ner-person"/> <antcall target="es-tagger-maxent"/> - <antcall target="es-tagger-maxent-ixa"/> <antcall target="es-tagger-maxent-universal"/> <antcall target="es-tagger-perceptron"/> - <antcall target="es-tagger-perceptron-ixa"/> <antcall target="es-tagger-perceptron-universal"/> - <antcall target="es-parser-chunking-ixa"/> + <antcall target="es-parser-chunking-ixa"/> </target> <target name="es-tagger-maxent"> @@ -914,54 +880,6 @@ </install-stub-and-upstream-file> </target> - <target name="es-tagger-maxent-ixa" depends="download-ixa-pos-resources"> - <!-- FILE: es-pos-maxent-700-c0-b3.bin - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2014-04-25 | now | 6893840d709c581e378750b53f24531b - --> - <install-stub-and-upstream-file - file="target/download/ixa-pos-resources/es-pos-maxent-700-c0-b3.bin" - md5="6893840d709c581e378750b53f24531b" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.opennlp" - upstreamVersion="20140425" - metaDataVersion="1" - tool="tagger" - language="es" - variant="maxent-ixa" - extension="bin"> - <metadata> - <entry key="DC.title" value="es-pos-maxent-700-c0-b3.bin"/> - <entry key="DC.creator" value="Rodrigo Agerri"/> - <entry key="DC.identifier" value="http://ixa2.si.ehu.es/ixa-pipes/models/pos-resources.tgz#es-pos-maxent-700-c0-b3.bin"/> - <entry key="pos.tagset" value="ancora-ixa"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="es-tagger-perceptron-ixa" depends="download-ixa-pos-resources"> - <!-- FILE: es-pos-perceptron-c0-b3.bin - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2013-11-15 | now | 148d7ff4c56c89624e6050fe608f6d8a - --> - <install-stub-and-upstream-file - file="target/download/ixa-pos-resources/es-pos-perceptron-c0-b3.bin" - md5="148d7ff4c56c89624e6050fe608f6d8a" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.opennlp" - upstreamVersion="20131115" - metaDataVersion="1" - tool="tagger" - language="es" - variant="perceptron-ixa" - extension="bin"> - <metadata> - <entry key="DC.title" value="es-pos-perceptron-c0-b3.bin"/> - <entry key="DC.creator" value="Rodrigo Agerri"/> - <entry key="DC.identifier" value="http://ixa2.si.ehu.es/ixa-pipes/models/pos-resources.tgz#es-pos-perceptron-c0-b3.bin"/> - <entry key="pos.tagset" value="ancora-ixa"/> - </metadata> - </install-stub-and-upstream-file> - </target> - <target name="es-parser-chunking-ixa" depends="download-ixa-parser-resources"> <!-- FILE: es-parser-chunking.bin - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2014-04-26 | now | 42589e408c5ed89beaf8107662bc751c @@ -1010,7 +928,7 @@ </metadata> </install-stub-and-upstream-file> </target> - + <target name="it"> <antcall target="it-sentence-maxent"/> <antcall target="it-tagger-maxent"/> @@ -1653,32 +1571,6 @@ </install-stub-and-upstream-file> </target> - <target name="-check-download-ixa-pos-resources"> - <available - property="download-ixa-pos-resources.DONE" - file="target/download/ixa-pos-resources/DONE"/> - </target> - - <target name="download-ixa-pos-resources" depends="-check-download-ixa-pos-resources" unless="download-ixa-pos-resources.DONE"> - <mkdir dir="target/download/ixa-pos-resources"/> - <get - src="http://ixa2.si.ehu.es/ixa-pipes/models/pos-resources.tgz" - dest="target/download/ixa-pos-resources/pos-resources.tgz" - skipexisting="true"/> - <untar - src="target/download/ixa-pos-resources/pos-resources.tgz" - dest="target/download/ixa-pos-resources" - compression="gzip"> - <patternset> - <include name="**/*.bin"/> - </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> - </untar> - <touch file="target/download/ixa-pos-resources/DONE"/> - </target> - <target name="-check-download-ixa-parser-resources"> <available property="download-ixa-parser-resources.DONE" diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTest.java b/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTest.java deleted file mode 100644 index 2b0428b0bf..0000000000 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTest.java +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; - -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class OpenNlpPosTaggerTest -{ - @Test - public void simpleExample() - throws Exception - { - // NOTE: This file contains Asciidoc markers for partial inclusion of this file in the - // documentation. Do not remove these tags! - // tag::example[] - JCas jcas = JCasFactory.createText("This is a test", "en"); - - runPipeline(jcas, - createEngineDescription(OpenNlpSegmenter.class), - createEngineDescription(OpenNlpPosTagger.class)); - - for (Token t : select(jcas, Token.class)) { - System.out.printf("%s %s%n", t.getCoveredText(), t.getPos().getPosValue()); - } - // end::example[] - - assertPOS( - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN" }, - new String[] { "DT", "VBZ", "DT", "NN" }, - select(jcas, POS.class)); - } - - @Test - public void testEnglishAutoLoad() - throws Exception - { - String oldModelCache = System.setProperty(ResourceObjectProviderBase.PROP_REPO_CACHE, - "target/test-output/models"); - String oldOfflineMode = System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, - ResourceObjectProviderBase.FORCE_AUTO_LOAD); - - try { - TestRunner.autoloadModelsOnNextTestRun(); - runTest("en", null, "This is a test .", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - finally { - if (oldModelCache != null) { - System.setProperty(ResourceObjectProviderBase.PROP_REPO_CACHE, oldModelCache); - } - else { - System.getProperties().remove(ResourceObjectProviderBase.PROP_REPO_CACHE); - } - if (oldOfflineMode != null) { - System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, oldOfflineMode); - } - else { - System.getProperties().remove(ResourceObjectProviderBase.PROP_REPO_OFFLINE); - } - } - } - - @Test - public void testEnglish() - throws Exception - { - runTest("en", null, "This is a test .", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "A neural net .", - new String[] { "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "John is purchasing oranges .", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - - // This is WRONG tagging. "jumps" is tagged as "NNS" - runTest("en", "maxent", "The quick brown fox jumps over the lazy dog . \n", - new String[] { "DT", "JJ", "JJ", "NN", "NNS", "IN", "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testEnglishExtra() - throws Exception - { - runTest("en", "perceptron", "The quick brown fox jumps over the lazy dog . \n", - new String[] { "DT", "JJ", "JJ", "NN", "NNS", "IN", "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", "perceptron-ixa", "The quick brown fox jumps over the lazy dog . \n", - new String[] { "DT", "JJ", "JJ", "NN", "NNS", "IN", "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_ADP", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testGerman() - throws Exception - { - runTest("de", null, "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("de", "maxent", "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("de", "perceptron", "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testItalian() - throws Exception - { - runTest("it", null, "Questo è un test .", - new String[] { "PD", "Vip3", "RI", "Sn", "FS" }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("it", "perceptron", "Questo è un test .", - new String[] { "PD", "Vip3", "RI", "Sn", "FS" }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - - @Ignore("We don't have these models integrated yet") - @Test - public void testPortuguese() - throws Exception - { - String[] bosqueTags = new String[] { "?", "adj", "adv", "art", "conj-c", "conj-s", "ec", - "in", "n", "num", "pp", "pron-det", "pron-indp", "pron-pers", "prop", "prp", - "punc", "v-fin", "v-ger", "v-inf", "v-pcp", "vp" }; - - JCas jcas = runTest("pt", null, "Este é um teste .", - new String[] { "pron-det", "v-fin", "art", "n", "punc" }, - new String[] { "PRON", "V", "ART", "NN", "PUNC" }); - - AssertAnnotations.assertTagset(POS.class, "bosque", bosqueTags, jcas); - - jcas = runTest("pt", "maxent", "Este é um teste .", - new String[] { "pron-det", "v-fin", "art", "n", "punc" }, - new String[] { "PRON", "V", "ART", "NN", "PUNC" }); - - AssertAnnotations.assertTagset(POS.class, "bosque", bosqueTags, jcas); - - jcas = runTest("pt", "perceptron", "Este é um teste .", - new String[] { "pron-det", "v-fin", "art", "n", "punc" }, - new String[] { "PRON", "V", "ART", "NN", "PUNC" }); - - AssertAnnotations.assertTagset(POS.class, "bosque", bosqueTags, jcas); - - jcas = runTest("pt", "mm-maxent", "Este é um teste .", - new String[] { "PROSUB", "V", "ART", "N", "." }, - new String[] { "POS", "POS", "POS", "POS", "POS" }); - - // AssertAnnotations.assertTagset(POS.class, "bosque", bosqueTags, jcas); - - jcas = runTest("pt", "mm-perceptron", "Este é um teste .", - new String[] { "PROSUB", "V", "ART", "N", "." }, - new String[] { "POS", "POS", "POS", "POS", "POS" }); - - // AssertAnnotations.assertTagset(POS.class, "bosque", bosqueTags, jcas); - - jcas = runTest("pt", "cogroo", "Este é um teste .", - new String[] { "pron-det", "v-fin", "artm", "nm", "." }, - new String[] { "POS", "POS", "POS", "POS", "POS" }); - - AssertAnnotations.assertTagset(POS.class, "bosque", bosqueTags, jcas); - } - - @Test - public void testSpanish() - throws Exception - { - runTest("es", "maxent", "Esta es una prueba .", - new String[] { "PD", "VSI", "DI", "NC", "Fp" }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("es", "maxent-ixa", "Esta es una prueba .", - new String[] { "PD0FS000", "VSIP3S0", "DI0FS0", "NCFS000", "Fp"}, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("es", "perceptron-ixa", "Esta es una prueba .", - new String[] { "PD0FS000", "VSIP3S0", "DI0FS0", "NCFS000", "Fp"}, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testSwedish() - throws Exception - { - runTest("sv", "maxent", "Detta är ett test .", - new String[] { "PO", "AV", "EN", "NN", "IP" }, - new String[] { "POS", "POS", "POS", "POS", "POS" }); - } - - private JCas runTest(String language, String variant, String testDocument, String[] tags, - String[] tagClasses) - throws Exception - { - AssumeResource.assumeResource(OpenNlpPosTagger.class, "tagger", language, variant); - - AnalysisEngine engine = createEngine(OpenNlpPosTagger.class, - OpenNlpPosTagger.PARAM_VARIANT, variant, - OpenNlpPosTagger.PARAM_PRINT_TAGSET, true); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); - - return jcas; - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpChunkerTest.java similarity index 93% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpChunkerTest.java index 73e0fc0796..e3e90c0bb1 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpChunkerTest.java @@ -15,16 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*; +import static org.dkpro.core.testing.AssertAnnotations.assertChunks; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpChunker; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.opennlp.OpenNlpSegmenter; +import org.dkpro.core.testing.AssumeResource; import org.junit.Before; import org.junit.Ignore; import org.junit.Rule; @@ -32,7 +38,6 @@ import org.junit.rules.TestName; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; public class OpenNlpChunkerTest { diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTrainerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpChunkerTrainerTest.java similarity index 87% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTrainerTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpChunkerTrainerTest.java index 87f192e72f..c62b639767 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpChunkerTrainerTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpChunkerTrainerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -30,19 +30,21 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.ConfigurationParameterFactory; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.eval.EvalUtil; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; +import org.dkpro.core.io.conll.Conll2000Reader; +import org.dkpro.core.opennlp.OpenNlpChunker; +import org.dkpro.core.opennlp.OpenNlpChunkerTrainer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Before; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; -import de.tudarmstadt.ukp.dkpro.core.eval.EvalUtil; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2000Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class OpenNlpChunkerTrainerTest { @@ -88,9 +90,7 @@ OpenNlpChunkerTrainer.PARAM_TARGET_LOCATION, new File(targetFolder, "model.bin") OpenNlpChunker.PARAM_MODEL_LOCATION, new File(targetFolder, "model.bin")); List<Span<String>> actual = EvalUtil.loadSamples(iteratePipeline(testReader, ner), - Chunk.class, chunk -> { - return chunk.getChunkValue(); - }); + Chunk.class, chunk -> chunk.getChunkValue()); System.out.printf("Actual samples: %d%n", actual.size()); // Read reference data collect labels diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizerTrainerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpLemmatizerTrainerTest.java similarity index 86% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizerTrainerTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpLemmatizerTrainerTest.java index a48d965371..2a0124502b 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpLemmatizerTrainerTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpLemmatizerTrainerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -29,19 +29,21 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.ConfigurationParameterFactory; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.eval.EvalUtil; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; +import org.dkpro.core.io.conll.Conll2006Reader; +import org.dkpro.core.io.conll.Conll2006Writer; +import org.dkpro.core.opennlp.OpenNlpLemmatizer; +import org.dkpro.core.opennlp.OpenNlpLemmatizerTrainer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.eval.EvalUtil; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Reader; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Writer; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class OpenNlpLemmatizerTrainerTest { @@ -101,10 +103,9 @@ OpenNlpLemmatizerTrainer.PARAM_TARGET_LOCATION, new File(targetFolder, "model.bi Conll2006Writer.PARAM_SINGULAR_TARGET, true, Conll2006Writer.PARAM_TARGET_LOCATION, new File(targetFolder, "out.conll")); - List<Span<String>> actual = EvalUtil.loadSamples(iteratePipeline(testReader, lemmatizer, testWriter), - Lemma.class, lemma -> { - return lemma.getValue(); - }); + List<Span<String>> actual = EvalUtil.loadSamples( + iteratePipeline(testReader, lemmatizer, testWriter), Lemma.class, lemma -> + lemma.getValue()); System.out.printf("Actual samples: %d%n", actual.size()); // Read reference data collect labels diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTest.java similarity index 92% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTest.java index 8cb10bc78f..4a6da8fda4 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTest.java @@ -15,21 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.*; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpNamedEntityRecognizer; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; // NOTE: This file contains Asciidoc markers for partial inclusion of this file in the documentation // Do not remove these tags! @@ -52,7 +54,7 @@ public void testEnglish() } // end::test[] - @Test(expected=AnalysisEngineProcessException.class) + @Test(expected = AnalysisEngineProcessException.class) public void testExceptionWithWrongMappingFileLocation() throws Exception { diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainerTest.java similarity index 84% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainerTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainerTest.java index 3f3c3d1277..a5b3de71c9 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainerTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpNamedEntityRecognizerTrainerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -30,19 +30,22 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.ConfigurationParameterFactory; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.eval.EvalUtil; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; +import org.dkpro.core.io.conll.Conll2002Reader; +import org.dkpro.core.io.conll.Conll2002Reader.ColumnSeparators; +import org.dkpro.core.opennlp.OpenNlpNamedEntityRecognizer; +import org.dkpro.core.opennlp.OpenNlpNamedEntityRecognizerTrainer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Before; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.eval.EvalUtil; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2002Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; import opennlp.tools.ml.maxent.GISTrainer; public class OpenNlpNamedEntityRecognizerTrainerTest @@ -63,7 +66,7 @@ public void test() Conll2002Reader.class, Conll2002Reader.PARAM_PATTERNS, split.getTrainingFiles(), Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(), - Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(), + Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, Conll2002Reader.PARAM_HAS_HEADER, true, Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true); @@ -85,7 +88,7 @@ public void test() Conll2002Reader.class, Conll2002Reader.PARAM_PATTERNS, split.getTestFiles(), Conll2002Reader.PARAM_LANGUAGE, "de", - Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(), + Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, Conll2002Reader.PARAM_HAS_HEADER, true, Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true, @@ -97,16 +100,14 @@ public void test() OpenNlpNamedEntityRecognizer.PARAM_MODEL_LOCATION, model); List<Span<String>> actual = EvalUtil.loadSamples(iteratePipeline(testReader, ner), - NamedEntity.class, ne -> { - return ne.getValue(); - }); + NamedEntity.class, ne -> ne.getValue()); System.out.printf("Actual samples: %d%n", actual.size()); // Read reference data collect labels ConfigurationParameterFactory.setParameter(testReader, Conll2002Reader.PARAM_READ_NAMED_ENTITY, true); - List<Span<String>> expected = EvalUtil.loadSamples(testReader, NamedEntity.class, - ne -> { return ne.getValue(); }); + List<Span<String>> expected = EvalUtil.loadSamples(testReader, NamedEntity.class, ne -> + ne.getValue()); System.out.printf("Expected samples: %d%n", expected.size()); Result results = EvalUtil.dumpResults(targetFolder, expected, actual); diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpParserTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpParserTest.java similarity index 82% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpParserTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpParserTest.java index bf3868772a..7687a29320 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpParserTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpParserTest.java @@ -15,32 +15,38 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import static org.dkpro.core.testing.AssertAnnotations.assertConstituents; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertPennTree; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpParser; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class OpenNlpParserTest { - @Test - public void testEnglish() - throws Exception - { - JCas jcas = runTest("en", "chunking", "We need a very complicated example sentence , " - + "which contains as many constituents and dependencies as possible ."); + @Test + public void testEnglish() + throws Exception + { + JCas jcas = runTest("en", "chunking", "We need a very complicated example sentence , " + + "which contains as many constituents and dependencies as possible ."); String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", @@ -50,8 +56,9 @@ public void testEnglish() "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; @@ -75,14 +82,14 @@ public void testEnglish() String[] unmappedConst = { "ADV", "AUX", "EDITED", "NEG", "O", "TOP", "TYPO", "UH" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", unmappedConst, jcas); - } + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); + assertTagset(POS.class, "ptb", posTags, jcas); + assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); + assertTagset(Constituent.class, "ptb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ptb", unmappedConst, jcas); + } @Test public void testEnglishIxa() @@ -125,13 +132,13 @@ public void testEnglishIxa() String[] unmappedConst = { "ADV", "AUX", "EDITED", "NEG", "O", "TOP", "TYPO", "UH" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", unmappedConst, jcas); + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); + assertTagset(POS.class, "ptb", posTags, jcas); + assertTagsetMapping(POS.class, "ptb", unmappedPos, jcas); + assertTagset(Constituent.class, "ptb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ptb", unmappedConst, jcas); } @Test @@ -238,32 +245,32 @@ public void testSpanishIxa() String[] unmappedConst = { "O", "TOP" }; - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); - AssertAnnotations.assertTagset(POS.class, "ancora-ixa", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ancora-ixa", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ancora", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ancora", unmappedConst, jcas); + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, select(jcas, Constituent.class)); + assertTagset(POS.class, "ancora-ixa", posTags, jcas); + assertTagsetMapping(POS.class, "ancora-ixa", unmappedPos, jcas); + assertTagset(Constituent.class, "ancora", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ancora", unmappedConst, jcas); } /** * Setup CAS to test parser for the English language (is only called once if an English test is * run) */ - private JCas runTest(String aLanguage, String aVariant, String aDocument) - throws Exception - { + private JCas runTest(String aLanguage, String aVariant, String aDocument) + throws Exception + { AssumeResource.assumeResource(OpenNlpParser.class, "parser", aLanguage, aVariant); - - AnalysisEngineDescription parser = createEngineDescription(OpenNlpParser.class, - OpenNlpParser.PARAM_VARIANT, aVariant, - OpenNlpParser.PARAM_PRINT_TAGSET, true, - OpenNlpParser.PARAM_WRITE_POS, true, - OpenNlpParser.PARAM_WRITE_PENN_TREE, true); - - return TestRunner.runTest(parser, aLanguage, aDocument); - } + + AnalysisEngineDescription parser = createEngineDescription(OpenNlpParser.class, + OpenNlpParser.PARAM_VARIANT, aVariant, + OpenNlpParser.PARAM_PRINT_TAGSET, true, + OpenNlpParser.PARAM_WRITE_POS, true, + OpenNlpParser.PARAM_WRITE_PENN_TREE, true); + + return TestRunner.runTest(parser, aLanguage, aDocument); + } @Rule public DkproTestContext testContext = new DkproTestContext(); diff --git a/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerBulkTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerBulkTest.java new file mode 100644 index 0000000000..3ffaba3d2b --- /dev/null +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerBulkTest.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssumeResource.assumeResource; + +import java.util.Collection; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; + +@RunWith(Parameterized.class) +public class OpenNlpPosTaggerBulkTest +{ + private static final String NO_TAGSET_CHECK = null; + + private static final String[] TAGSET_BOSQUE = { "?", "adj", "adv", "art", "conj-c", "conj-s", + "ec", "in", "n", "num", "pp", "pron-det", "pron-indp", "pron-pers", "prop", "prp", + "punc", "v-fin", "v-ger", "v-inf", "v-pcp", "vp" }; + + private static final Object[][] DATA = { + { "en", null, NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "This is a test .", + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "en", null, NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "A neural net .", + new String[] { "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" } }, + { "en", null, NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "John is purchasing oranges .", + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" } }, + // This is WRONG tagging. "jumps" is tagged as "NNS" + { "en", "maxent", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "The quick brown fox jumps over the lazy dog .", + new String[] { "DT", "JJ", "JJ", "NN", "NNS", "IN", "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_NOUN", + "POS_ADP", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" } }, + { "en", "perceptron", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "The quick brown fox jumps over the lazy dog .", + new String[] { "DT", "JJ", "JJ", "NN", "NNS", "IN", "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_ADJ", "POS_NOUN", "POS_NOUN", + "POS_ADP", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" } }, + { "de", null, NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Das ist ein Test .", + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "de", "maxent", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Das ist ein Test .", + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "de", "perceptron", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Das ist ein Test .", + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "it", null, NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Questo è un test .", + new String[] { "PD", "Vip3", "RI", "Sn", "FS" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "it", "perceptron", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Questo è un test .", + new String[] { "PD", "Vip3", "RI", "Sn", "FS" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "es", "maxent", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Esta es una prueba .", + new String[] { "PD", "VSI", "DI", "NC", "Fp" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "es", "maxent-ixa", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Esta es una prueba .", + new String[] { "PD0FS000", "VSIP3S0", "DI0FS0", "NCFS000", "Fp" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "es", "perceptron-ixa", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Esta es una prueba .", + new String[] { "PD0FS000", "VSIP3S0", "DI0FS0", "NCFS000", "Fp" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "sv", "maxent", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Detta är ett test .", + new String[] { "PO", "AV", "EN", "NN", "IP" }, + new String[] { "POS", "POS", "POS", "POS", "POS" } }, + { "pt", null, "bosque", TAGSET_BOSQUE, + "Este é um teste .", + new String[] { "pron-det", "v-fin", "art", "n", "punc" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "pt", "maxent", "bosque", TAGSET_BOSQUE, + "Este é um teste .", + new String[] { "pron-det", "v-fin", "art", "n", "punc" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "pt", "perceptron", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Este é um teste .", + new String[] { "pron-det", "v-fin", "art", "n", "punc" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" } }, + { "pt", "mm-maxent", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Este é um teste .", + new String[] { "PROSUB", "V", "ART", "N", "." }, + new String[] { "POS", "POS", "POS", "POS", "POS" } }, + { "pt", "mm-perceptron", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Este é um teste .", + new String[] { "PROSUB", "V", "ART", "N", "." }, + new String[] { "POS", "POS", "POS", "POS", "POS" } }, + { "pt", "cogroo", NO_TAGSET_CHECK, NO_TAGSET_CHECK, + "Este é um teste .", + new String[] { "pron-det", "v-fin", "artm", "nm", "." }, + new String[] { "POS", "POS", "POS", "POS", "POS" } } }; + + @Parameters + public static Collection<Object[]> data() { + return asList(DATA); + } + + private final String language; + private final String variant; + private final String tagset; + private final String[] tags; + private final String text; + private final String[] originalPos; + private final String[] mappedPos; + + public OpenNlpPosTaggerBulkTest(String aLanguage, String aVariant, String aTagset, + String[] aTags, String aText, String[] aOriginalPos, String[] aMappedPos) + { + language = aLanguage; + variant = aVariant; + tagset = aTagset; + tags = aTags; + text = aText; + originalPos = aOriginalPos; + mappedPos = aMappedPos; + + if ((tags == null && tagset != null) || (tags != null && tagset == null)) { + throw new IllegalArgumentException( + "Tags and tagset must both be specified or both be null"); + } + } + + @Test + public void test() + throws Exception + { + assumeResource(OpenNlpPosTagger.class, "tagger", language, variant); + + AnalysisEngine engine = createEngine( + OpenNlpPosTagger.class, + OpenNlpPosTagger.PARAM_VARIANT, variant, + OpenNlpPosTagger.PARAM_PRINT_TAGSET, true); + + JCas jcas = TestRunner.runTest(engine, language, text); + + assertPOS(mappedPos, originalPos, select(jcas, POS.class)); + + if (tagset != null) { + assertTagset(POS.class, tagset, tags, jcas); + } + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTest.java new file mode 100644 index 0000000000..4bd545c81f --- /dev/null +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTest.java @@ -0,0 +1,162 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; + +import java.io.File; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.resources.ResourceObjectProviderBase; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class OpenNlpPosTaggerTest +{ + @Test + public void simpleExample() + throws Exception + { + // NOTE: This file contains Asciidoc markers for partial inclusion of this file in the + // documentation. Do not remove these tags! + // tag::example[] + JCas jcas = JCasFactory.createText("This is a test", "en"); + + runPipeline(jcas, + createEngineDescription(OpenNlpSegmenter.class), + createEngineDescription(OpenNlpPosTagger.class)); + + for (Token t : select(jcas, Token.class)) { + System.out.printf("%s %s%n", t.getCoveredText(), t.getPos().getPosValue()); + } + // end::example[] + + assertPOS( + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN" }, + new String[] { "DT", "VBZ", "DT", "NN" }, + select(jcas, POS.class)); + } + + @Test + public void testEnglishAutoLoad() + throws Exception + { + File testOutput = testContext.getTestOutputFolder(); + + String oldModelCache = System.setProperty(ResourceObjectProviderBase.PROP_REPO_CACHE, + new File(testOutput, "models").getPath()); + String oldOfflineMode = System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, + ResourceObjectProviderBase.FORCE_AUTO_LOAD); + + try { + TestRunner.autoloadModelsOnNextTestRun(); + runTest("en", null, "This is a test .", + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + } + finally { + if (oldModelCache != null) { + System.setProperty(ResourceObjectProviderBase.PROP_REPO_CACHE, oldModelCache); + } + else { + System.getProperties().remove(ResourceObjectProviderBase.PROP_REPO_CACHE); + } + if (oldOfflineMode != null) { + System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, oldOfflineMode); + } + else { + System.getProperties().remove(ResourceObjectProviderBase.PROP_REPO_OFFLINE); + } + } + } + + @Test + public void testEnglishManualURI() + throws Exception + { + File testOutput = testContext.getTestOutputFolder(); + + String oldModelCache = System.setProperty(ResourceObjectProviderBase.PROP_REPO_CACHE, + new File(testOutput, "models").getPath()); + String oldOfflineMode = System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, + ResourceObjectProviderBase.FORCE_AUTO_LOAD); + + try { + TestRunner.autoloadModelsOnNextTestRun(); + + String[] tagClasses = { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }; + String[] tags = { "DT", "VBZ", "DT", "NN", "." }; + + AnalysisEngine engine = createEngine(OpenNlpPosTagger.class, + OpenNlpPosTagger.PARAM_MODEL_ARTIFACT_URI, "mvn:de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent:20120616.1", + OpenNlpPosTagger.PARAM_VARIANT, "maxent", + OpenNlpPosTagger.PARAM_PRINT_TAGSET, true); + + JCas jcas = TestRunner.runTest(engine, "en", "This is a test ."); + + AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); + } + finally { + if (oldModelCache != null) { + System.setProperty(ResourceObjectProviderBase.PROP_REPO_CACHE, oldModelCache); + } + else { + System.getProperties().remove(ResourceObjectProviderBase.PROP_REPO_CACHE); + } + if (oldOfflineMode != null) { + System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, oldOfflineMode); + } + else { + System.getProperties().remove(ResourceObjectProviderBase.PROP_REPO_OFFLINE); + } + } + } + + private JCas runTest(String language, String variant, String testDocument, String[] tags, + String[] tagClasses) + throws Exception + { + AssumeResource.assumeResource(OpenNlpPosTagger.class, "tagger", language, variant); + + AnalysisEngine engine = createEngine(OpenNlpPosTagger.class, + OpenNlpPosTagger.PARAM_VARIANT, variant, + OpenNlpPosTagger.PARAM_PRINT_TAGSET, true); + + JCas jcas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); + + return jcas; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTrainerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTrainerTest.java similarity index 87% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTrainerTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTrainerTest.java index 8d949e0110..2f36c9705e 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpPosTaggerTrainerTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpPosTaggerTrainerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -29,18 +29,20 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.ConfigurationParameterFactory; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.eval.EvalUtil; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; +import org.dkpro.core.io.conll.Conll2006Reader; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.opennlp.OpenNlpPosTaggerTrainer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.eval.EvalUtil; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class OpenNlpPosTaggerTrainerTest { @@ -91,9 +93,7 @@ OpenNlpPosTaggerTrainer.PARAM_TARGET_LOCATION, new File(targetFolder, "model.bin OpenNlpPosTagger.PARAM_MODEL_LOCATION, new File(targetFolder, "model.bin")); List<Span<String>> actual = EvalUtil.loadSamples(iteratePipeline(testReader, postagger), - POS.class, pos -> { - return pos.getPosValue(); - }); + POS.class, pos -> pos.getPosValue()); System.out.printf("Actual samples: %d%n", actual.size()); // Read reference data collect labels diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSegmenterTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSegmenterTest.java similarity index 90% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSegmenterTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSegmenterTest.java index 88f11b5ba8..e747297f68 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSegmenterTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSegmenterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; @@ -24,15 +24,17 @@ import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpSegmenter; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; + import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; public class OpenNlpSegmenterTest { @@ -67,10 +69,8 @@ public void runHarness() AnalysisEngineDescription aed = createEngineDescription(OpenNlpSegmenter.class); SegmenterHarness.run(aed, (language, variant) -> { - AssumeResource.assumeResource(OpenNlpSegmenter.class, "sentence", language, - "maxent"); - }, - "de.1", "en.7", "en.9", "ar.1", "zh.1", "zh.2"); + AssumeResource.assumeResource(OpenNlpSegmenter.class, "sentence", language, "maxent"); + }, "de.1", "en.7", "en.9", "ar.1", "zh.1", "zh.2"); } private JCas runTest(String aLanguage, String aVariant, String aDocument, String[] sentences, diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSentenceTrainerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSentenceTrainerTest.java similarity index 87% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSentenceTrainerTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSentenceTrainerTest.java index c6d6e48b7c..9335f9538d 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpSentenceTrainerTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSentenceTrainerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -33,19 +33,22 @@ import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.eval.EvalUtil; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; +import org.dkpro.core.io.conll.Conll2002Reader; +import org.dkpro.core.io.conll.Conll2002Reader.ColumnSeparators; +import org.dkpro.core.opennlp.OpenNlpSegmenter; +import org.dkpro.core.opennlp.OpenNlpSentenceTrainer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Before; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.eval.EvalUtil; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2002Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class OpenNlpSentenceTrainerTest { @@ -65,7 +68,7 @@ public void test() Conll2002Reader.class, Conll2002Reader.PARAM_PATTERNS, split.getTrainingFiles(), Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(), - Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(), + Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, Conll2002Reader.PARAM_HAS_HEADER, true, Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true); @@ -87,7 +90,7 @@ OpenNlpSentenceTrainer.PARAM_TARGET_LOCATION, new File(targetFolder, "model.bin" Conll2002Reader.class, Conll2002Reader.PARAM_PATTERNS, split.getTestFiles(), Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(), - Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(), + Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, Conll2002Reader.PARAM_HAS_HEADER, true, Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true); diff --git a/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSnowballStemmerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSnowballStemmerTest.java new file mode 100644 index 0000000000..7c992ccf78 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpSnowballStemmerTest.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.opennlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; + +public class OpenNlpSnowballStemmerTest +{ + @Test + public void testGerman() + throws Exception + { + runTest("de", "Automobile Fenster", + new String[] {"Automobil", "Fenst"} ); + } + + @Test + public void testEnglish() + throws Exception + { + runTest("en", "computers Computers deliberately", + new String[] {"comput", "Comput", "deliber"} ); + + runTest("en", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible .", + new String[] { "We", "need", "a", "veri", "complic", "exampl", "sentenc", ",", + "which", "contain", "as", "mani", "constitu", "and", "depend", "as", + "possibl", "." }); + } + + @Test + public void testEnglishCaseInsensitive() + throws Exception + { + runTest("en", "EDUCATIONAL Educational educational", + new String[] {"educ", "educ", "educ"}, + OpenNlpSnowballStemmer.PARAM_LOWER_CASE, true); + } + + @Test + public void testEnglishCaseSensitive() + throws Exception + { + runTest("en", "EDUCATIONAL Educational educational", + new String[] {"EDUCATIONAL", "Educat", "educ"}, + OpenNlpSnowballStemmer.PARAM_LOWER_CASE, false); + } + + @Test + public void testEnglishCaseFiltered() + throws Exception + { + String[] stems = { "educ" }; + String[] pos = { "NNS", "JJ", "NN", "NNS" }; + + AnalysisEngineDescription aggregate = createEngineDescription( + createEngineDescription(OpenNlpPosTagger.class), + createEngineDescription(OpenNlpSnowballStemmer.class, + OpenNlpSnowballStemmer.PARAM_LOWER_CASE, true, + OpenNlpSnowballStemmer.PARAM_FILTER_FEATUREPATH, "pos/PosValue", + OpenNlpSnowballStemmer.PARAM_FILTER_CONDITION_OPERATOR, "EQUALS", + OpenNlpSnowballStemmer.PARAM_FILTER_CONDITION_VALUE, "JJ")); + + JCas result = TestRunner.runTest(aggregate, "en", "Babies educational sleep .s"); + + AssertAnnotations.assertStem(stems, select(result, Stem.class)); + AssertAnnotations.assertPOS(null, pos, select(result, POS.class)); + } + + private JCas runTest(String aLanguage, String aText, String[] aStems, Object... aParams) + throws Exception + { + JCas result = TestRunner.runTest( + createEngineDescription(OpenNlpSnowballStemmer.class, aParams), aLanguage, aText); + + AssertAnnotations.assertStem(aStems, select(result, Stem.class)); + + return result; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpTokenTrainerTest.java b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpTokenTrainerTest.java similarity index 90% rename from dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpTokenTrainerTest.java rename to dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpTokenTrainerTest.java index 0a4b3dfd5a..ebe517fd28 100644 --- a/dkpro-core-opennlp-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/opennlp/OpenNlpTokenTrainerTest.java +++ b/dkpro-core-opennlp-asl/src/test/java/org/dkpro/core/opennlp/OpenNlpTokenTrainerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.opennlp; +package org.dkpro.core.opennlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -33,19 +33,21 @@ import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.eval.EvalUtil; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; +import org.dkpro.core.io.conll.ConllUReader; +import org.dkpro.core.opennlp.OpenNlpSegmenter; +import org.dkpro.core.opennlp.OpenNlpTokenTrainer; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Before; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.eval.EvalUtil; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; -import de.tudarmstadt.ukp.dkpro.core.io.conll.ConllUReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class OpenNlpTokenTrainerTest { diff --git a/dkpro-core-opennlp-asl/src/test/resources/log4j.properties b/dkpro-core-opennlp-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-opennlp-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-opennlp-asl/src/test/resources/log4j2.xml b/dkpro-core-opennlp-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-opennlp-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-performance-asl/pom.xml b/dkpro-core-performance-asl/pom.xml index a08bc9a402..1675e522c5 100644 --- a/dkpro-core-performance-asl/pom.xml +++ b/dkpro-core-performance-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.performance-asl</artifactId> + <artifactId>dkpro-core-performance-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Performance Measurement Support</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -44,8 +45,12 @@ <artifactId>commons-math3</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -53,18 +58,18 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.tei-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-tei-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.io-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-io-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -73,17 +78,17 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.tokit-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-tokit-asl</artifactId> <scope>test</scope> </dependency> </dependencies> <dependencyManagement> <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <type>pom</type> <scope>import</scope> </dependency> diff --git a/dkpro-core-performance-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/performance/Stopwatch.java b/dkpro-core-performance-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/performance/Stopwatch.java deleted file mode 100644 index 95e0e1139b..0000000000 --- a/dkpro-core-performance-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/performance/Stopwatch.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.performance; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Formatter; -import java.util.List; -import java.util.Locale; -import java.util.Properties; - -import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.performance.type.TimerAnnotation; - -/** - * Can be used to measure how long the processing between two points in a pipeline takes. - * For that purpose, the AE needs to be added two times, before and after the part of the pipeline that should be measured. - */ -@ResourceMetaData(name="Stopwatch") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.type.TimerAnnotation"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.type.TimerAnnotation"}) - -public class Stopwatch - extends JCasAnnotator_ImplBase -{ - - private Boolean isDownstreamTimer; - private JCas jcas;; - - public static final String KEY_MEAN = "mean"; - public static final String KEY_SUM = "sum"; - public static final String KEY_STDDEV = "stddev"; - - public static final String PARAM_TIMER_NAME = "timerName"; - /** - * Name of the timer pair. - * Upstream and downstream timer need to use the same name. - */ - @ConfigurationParameter(name = PARAM_TIMER_NAME, mandatory = true) - private String timerName; - - public static final String PARAM_OUTPUT_FILE = "timerOutputFile"; - /** - * Name of the timer pair. - * Upstream and downstream timer need to use the same name. - */ - @ConfigurationParameter(name = PARAM_OUTPUT_FILE, mandatory = false) - private File outputFile; - - private List<Long> times; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - times = new ArrayList<Long>(); - - isDownstreamTimer = null; - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - this.jcas = jcas; - - long currentTime = System.currentTimeMillis(); - - if (isDownstreamTimer()) { - TimerAnnotation timerAnno = JCasUtil.selectSingle(jcas, TimerAnnotation.class); - timerAnno.setEndTime(currentTime); - - long startTime = timerAnno.getStartTime(); - - times.add(currentTime - startTime); - } - else { - TimerAnnotation timerAnno = new TimerAnnotation(jcas); - timerAnno.setName(timerName); - timerAnno.setStartTime(currentTime); - timerAnno.addToIndexes(); - } - } - - @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException - { - super.collectionProcessComplete(); - - if (isDownstreamTimer()) { - getLogger().info("Results from Timer '" + timerName + "' after processing all documents."); - - - DescriptiveStatistics statTimes = new DescriptiveStatistics(); - for (Long timeValue : times) { - statTimes.addValue((double) timeValue / 1000); - } - double sum = statTimes.getSum(); - double mean = statTimes.getMean(); - double stddev = statTimes.getStandardDeviation(); - - StringBuilder sb = new StringBuilder(); - sb.append("Estimate after processing " + times.size() + " documents."); - sb.append("\n"); - - Formatter formatter = new Formatter(sb, Locale.US); - - formatter.format("Aggregated time: %,.1fs\n", sum); - formatter.format("Time / Document: %,.3fs (%,.3fs)\n", mean, stddev); - - formatter.close(); - - getLogger().info(sb.toString()); - - if (outputFile != null) { - try { - Properties props = new Properties(); - props.setProperty(KEY_SUM, ""+sum); - props.setProperty(KEY_MEAN, ""+mean); - props.setProperty(KEY_STDDEV, ""+stddev); - OutputStream out = new FileOutputStream(outputFile); - props.store(out, "timer " + timerName + " result file"); - } catch (FileNotFoundException e) { - throw new AnalysisEngineProcessException(e); - } catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - } - } - - private boolean isDownstreamTimer() { - - if (isDownstreamTimer == null) { - // this is only a downstream timer if there already is a timer annotation with the same name - for (TimerAnnotation timer : JCasUtil.select(jcas, TimerAnnotation.class)) { - if (timer.getName().equals(timerName)) { - isDownstreamTimer = true; - } - } - } - - if (isDownstreamTimer == null) { - isDownstreamTimer = false; - } - - return isDownstreamTimer; - } -} diff --git a/dkpro-core-performance-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/performance/PerformanceTestUtil.java b/dkpro-core-performance-asl/src/main/java/org/dkpro/core/performance/PerformanceTestUtil.java similarity index 99% rename from dkpro-core-performance-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/performance/PerformanceTestUtil.java rename to dkpro-core-performance-asl/src/main/java/org/dkpro/core/performance/PerformanceTestUtil.java index 78a301b5ca..5aa3979adc 100644 --- a/dkpro-core-performance-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/performance/PerformanceTestUtil.java +++ b/dkpro-core-performance-asl/src/main/java/org/dkpro/core/performance/PerformanceTestUtil.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.performance; +package org.dkpro.core.performance; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; @@ -40,6 +40,7 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceConfigurationException; import org.apache.uima.resource.ResourceInitializationException; + import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; diff --git a/dkpro-core-performance-asl/src/main/java/org/dkpro/core/performance/Stopwatch.java b/dkpro-core-performance-asl/src/main/java/org/dkpro/core/performance/Stopwatch.java new file mode 100644 index 0000000000..038469941e --- /dev/null +++ b/dkpro-core-performance-asl/src/main/java/org/dkpro/core/performance/Stopwatch.java @@ -0,0 +1,187 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.performance; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Formatter; +import java.util.List; +import java.util.Locale; +import java.util.Properties; + +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; + +import de.tudarmstadt.ukp.dkpro.core.performance.type.TimerAnnotation; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Can be used to measure how long the processing between two points in a pipeline takes. For that + * purpose, the AE needs to be added two times, before and after the part of the pipeline that + * should be measured. + */ +@ResourceMetaData(name = "Stopwatch") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.type.TimerAnnotation"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.type.TimerAnnotation"}) + +public class Stopwatch + extends JCasAnnotator_ImplBase +{ + private Boolean isDownstreamTimer; + private JCas jcas; + + public static final String KEY_MEAN = "mean"; + public static final String KEY_SUM = "sum"; + public static final String KEY_STDDEV = "stddev"; + + public static final String PARAM_TIMER_NAME = "timerName"; + /** + * Name of the timer pair. + * Upstream and downstream timer need to use the same name. + */ + @ConfigurationParameter(name = PARAM_TIMER_NAME, mandatory = true) + private String timerName; + + public static final String PARAM_OUTPUT_FILE = "timerOutputFile"; + /** + * Name of the timer pair. + * Upstream and downstream timer need to use the same name. + */ + @ConfigurationParameter(name = PARAM_OUTPUT_FILE, mandatory = false) + private File outputFile; + + private List<Long> times; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + times = new ArrayList<Long>(); + + isDownstreamTimer = null; + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + this.jcas = jcas; + + long currentTime = System.currentTimeMillis(); + + if (isDownstreamTimer()) { + TimerAnnotation timerAnno = JCasUtil.selectSingle(jcas, TimerAnnotation.class); + timerAnno.setEndTime(currentTime); + + long startTime = timerAnno.getStartTime(); + + times.add(currentTime - startTime); + } + else { + TimerAnnotation timerAnno = new TimerAnnotation(jcas); + timerAnno.setName(timerName); + timerAnno.setStartTime(currentTime); + timerAnno.addToIndexes(); + } + } + + @Override + public void collectionProcessComplete() + throws AnalysisEngineProcessException + { + super.collectionProcessComplete(); + + if (isDownstreamTimer()) { + getLogger().info("Results from Timer '" + timerName + "' after processing all documents."); + + + DescriptiveStatistics statTimes = new DescriptiveStatistics(); + for (Long timeValue : times) { + statTimes.addValue((double) timeValue / 1000); + } + double sum = statTimes.getSum(); + double mean = statTimes.getMean(); + double stddev = statTimes.getStandardDeviation(); + + StringBuilder sb = new StringBuilder(); + sb.append("Estimate after processing " + times.size() + " documents."); + sb.append("\n"); + + Formatter formatter = new Formatter(sb, Locale.US); + + formatter.format("Aggregated time: %,.1fs\n", sum); + formatter.format("Time / Document: %,.3fs (%,.3fs)\n", mean, stddev); + + formatter.close(); + + getLogger().info(sb.toString()); + + if (outputFile != null) { + try { + Properties props = new Properties(); + props.setProperty(KEY_SUM, "" + sum); + props.setProperty(KEY_MEAN, "" + mean); + props.setProperty(KEY_STDDEV, "" + stddev); + OutputStream out = new FileOutputStream(outputFile); + props.store(out, "timer " + timerName + " result file"); + } catch (FileNotFoundException e) { + throw new AnalysisEngineProcessException(e); + } catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + } + } + + private boolean isDownstreamTimer() { + + if (isDownstreamTimer == null) { + // this is only a downstream timer if there already is a timer annotation with the same + // name + for (TimerAnnotation timer : JCasUtil.select(jcas, TimerAnnotation.class)) { + if (timer.getName().equals(timerName)) { + isDownstreamTimer = true; + } + } + } + + if (isDownstreamTimer == null) { + isDownstreamTimer = false; + } + + return isDownstreamTimer; + } +} diff --git a/dkpro-core-performance-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/performance/OpenNlpPosTaggerTest.java b/dkpro-core-performance-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/performance/OpenNlpPosTaggerTest.java deleted file mode 100644 index a8ebf7b87d..0000000000 --- a/dkpro-core-performance-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/performance/OpenNlpPosTaggerTest.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.performance; - -import static de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.INCLUDE_PREFIX; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; - -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -/** - * - */ -public class OpenNlpPosTaggerTest -{ - @Ignore - @Test - public void performanceTest() - throws Exception - { - SimplePipeline.runPipeline( - createReader( - TeiReader.class, - TeiReader.PARAM_LANGUAGE, "en", - TeiReader.PARAM_SOURCE_LOCATION, "src/test/resources/corpus/", - TeiReader.PARAM_PATTERNS, new String[] {INCLUDE_PREFIX + "*.xml"} - ), - createEngineDescription( - createEngineDescription( - Stopwatch.class, - Stopwatch.PARAM_TIMER_NAME, "testTimer" - ), - createEngineDescription( - BreakIteratorSegmenter.class), - createEngineDescription( - OpenNlpPosTagger.class), - createEngineDescription( - Stopwatch.class, - Stopwatch.PARAM_TIMER_NAME, "testTimer", - Stopwatch.PARAM_OUTPUT_FILE, "target/result.txt" - ) - ) - ); - } -} diff --git a/dkpro-core-performance-asl/src/test/java/org/dkpro/core/performance/OpenNlpPosTaggerTest.java b/dkpro-core-performance-asl/src/test/java/org/dkpro/core/performance/OpenNlpPosTaggerTest.java new file mode 100644 index 0000000000..c9cc17a17f --- /dev/null +++ b/dkpro-core-performance-asl/src/test/java/org/dkpro/core/performance/OpenNlpPosTaggerTest.java @@ -0,0 +1,59 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.performance; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.dkpro.core.api.io.ResourceCollectionReaderBase.INCLUDE_PREFIX; + +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.tei.TeiReader; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.performance.Stopwatch; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Ignore; +import org.junit.Test; + +public class OpenNlpPosTaggerTest +{ + @Ignore + @Test + public void performanceTest() throws Exception + { + SimplePipeline.runPipeline( + createReader( + TeiReader.class, + TeiReader.PARAM_LANGUAGE, "en", + TeiReader.PARAM_SOURCE_LOCATION, "src/test/resources/corpus/", + TeiReader.PARAM_PATTERNS, new String[] {INCLUDE_PREFIX + "*.xml"}), + createEngineDescription( + createEngineDescription( + Stopwatch.class, + Stopwatch.PARAM_TIMER_NAME, "testTimer"), + createEngineDescription( + BreakIteratorSegmenter.class), + createEngineDescription( + OpenNlpPosTagger.class), + createEngineDescription( + Stopwatch.class, + Stopwatch.PARAM_TIMER_NAME, "testTimer", + Stopwatch.PARAM_OUTPUT_FILE, "target/result.txt") + ) + ); + } +} diff --git a/dkpro-core-performance-asl/suppressions.xml b/dkpro-core-performance-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-performance-asl/suppressions.xml @@ -0,0 +1,9 @@ +<?xml version="1.0"?> + +<!DOCTYPE suppressions PUBLIC +"-//Puppy Crawl//DTD Suppressions 1.1//EN" +"http://www.puppycrawl.com/dtds/suppressions_1_1.dtd"> + +<suppressions> + <suppress files=".*[/\\]target[/\\].*" checks=".*"/> +</suppressions> diff --git a/dkpro-core-posfilter-asl/pom.xml b/dkpro-core-posfilter-asl/pom.xml index 048d5b1547..5627abe666 100644 --- a/dkpro-core-posfilter-asl/pom.xml +++ b/dkpro-core-posfilter-asl/pom.xml @@ -1,122 +1,123 @@ <!-- - Copyright 2010 - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt + Copyright 2010 + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> - <relativePath>../dkpro-core-asl</relativePath> - </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.posfilter-asl</artifactId> - <packaging>jar</packaging> - <name>DKPro Core ASL - Part-of-Speech Filter</name> - <dependencies> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimaj-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimafit-core</artifactId> - </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.snowball-asl</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.conll-asl</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-de-maxent</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent</artifactId> - <scope>test</scope> - </dependency> - </dependencies> - <dependencyManagement> - <dependencies> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> - <type>pom</type> - <scope>import</scope> - </dependency> - </dependencies> - </dependencyManagement> - <build> - <pluginManagement> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-dependency-plugin</artifactId> - <configuration> - <usedDependencies> - <!-- Models not detected by byte-code analysis --> - <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-de-maxent</usedDependency> - <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent</usedDependency> - </usedDependencies> - </configuration> - </plugin> - </plugins> - </pluginManagement> - </build> + <modelVersion>4.0.0</modelVersion> + <parent> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> + <relativePath>../dkpro-core-asl</relativePath> + </parent> + <artifactId>dkpro-core-posfilter-asl</artifactId> + <packaging>jar</packaging> + <name>DKPro Core ASL - Part-of-Speech Filter</name> + <url>https://dkpro.github.io/dkpro-core/</url> + <dependencies> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimaj-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-core</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-featurepath-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-snowball-asl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-conll-asl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> + <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-de-maxent</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> + <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + <dependencyManagement> + <dependencies> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> + <type>pom</type> + <scope>import</scope> + </dependency> + </dependencies> + </dependencyManagement> + <build> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <configuration> + <usedDependencies> + <!-- Models not detected by byte-code analysis --> + <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-de-maxent</usedDependency> + <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent</usedDependency> + </usedDependencies> + </configuration> + </plugin> + </plugins> + </pluginManagement> + </build> </project> diff --git a/dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosMapper.java b/dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosMapper.java deleted file mode 100644 index 16fa22cf48..0000000000 --- a/dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosMapper.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright 2013 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.posfilter; - -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.io.Reader; -import java.util.Properties; - -import org.apache.commons.io.IOUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Maps existing POS tags from one tagset to another using a user provided properties file. - */ -@ResourceMetaData(name="POS Mapper") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) -public class PosMapper - extends JCasAnnotator_ImplBase -{ - /** - * A properties file containing POS tagset mappings. - */ - public static final String PARAM_MAPPING_FILE = "mappingFile"; - @ConfigurationParameter(name = PARAM_MAPPING_FILE, mandatory = true) - private File mappingFile; - - /** - * A properties file containing mappings from the new tagset to (fully qualified) DKPro POS - * classes.<br> - * If such a file is not supplied, the DKPro POS classes stay the same regardless of the new POS - * tag value, and only the value is changed. - */ - public static final String PARAM_DKPRO_MAPPING_LOCATION = "dkproMappingLocation"; - @ConfigurationParameter(name = PARAM_DKPRO_MAPPING_LOCATION, mandatory = false) - private String dkproMappingLocation; - - private Properties posMap; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - posMap = new Properties(); - Reader reader = null; - try { - reader = new FileReader(mappingFile); - posMap.load(reader); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - finally { - IOUtils.closeQuietly(reader); - } - - if (dkproMappingLocation != null) { - mappingProvider = new MappingProvider(); - mappingProvider.setDefault(MappingProvider.LOCATION, dkproMappingLocation); - mappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName()); - mappingProvider.setDefault("pos.tagset", "default"); - // mappingProvider.setOverride(MappingProvider.LANGUAGE, language); - // mappingProvider.addImport("pos.tagset", modelProvider); - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - if (mappingProvider != null) { - CAS cas = aJCas.getCas(); - mappingProvider.configure(cas); - - for (Token t : JCasUtil.select(aJCas, Token.class)) { - POS oldPos = t.getPos(); - String newTag = posMap.getProperty(oldPos.getPosValue()); - - // replace the POS if the value differs (i.e. if the old value has a mapping) - if (newTag != null) { - Type type = mappingProvider.getTagType(newTag); - int begin = oldPos.getBegin(); - int end = oldPos.getEnd(); - - POS newPos = (POS) cas.createAnnotation(type, begin, end); - newPos.setPosValue(newTag); - POSUtils.assignCoarseValue(newPos); - - oldPos.removeFromIndexes(); - newPos.addToIndexes(); - t.setPos(newPos); - } - } - } - // if we don't have a MappingProvider, we only re-set the tags and not the classes - else { - for (POS pos : JCasUtil.select(aJCas, POS.class)) { - String newTag = posMap.getProperty(pos.getPosValue()); - if (newTag != null) { - pos.setPosValue(newTag); - } - } - } - } -} diff --git a/dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/package-info.java b/dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/package-info.java deleted file mode 100644 index d0d0c60705..0000000000 --- a/dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Remove annotations on words with particular part of speech tags to exclude them - * from further processing. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.posfilter; diff --git a/dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosFilter.java b/dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/PosFilter.java similarity index 97% rename from dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosFilter.java rename to dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/PosFilter.java index 5d5418f703..ba58bad00b 100755 --- a/dkpro-core-posfilter-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosFilter.java +++ b/dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/PosFilter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.posfilter; +package org.dkpro.core.posfilter; import java.util.ArrayList; import java.util.List; @@ -32,9 +32,9 @@ import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.util.Level; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.featurepath.FeaturePathFactory; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADJ; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADP; @@ -55,13 +55,15 @@ import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; /** * Removes all tokens/lemmas/stems/POS tags (depending on the "Mode" setting) that do not match the * given parts of speech. * */ -@ResourceMetaData(name="POS Filter") +@ResourceMetaData(name = "POS Filter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) public class PosFilter extends JCasAnnotator_ImplBase diff --git a/dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/PosMapper.java b/dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/PosMapper.java new file mode 100644 index 0000000000..283bcbad0f --- /dev/null +++ b/dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/PosMapper.java @@ -0,0 +1,143 @@ +/* + * Copyright 2013 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.posfilter; + +import static org.dkpro.core.api.resources.MappingProvider.BASE_TYPE; +import static org.dkpro.core.api.resources.ResourceObjectProviderBase.LOCATION; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.MappingProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +/** + * Maps existing POS tags from one tagset to another using a user provided properties file. + */ +@ResourceMetaData(name = "POS Mapper") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) +public class PosMapper + extends JCasAnnotator_ImplBase +{ + /** + * A properties file containing POS tagset mappings. + */ + public static final String PARAM_MAPPING_FILE = "mappingFile"; + @ConfigurationParameter(name = PARAM_MAPPING_FILE, mandatory = true) + private File mappingFile; + + /** + * A properties file containing mappings from the new tagset to (fully qualified) DKPro POS + * classes.<br> + * If such a file is not supplied, the DKPro POS classes stay the same regardless of the new POS + * tag value, and only the value is changed. + */ + public static final String PARAM_DKPRO_MAPPING_LOCATION = "dkproMappingLocation"; + @ConfigurationParameter(name = PARAM_DKPRO_MAPPING_LOCATION, mandatory = false) + private String dkproMappingLocation; + + private Properties posMap; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + posMap = new Properties(); + try (Reader reader = new FileReader(mappingFile)) { + posMap.load(reader); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + + if (dkproMappingLocation != null) { + mappingProvider = new MappingProvider(); + mappingProvider.setContextObject(this); + mappingProvider.setDefault(LOCATION, dkproMappingLocation); + mappingProvider.setDefault(BASE_TYPE, POS.class.getName()); + mappingProvider.setDefault("pos.tagset", "default"); + // mappingProvider.setOverride(MappingProvider.LANGUAGE, language); + // mappingProvider.addImport("pos.tagset", modelProvider); + } + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + if (mappingProvider != null) { + CAS cas = aJCas.getCas(); + mappingProvider.configure(cas); + + for (Token t : JCasUtil.select(aJCas, Token.class)) { + POS oldPos = t.getPos(); + String newTag = posMap.getProperty(oldPos.getPosValue()); + + // replace the POS if the value differs (i.e. if the old value has a mapping) + if (newTag != null) { + Type type = mappingProvider.getTagType(newTag); + int begin = oldPos.getBegin(); + int end = oldPos.getEnd(); + + POS newPos = (POS) cas.createAnnotation(type, begin, end); + newPos.setPosValue(newTag); + POSUtils.assignCoarseValue(newPos); + + oldPos.removeFromIndexes(); + newPos.addToIndexes(); + t.setPos(newPos); + } + } + } + // if we don't have a MappingProvider, we only re-set the tags and not the classes + else { + for (POS pos : JCasUtil.select(aJCas, POS.class)) { + String newTag = posMap.getProperty(pos.getPosValue()); + if (newTag != null) { + pos.setPosValue(newTag); + } + } + } + } +} diff --git a/dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/package-info.java b/dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/package-info.java new file mode 100644 index 0000000000..ced9a62dc7 --- /dev/null +++ b/dkpro-core-posfilter-asl/src/main/java/org/dkpro/core/posfilter/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Remove annotations on words with particular part of speech tags to exclude them + * from further processing. + * + * @since 1.1.0 + */ +package org.dkpro.core.posfilter; diff --git a/dkpro-core-posfilter-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosMapperTest.java b/dkpro-core-posfilter-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosMapperTest.java deleted file mode 100644 index 7c637a6beb..0000000000 --- a/dkpro-core-posfilter-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosMapperTest.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.posfilter; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class PosMapperTest -{ - private final File testBase = new File("src/test/resources/de/tudarmstadt/ukp/dkpro/core/posfilter"); - private final String testDocument1 = "This sentence consists of fourtynine characters ."; - - @Test - public void testEnglishOriginal() - throws Exception - { - String testDocument = testDocument1; - - String[] posOriginal = { "DT", "NN", "VBZ", "IN", "CD", "NNS", "." }; - String[] posOriginalDkpro = { "POS_DET", "POS_NOUN", "POS_VERB", "POS_ADP", "POS_NUM", "POS_NOUN", "POS_PUNCT" }; - - runTest("en", testDocument, posOriginal, posOriginalDkpro, false); - } - - @Test - public void testEnglishMapped() - throws Exception - { - String testDocument = testDocument1; - - String[] posMapped = { "DET", "N", "V", "IN", "MADE_UP_POS", "N", "." }; - String[] posMappedDkpro = { "POS_DET", "POS_NOUN", "POS_VERB", "POS_ADP", "POS_X", "POS_NOUN", "POS_PUNCT" }; - - runTest("en", testDocument, posMapped, posMappedDkpro, true); - } - - private void runTest(String language, String testDocument, String[] aPosOriginal, - String[] aPosDkpro, boolean mapToDifferentTagset) - throws Exception - { - List<AnalysisEngineDescription> descs = new ArrayList<AnalysisEngineDescription>(); - descs.add(createEngineDescription(OpenNlpPosTagger.class, - OpenNlpPosTagger.PARAM_LANGUAGE, "en")); - - if (mapToDifferentTagset) { - descs.add(createEngineDescription(PosMapper.class, PosMapper.PARAM_MAPPING_FILE, - new File(testBase, "ptb-to-dummy.map"), PosMapper.PARAM_DKPRO_MAPPING_LOCATION, - new File(testBase, "dummy-to-dkpro.map"))); - } - - AnalysisEngineDescription aggregate = createEngineDescription(descs - .toArray(new AnalysisEngineDescription[0])); - JCas jcas = TestRunner.runTest(aggregate, language, testDocument); - - AssertAnnotations.assertPOS(aPosDkpro, aPosOriginal, select(jcas, POS.class)); - } - - @Rule - public TestName name = new TestName(); - - @Before - public void printSeparator() - { - System.out.println("\n=== " + name.getMethodName() + " ====================="); - } -} diff --git a/dkpro-core-posfilter-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosFilterTest.java b/dkpro-core-posfilter-asl/src/test/java/org/dkpro/core/posfilter/PosFilterTest.java similarity index 96% rename from dkpro-core-posfilter-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosFilterTest.java rename to dkpro-core-posfilter-asl/src/test/java/org/dkpro/core/posfilter/PosFilterTest.java index 0d1d78af22..064d8a6aa1 100644 --- a/dkpro-core-posfilter-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/posfilter/PosFilterTest.java +++ b/dkpro-core-posfilter-asl/src/test/java/org/dkpro/core/posfilter/PosFilterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.posfilter; +package org.dkpro.core.posfilter; import static java.util.Arrays.asList; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; @@ -33,6 +33,10 @@ import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.conll.Conll2006Reader; +import org.dkpro.core.posfilter.PosFilter; +import org.dkpro.core.snowball.SnowballStemmer; +import org.dkpro.core.testing.AssertAnnotations; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -42,9 +46,6 @@ import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Reader; -import de.tudarmstadt.ukp.dkpro.core.snowball.SnowballStemmer; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; import junit.framework.Assert; public class PosFilterTest diff --git a/dkpro-core-posfilter-asl/src/test/java/org/dkpro/core/posfilter/PosMapperTest.java b/dkpro-core-posfilter-asl/src/test/java/org/dkpro/core/posfilter/PosMapperTest.java new file mode 100644 index 0000000000..0021eaabab --- /dev/null +++ b/dkpro-core-posfilter-asl/src/test/java/org/dkpro/core/posfilter/PosMapperTest.java @@ -0,0 +1,100 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.posfilter; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.posfilter.PosMapper; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.TestRunner; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; + +public class PosMapperTest +{ + private final File testBase = new File("src/test/resources/org/dkpro/core/posfilter"); + private final String testDocument1 = "This sentence consists of fourtynine characters ."; + + @Test + public void testEnglishOriginal() + throws Exception + { + String testDocument = testDocument1; + + String[] posOriginal = { "DT", "NN", "VBZ", "IN", "CD", "NNS", "." }; + String[] posOriginalDkpro = { "POS_DET", "POS_NOUN", "POS_VERB", "POS_ADP", "POS_NUM", + "POS_NOUN", "POS_PUNCT" }; + + runTest("en", testDocument, posOriginal, posOriginalDkpro, false); + } + + @Test + public void testEnglishMapped() + throws Exception + { + String testDocument = testDocument1; + + String[] posMapped = { "DET", "N", "V", "IN", "MADE_UP_POS", "N", "." }; + String[] posMappedDkpro = { "POS_DET", "POS_NOUN", "POS_VERB", "POS_ADP", "POS_X", + "POS_NOUN", "POS_PUNCT" }; + + runTest("en", testDocument, posMapped, posMappedDkpro, true); + } + + private void runTest(String language, String testDocument, String[] aPosOriginal, + String[] aPosDkpro, boolean mapToDifferentTagset) + throws Exception + { + List<AnalysisEngineDescription> descs = new ArrayList<AnalysisEngineDescription>(); + descs.add(createEngineDescription(OpenNlpPosTagger.class, + OpenNlpPosTagger.PARAM_LANGUAGE, "en")); + + if (mapToDifferentTagset) { + descs.add(createEngineDescription(PosMapper.class, PosMapper.PARAM_MAPPING_FILE, + new File(testBase, "ptb-to-dummy.map"), PosMapper.PARAM_DKPRO_MAPPING_LOCATION, + new File(testBase, "dummy-to-dkpro.map"))); + } + + AnalysisEngineDescription aggregate = createEngineDescription(descs + .toArray(new AnalysisEngineDescription[0])); + JCas jcas = TestRunner.runTest(aggregate, language, testDocument); + + AssertAnnotations.assertPOS(aPosDkpro, aPosOriginal, select(jcas, POS.class)); + } + + @Rule + public TestName name = new TestName(); + + @Before + public void printSeparator() + { + System.out.println("\n=== " + name.getMethodName() + " ====================="); + } +} diff --git a/dkpro-core-posfilter-asl/src/test/resources/log4j.properties b/dkpro-core-posfilter-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-posfilter-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-posfilter-asl/src/test/resources/log4j2.xml b/dkpro-core-posfilter-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-posfilter-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-posfilter-asl/src/test/resources/de/tudarmstadt/ukp/dkpro/core/posfilter/dummy-to-dkpro.map b/dkpro-core-posfilter-asl/src/test/resources/org/dkpro/core/posfilter/dummy-to-dkpro.map similarity index 100% rename from dkpro-core-posfilter-asl/src/test/resources/de/tudarmstadt/ukp/dkpro/core/posfilter/dummy-to-dkpro.map rename to dkpro-core-posfilter-asl/src/test/resources/org/dkpro/core/posfilter/dummy-to-dkpro.map diff --git a/dkpro-core-posfilter-asl/src/test/resources/de/tudarmstadt/ukp/dkpro/core/posfilter/ptb-to-dummy.map b/dkpro-core-posfilter-asl/src/test/resources/org/dkpro/core/posfilter/ptb-to-dummy.map similarity index 100% rename from dkpro-core-posfilter-asl/src/test/resources/de/tudarmstadt/ukp/dkpro/core/posfilter/ptb-to-dummy.map rename to dkpro-core-posfilter-asl/src/test/resources/org/dkpro/core/posfilter/ptb-to-dummy.map diff --git a/dkpro-core-posfilter-asl/src/test/resources/de/tudarmstadt/ukp/dkpro/core/posfilter/treetagger-to-ptb.properties b/dkpro-core-posfilter-asl/src/test/resources/org/dkpro/core/posfilter/treetagger-to-ptb.properties similarity index 100% rename from dkpro-core-posfilter-asl/src/test/resources/de/tudarmstadt/ukp/dkpro/core/posfilter/treetagger-to-ptb.properties rename to dkpro-core-posfilter-asl/src/test/resources/org/dkpro/core/posfilter/treetagger-to-ptb.properties diff --git a/dkpro-core-readability-asl/pom.xml b/dkpro-core-readability-asl/pom.xml index d7cd8e830f..ab8a834267 100644 --- a/dkpro-core-readability-asl/pom.xml +++ b/dkpro-core-readability-asl/pom.xml @@ -1,52 +1,57 @@ <!-- - Copyright 2013 - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt + Copyright 2013 + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> - <relativePath>../dkpro-core-asl</relativePath> - </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.readability-asl</artifactId> - <packaging>jar</packaging> - <name>DKPro Core ASL - Readability</name> - <dependencies> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimaj-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimafit-core</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> + <relativePath>../dkpro-core-asl</relativePath> + </parent> + <artifactId>dkpro-core-readability-asl</artifactId> + <packaging>jar</packaging> + <name>DKPro Core ASL - Readability</name> + <url>https://dkpro.github.io/dkpro-core/</url> + <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.tokit-asl</artifactId> + <groupId>org.apache.uima</groupId> + <artifactId>uimaj-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-core</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-tokit-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -54,23 +59,23 @@ <artifactId>commons-io</artifactId> <scope>test</scope> </dependency> - </dependencies> - <build> - <resources> - <resource> - <filtering>false</filtering> - <directory>src/main/resources</directory> - <excludes> - <exclude>desc/type/**/*</exclude> - </excludes> - </resource> - <resource> - <filtering>true</filtering> - <directory>src/main/resources</directory> - <includes> - <include>desc/type/**/*</include> - </includes> - </resource> - </resources> - </build> + </dependencies> + <build> + <resources> + <resource> + <filtering>false</filtering> + <directory>src/main/resources</directory> + <excludes> + <exclude>desc/type/**/*</exclude> + </excludes> + </resource> + <resource> + <filtering>true</filtering> + <directory>src/main/resources</directory> + <includes> + <include>desc/type/**/*</include> + </includes> + </resource> + </resources> + </build> </project> \ No newline at end of file diff --git a/dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/measure/WordSyllableCounter.java b/dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/measure/WordSyllableCounter.java deleted file mode 100644 index c3938fef82..0000000000 --- a/dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/measure/WordSyllableCounter.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.readability.measure; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; - -/** - * Counts syllables in words. - * - * This class is based on the methods of 'syll_en' and 'syll_de' - * in Linux'Style' command (a part of 'diction' package). - * - * - */ -public class WordSyllableCounter { - - private final String[] vowelsArray = {"a", "e", "i", "o", "u"}; - private final String[] enVowelsArray = {"a", "e", "i", "o", "u", "y"}; - private final String[] deVowelsArray = {"a", "e", "i", "o", "u", "ä", "ö", "ü"}; - - private final Set<String> vowels; - private final Set<String> deVowels; - private final Set<String> enVowels; - - private final String languageCode; - - public WordSyllableCounter(String languageCode) - { - vowels = new HashSet<String>(Arrays.asList(vowelsArray)); - deVowels = new HashSet<String>(Arrays.asList(deVowelsArray)); - enVowels = new HashSet<String>(Arrays.asList(enVowelsArray)); - - this.languageCode = languageCode; - } - - private boolean isVowel(String character) - { - if(languageCode.equals("en")) { - return enVowels.contains(character); - } - else if(languageCode.equals("de")) { - return deVowels.contains(character); - } - else { - return vowels.contains(character); - } - } - - public int countSyllables(Iterable<String> words) { - int count = 0; - for (String word : words) { - count = count + countSyllables(word); - } - return count; - } - - public int countSyllables(String word){ - String lowcaseWord = word.toLowerCase(); - int count = 0; - - if (this.languageCode.equals("en")) { - if (lowcaseWord.length() >=2 && - lowcaseWord.substring(lowcaseWord.length() - 2, lowcaseWord.length()).equals("ed")) - { - lowcaseWord = lowcaseWord.substring(0, lowcaseWord.length() - 2); - } - } - else if (this.languageCode.equals("de")) { - if (lowcaseWord.length() >= 2 && - lowcaseWord.charAt(lowcaseWord.length() - 1) == 'e' && - !isVowel(lowcaseWord.substring(lowcaseWord.length() - 2, lowcaseWord.length() - 1))) - { - count++; - lowcaseWord = lowcaseWord.substring(0, lowcaseWord.length() - 2); - } - - } - - for(int i = 0; i < lowcaseWord.length() - 1; ++ i){ - String curCh = lowcaseWord.substring(i, i+ 1); - String nextCh = lowcaseWord.substring(i + 1, i + 2); - if(isVowel(curCh) && !isVowel(nextCh)) - ++ count; - } - return (count == 0 ? 1 : count); - } -} diff --git a/dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityAnnotator.java b/dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/ReadabilityAnnotator.java similarity index 75% rename from dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityAnnotator.java rename to dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/ReadabilityAnnotator.java index 42bc04ad0e..725180faf9 100644 --- a/dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityAnnotator.java +++ b/dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/ReadabilityAnnotator.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.readability; +package org.dkpro.core.readability; import java.util.ArrayList; import java.util.List; @@ -24,20 +24,32 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.readability.measure.ReadabilityMeasures; +import org.dkpro.core.readability.measure.ReadabilityMeasures.Measures; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.readability.measure.ReadabilityMeasures; -import de.tudarmstadt.ukp.dkpro.core.readability.measure.ReadabilityMeasures.Measures; import de.tudarmstadt.ukp.dkpro.core.type.ReadabilityScore; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Assign a set of popular readability scores to the text. */ -@ResourceMetaData(name="Readability Annotator") +@Component(OperationType.READABILITY_ANNOTATOR) +@ResourceMetaData(name = "Readability Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.type.ReadabilityScore" }) public class ReadabilityAnnotator extends JCasAnnotator_ImplBase { diff --git a/dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/measure/ReadabilityMeasures.java b/dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/measure/ReadabilityMeasures.java similarity index 83% rename from dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/measure/ReadabilityMeasures.java rename to dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/measure/ReadabilityMeasures.java index 070cb88940..4adc36437c 100644 --- a/dkpro-core-readability-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/readability/measure/ReadabilityMeasures.java +++ b/dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/measure/ReadabilityMeasures.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.readability.measure; +package org.dkpro.core.readability.measure; import java.util.ArrayList; import java.util.List; @@ -81,28 +81,30 @@ else if (measure.equals(Measures.smog)) { } } - - /* * only the strings consist of numbers or letters * are considered as words. */ - private boolean isWord(String strWord){ - for(int i = 0; i < strWord.length(); ++ i){ - char ch = strWord.charAt(i); - if(!Character.isLetterOrDigit(ch)) - return false; - } - return true; + private boolean isWord(String strWord) + { + for (int i = 0; i < strWord.length(); ++i) { + char ch = strWord.charAt(i); + if (!Character.isLetterOrDigit(ch)) { + return false; + } + } + return true; } - private List<String> filterWords(List<String> words){ - List<String> newWords = new ArrayList<String>(); - for(String word : words){ - if(isWord(word)) - newWords.add(word); - } - return newWords; + private List<String> filterWords(List<String> words) + { + List<String> newWords = new ArrayList<String>(); + for (String word : words) { + if (isWord(word)) { + newWords.add(word); + } + } + return newWords; } /** @@ -115,14 +117,15 @@ private List<String> filterWords(List<String> words){ */ public double kincaid(List<String> words, int nrofSentences) { - words = filterWords(words); + words = filterWords(words); int nrofSyllables = this.syllableCounter.countSyllables(words); return kincaid(words.size(), nrofSyllables, nrofSentences); } + private double kincaid(Integer nrofWords, Integer nrofSyllables, Integer nrofSentences) { - return 11.8 * (((double) nrofSyllables) / nrofWords) + 0.39 * (((double) nrofWords) / nrofSentences) - - 15.59; + return 11.8 * (((double) nrofSyllables) / nrofWords) + + 0.39 * (((double) nrofWords) / nrofSentences) - 15.59; } /** @@ -134,16 +137,17 @@ private double kincaid(Integer nrofWords, Integer nrofSyllables, Integer nrofSen */ public double ari(List<String> words, int nrofSentences) { - words = filterWords(words); + words = filterWords(words); int nrofLetters = this.getNrofLetters(words); return ari(nrofLetters, words.size(), nrofSentences); } + private double ari(Integer nrofLetters, Integer nrofWords, Integer nrofSentences) { - return 4.71 * (((double) nrofLetters) / nrofWords) + 0.5 * (((double) nrofWords) / nrofSentences) - 21.43; + return 4.71 * (((double) nrofLetters) / nrofWords) + + 0.5 * (((double) nrofWords) / nrofSentences) - 21.43; } - /** * Calculate Coleman-Liau formula. * @@ -154,15 +158,14 @@ private double ari(Integer nrofLetters, Integer nrofWords, Integer nrofSentences */ public double coleman_liau(List<String> words, int nrofSentences) { - words = filterWords(words); + words = filterWords(words); int nrofLetters = this.getNrofLetters(words); return coleman_liau(nrofLetters, words.size(), nrofSentences); } private double coleman_liau(Integer nrofLetters, Integer nrofWords, Integer nrofSentences) { - - return 5.89 * (((double) nrofLetters) / nrofWords) - 0.3 * (((double) nrofSentences) / (100 * nrofWords)) - - 15.8; + return 5.89 * (((double) nrofLetters) / nrofWords) + - 0.3 * (((double) nrofSentences) / (100 * nrofWords)) - 15.8; } /** @@ -174,7 +177,7 @@ private double coleman_liau(Integer nrofLetters, Integer nrofWords, Integer nrof */ public double flesch(List<String> words, int nrofSentences) { - words = filterWords(words); + words = filterWords(words); int nrofSyllables = this.syllableCounter.countSyllables(words); return flesch(nrofSyllables, words.size(), nrofSentences); } @@ -196,7 +199,7 @@ private double flesch(Integer nrofSyllables, Integer nrofWords, Integer nrofSent */ public double fog(List<String> words, int nrofSentences) { - words = filterWords(words); + words = filterWords(words); int nrofBigwords = getNrofBigwords(words); return fog(words.size(), nrofBigwords, nrofSentences); } @@ -216,31 +219,40 @@ private double fog(Integer nrofWords, Integer nrofBigwords, Integer nrofSentence */ public double lix(List<String> words, int nrofSentences) { - words = filterWords(words); + words = filterWords(words); int nrofLongWords = this.getNrofLongwords(words); return lix(words.size(), nrofLongWords, nrofSentences); } private double lix(Integer nrofWords, Integer nrofLongWords, Integer nrofSentences) { double idx = ((double) nrofWords) / nrofSentences + 100.0 * (nrofLongWords) / nrofWords; - if (idx < 34) + if (idx < 34) { return 0; - else if (idx < 38) + } + else if (idx < 38) { return 5; - else if (idx < 41) + } + else if (idx < 41) { return 6; - else if (idx < 44) + } + else if (idx < 44) { return 7; - else if (idx < 48) + } + else if (idx < 48) { return 8; - else if (idx < 51) + } + else if (idx < 51) { return 9; - else if (idx < 54) + } + else if (idx < 54) { return 10; - else if (idx < 57) + } + else if (idx < 57) { return 11; - else + } + else { return 99; + } } /** @@ -252,7 +264,7 @@ else if (idx < 57) */ public double smog(List<String> words, int nrofSentences) { - words = filterWords(words); + words = filterWords(words); int nrofBigwords = this.getNrofBigwords(words); return smog(nrofBigwords, nrofSentences); } diff --git a/dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/measure/WordSyllableCounter.java b/dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/measure/WordSyllableCounter.java new file mode 100644 index 0000000000..af95779995 --- /dev/null +++ b/dkpro-core-readability-asl/src/main/java/org/dkpro/core/readability/measure/WordSyllableCounter.java @@ -0,0 +1,104 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.readability.measure; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** + * Counts syllables in words. + * + * This class is based on the methods of 'syll_en' and 'syll_de' + * in Linux'Style' command (a part of 'diction' package). + * + * + */ +public class WordSyllableCounter { + + private final String[] vowelsArray = {"a", "e", "i", "o", "u"}; + private final String[] enVowelsArray = {"a", "e", "i", "o", "u", "y"}; + private final String[] deVowelsArray = {"a", "e", "i", "o", "u", "ä", "ö", "ü"}; + + private final Set<String> vowels; + private final Set<String> deVowels; + private final Set<String> enVowels; + + private final String languageCode; + + public WordSyllableCounter(String languageCode) + { + vowels = new HashSet<String>(Arrays.asList(vowelsArray)); + deVowels = new HashSet<String>(Arrays.asList(deVowelsArray)); + enVowels = new HashSet<String>(Arrays.asList(enVowelsArray)); + + this.languageCode = languageCode; + } + + private boolean isVowel(String character) + { + if (languageCode.equals("en")) { + return enVowels.contains(character); + } + else if (languageCode.equals("de")) { + return deVowels.contains(character); + } + else { + return vowels.contains(character); + } + } + + public int countSyllables(Iterable<String> words) { + int count = 0; + for (String word : words) { + count = count + countSyllables(word); + } + return count; + } + + public int countSyllables(String word) + { + String lowcaseWord = word.toLowerCase(); + int count = 0; + + if (this.languageCode.equals("en")) { + if (lowcaseWord.length() >= 2 && lowcaseWord + .substring(lowcaseWord.length() - 2, lowcaseWord.length()).equals("ed")) { + lowcaseWord = lowcaseWord.substring(0, lowcaseWord.length() - 2); + } + } + else if (this.languageCode.equals("de")) { + if (lowcaseWord.length() >= 2 && lowcaseWord.charAt(lowcaseWord.length() - 1) == 'e' + && !isVowel(lowcaseWord.substring(lowcaseWord.length() - 2, + lowcaseWord.length() - 1))) { + count++; + lowcaseWord = lowcaseWord.substring(0, lowcaseWord.length() - 2); + } + + } + + for (int i = 0; i < lowcaseWord.length() - 1; ++i) { + String curCh = lowcaseWord.substring(i, i + 1); + String nextCh = lowcaseWord.substring(i + 1, i + 2); + if (isVowel(curCh) && !isVowel(nextCh)) { + ++count; + } + } + return (count == 0 ? 1 : count); + } +} diff --git a/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityAnnotatorTest.java b/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityAnnotatorTest.java deleted file mode 100644 index a0bb1d86c2..0000000000 --- a/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityAnnotatorTest.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.readability; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.readability.measure.ReadabilityMeasures.Measures; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; -import de.tudarmstadt.ukp.dkpro.core.type.ReadabilityScore; - -public class ReadabilityAnnotatorTest -{ - private final static double EPSILON = 0.1; - - static HashMap<String, Double> correctResult = new HashMap<String, Double>(); - static - { - correctResult.put("kincaid", 7.6); - correctResult.put("ari", 9.1); - correctResult.put("coleman_liau", 11.6); - correctResult.put("flesch", 70.6); - correctResult.put("lix", 5.0); - correctResult.put("smog", 9.9); - correctResult.put("fog", 10.6); - } - - @Test - public void readabilityAnnotatorTest() - throws Exception - { - String text = FileUtils.readFileToString( - new File("src/test/resources/readability/test_document_en.txt") - ); - - Map<String, Boolean> measureMap = new HashMap<String,Boolean>(); - for (Measures measure : Measures.values()) { - measureMap.put(measure.name(), true); - } - - - AnalysisEngineDescription segmenter = createEngineDescription( - BreakIteratorSegmenter.class - ); - - AnalysisEngineDescription readability = createEngineDescription( - ReadabilityAnnotator.class - ); - - AnalysisEngineDescription aggregate = createEngineDescription( - segmenter, - readability - ); - - AnalysisEngine ae = createEngine(aggregate); - JCas jcas = ae.newJCas(); - jcas.setDocumentLanguage("en"); - jcas.setDocumentText(text); - ae.process(jcas); - - int i = 0; - for (ReadabilityScore score : JCasUtil.select(jcas, ReadabilityScore.class)) { - String strMeasureName = score.getMeasureName(); - double dScore = score.getScore(); - System.out.println(strMeasureName + " : " + score.getScore()); - assertTrue(measureMap.containsKey(strMeasureName)); - assertEquals(correctResult.get(strMeasureName), dScore, EPSILON); - i++; - } - assertEquals(7, i); - } -} diff --git a/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityMeasuresTest.java b/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityMeasuresTest.java deleted file mode 100644 index 16f57c8f73..0000000000 --- a/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/ReadabilityMeasuresTest.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.readability; - -import static org.junit.Assert.assertEquals; - -import java.lang.reflect.Method; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.readability.measure.ReadabilityMeasures; - -public class ReadabilityMeasuresTest -{ - - private final static double EPSILON = 0.001; - - @Test - public void testKincaid() - throws Exception - { - ReadabilityMeasures rm = new ReadabilityMeasures(); - Method method = rm.getClass().getDeclaredMethod("kincaid", new Class[]{Integer.class, Integer.class, Integer.class}); - method.setAccessible(true); - double dScore = (Double)method.invoke(rm, new Object[]{new Integer(292), new Integer(415), new Integer(17)}); - method.setAccessible(false); - System.out.println("kincaid:" + dScore); - assertEquals(dScore, 7.879, EPSILON); - } - - @Test - public void testAri() - throws Exception - { - ReadabilityMeasures rm = new ReadabilityMeasures(); - Method method = rm.getClass().getDeclaredMethod("ari", new Class[]{Integer.class, Integer.class, Integer.class}); - method.setAccessible(true); - double dScore = (Double)method.invoke(rm, new Object[]{new Integer(1359), new Integer(292), new Integer(17)}); - method.setAccessible(false); - System.out.println("ari:" + dScore); - assertEquals(dScore, 9.079, EPSILON); - } - - @Test - public void testColeman_liau() - throws Exception - { - ReadabilityMeasures rm = new ReadabilityMeasures(); - Method method = rm.getClass().getDeclaredMethod("coleman_liau", new Class[]{Integer.class, Integer.class, Integer.class}); - method.setAccessible(true); - double dScore = (Double)method.invoke(rm, new Object[]{new Integer(1359), new Integer(292), new Integer(17)}); - method.setAccessible(false); - System.out.println("coleman_liau:" + dScore); - assertEquals(dScore, 11.612, EPSILON); - } - - @Test - public void testFlesch() - throws Exception - { - ReadabilityMeasures rm = new ReadabilityMeasures(); - Method method = rm.getClass().getDeclaredMethod("flesch", new Class[]{Integer.class, Integer.class, Integer.class}); - method.setAccessible(true); - double dScore = (Double)method.invoke(rm, new Object[]{new Integer(415), new Integer(292), new Integer(17)}); - method.setAccessible(false); - System.out.println("flesch:" + dScore); - assertEquals(dScore, 69.165, EPSILON); - } - -} diff --git a/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/WordSyllableCounterTest.java b/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/WordSyllableCounterTest.java deleted file mode 100644 index c4fc3115db..0000000000 --- a/dkpro-core-readability-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/readability/WordSyllableCounterTest.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.readability; - -import static org.junit.Assert.assertEquals; - -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.readability.measure.WordSyllableCounter; - -public class WordSyllableCounterTest -{ - @Test - public void countWordSyllTest_vowelPairs() - throws Exception - { - WordSyllableCounter wsc = new WordSyllableCounter("en"); - - assertEquals(4, wsc.countSyllables("analysis")); - assertEquals(2, wsc.countSyllables("teacher")); - - //TODO - /* - * According to Linux 'style' algorithm, the syllables number - * of "readability" is 4. But in fact it should be 5. This means - * Linux'Style' algorithm is not always precise. - */ - assertEquals(4, wsc.countSyllables("readability")); - } - - @Test - public void countWordSyllTest_case() - throws Exception - { - WordSyllableCounter wsc = new WordSyllableCounter("en"); - - assertEquals(1, wsc.countSyllables("pEA")); - } - -} diff --git a/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/ReadabilityAnnotatorTest.java b/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/ReadabilityAnnotatorTest.java new file mode 100644 index 0000000000..440d2ba744 --- /dev/null +++ b/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/ReadabilityAnnotatorTest.java @@ -0,0 +1,91 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.readability; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.readability.ReadabilityAnnotator; +import org.dkpro.core.readability.measure.ReadabilityMeasures.Measures; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.type.ReadabilityScore; + +public class ReadabilityAnnotatorTest +{ + private final static double EPSILON = 0.1; + + static HashMap<String, Double> correctResult = new HashMap<String, Double>(); + static + { + correctResult.put("kincaid", 7.6); + correctResult.put("ari", 9.1); + correctResult.put("coleman_liau", 11.6); + correctResult.put("flesch", 70.6); + correctResult.put("lix", 5.0); + correctResult.put("smog", 9.9); + correctResult.put("fog", 10.6); + } + + @Test + public void readabilityAnnotatorTest() + throws Exception + { + String text = FileUtils.readFileToString( + new File("src/test/resources/readability/test_document_en.txt") + ); + + Map<String, Boolean> measureMap = new HashMap<String, Boolean>(); + for (Measures measure : Measures.values()) { + measureMap.put(measure.name(), true); + } + + AnalysisEngineDescription aggregate = createEngineDescription( + createEngineDescription(BreakIteratorSegmenter.class), + createEngineDescription(ReadabilityAnnotator.class)); + + AnalysisEngine ae = createEngine(aggregate); + JCas jcas = ae.newJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText(text); + ae.process(jcas); + + int i = 0; + for (ReadabilityScore score : JCasUtil.select(jcas, ReadabilityScore.class)) { + String strMeasureName = score.getMeasureName(); + double dScore = score.getScore(); + System.out.println(strMeasureName + " : " + score.getScore()); + assertTrue(measureMap.containsKey(strMeasureName)); + assertEquals(correctResult.get(strMeasureName), dScore, EPSILON); + i++; + } + assertEquals(7, i); + } +} diff --git a/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/ReadabilityMeasuresTest.java b/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/ReadabilityMeasuresTest.java new file mode 100644 index 0000000000..0c59f7c3b5 --- /dev/null +++ b/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/ReadabilityMeasuresTest.java @@ -0,0 +1,86 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.readability; + +import static org.junit.Assert.assertEquals; + +import java.lang.reflect.Method; + +import org.dkpro.core.readability.measure.ReadabilityMeasures; +import org.junit.Test; + +public class ReadabilityMeasuresTest +{ + private final static double EPSILON = 0.001; + + @Test + public void testKincaid() throws Exception + { + ReadabilityMeasures rm = new ReadabilityMeasures(); + Method method = rm.getClass().getDeclaredMethod("kincaid", + new Class[] { Integer.class, Integer.class, Integer.class }); + method.setAccessible(true); + double dScore = (Double) method.invoke(rm, + new Object[] { new Integer(292), new Integer(415), new Integer(17) }); + method.setAccessible(false); + System.out.println("kincaid:" + dScore); + assertEquals(dScore, 7.879, EPSILON); + } + + @Test + public void testAri() throws Exception + { + ReadabilityMeasures rm = new ReadabilityMeasures(); + Method method = rm.getClass().getDeclaredMethod("ari", + new Class[] { Integer.class, Integer.class, Integer.class }); + method.setAccessible(true); + double dScore = (Double) method.invoke(rm, + new Object[] { new Integer(1359), new Integer(292), new Integer(17) }); + method.setAccessible(false); + System.out.println("ari:" + dScore); + assertEquals(dScore, 9.079, EPSILON); + } + + @Test + public void testColeman_liau() throws Exception + { + ReadabilityMeasures rm = new ReadabilityMeasures(); + Method method = rm.getClass().getDeclaredMethod("coleman_liau", + new Class[] { Integer.class, Integer.class, Integer.class }); + method.setAccessible(true); + double dScore = (Double) method.invoke(rm, + new Object[] { new Integer(1359), new Integer(292), new Integer(17) }); + method.setAccessible(false); + System.out.println("coleman_liau:" + dScore); + assertEquals(dScore, 11.612, EPSILON); + } + + @Test + public void testFlesch() throws Exception + { + ReadabilityMeasures rm = new ReadabilityMeasures(); + Method method = rm.getClass().getDeclaredMethod("flesch", + new Class[] { Integer.class, Integer.class, Integer.class }); + method.setAccessible(true); + double dScore = (Double) method.invoke(rm, + new Object[] { new Integer(415), new Integer(292), new Integer(17) }); + method.setAccessible(false); + System.out.println("flesch:" + dScore); + assertEquals(dScore, 69.165, EPSILON); + } +} diff --git a/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/WordSyllableCounterTest.java b/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/WordSyllableCounterTest.java new file mode 100644 index 0000000000..cd6e64241b --- /dev/null +++ b/dkpro-core-readability-asl/src/test/java/org/dkpro/core/readability/WordSyllableCounterTest.java @@ -0,0 +1,50 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.readability; + +import static org.junit.Assert.assertEquals; + +import org.dkpro.core.readability.measure.WordSyllableCounter; +import org.junit.Test; + +public class WordSyllableCounterTest +{ + @Test + public void countWordSyllTest_vowelPairs() throws Exception + { + WordSyllableCounter wsc = new WordSyllableCounter("en"); + + assertEquals(4, wsc.countSyllables("analysis")); + assertEquals(2, wsc.countSyllables("teacher")); + + // TODO + /* + * According to Linux 'style' algorithm, the syllables number of "readability" is 4. But in + * fact it should be 5. This means Linux'Style' algorithm is not always precise. + */ + assertEquals(4, wsc.countSyllables("readability")); + } + + @Test + public void countWordSyllTest_case() throws Exception + { + WordSyllableCounter wsc = new WordSyllableCounter("en"); + + assertEquals(1, wsc.countSyllables("pEA")); + } +} diff --git a/dkpro-core-readability-asl/suppressions.xml b/dkpro-core-readability-asl/suppressions.xml new file mode 100644 index 0000000000..05381817ea --- /dev/null +++ b/dkpro-core-readability-asl/suppressions.xml @@ -0,0 +1,9 @@ +<?xml version="1.0"?> + +<!DOCTYPE suppressions PUBLIC +"-//Puppy Crawl//DTD Suppressions 1.1//EN" +"http://www.puppycrawl.com/dtds/suppressions_1_1.dtd"> + +<suppressions> + <suppress files=".*[/\\]target[/\\].*" checks=".*"/> +</suppressions> diff --git a/dkpro-core-rftagger-asl/pom.xml b/dkpro-core-rftagger-asl/pom.xml index 560e1da998..1913f8a10b 100644 --- a/dkpro-core-rftagger-asl/pom.xml +++ b/dkpro-core-rftagger-asl/pom.xml @@ -18,13 +18,14 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.rftagger-asl</artifactId> + <artifactId>dkpro-core-rftagger-asl</artifactId> <name>DKPro Core ASL - RF Tagger</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -39,24 +40,28 @@ <artifactId>commons-io</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -64,8 +69,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> diff --git a/dkpro-core-rftagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/rftagger/RfTagger.java b/dkpro-core-rftagger-asl/src/main/java/org/dkpro/core/rftagger/RfTagger.java similarity index 81% rename from dkpro-core-rftagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/rftagger/RfTagger.java rename to dkpro-core-rftagger-asl/src/main/java/org/dkpro/core/rftagger/RfTagger.java index c3a8bdcb29..5ecbea3daf 100644 --- a/dkpro-core-rftagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/rftagger/RfTagger.java +++ b/dkpro-core-rftagger-asl/src/main/java/org/dkpro/core/rftagger/RfTagger.java @@ -15,9 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.rftagger; +package org.dkpro.core.rftagger; import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -43,27 +44,31 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.LittleEndianDataInputStream; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.PlatformDetector; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.api.resources.RuntimeProvider; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.LittleEndianDataInputStream; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.PlatformDetector; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Rftagger morphological analyzer. */ -@ResourceMetaData(name="RFTagger Morphological Analyzer") +@Component(OperationType.MORPHOLOGICAL_TAGGER) +@ResourceMetaData(name = "RFTagger Morphological Analyzer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -88,6 +93,20 @@ public class RfTagger @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the model automatically. */ @@ -102,15 +121,29 @@ public class RfTagger @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) protected String modelEncoding; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating the * mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; - public static final String PARAM_MORPH_MAPPING_LOCATION = ComponentParameters.PARAM_MORPH_MAPPING_LOCATION; + /** + * Load the morphological features mapping from this location instead of locating the + * mapping automatically. + */ + public static final String PARAM_MORPH_MAPPING_LOCATION = + ComponentParameters.PARAM_MORPH_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_MORPH_MAPPING_LOCATION, mandatory = false) private String morphMappingLocation; @@ -142,6 +175,10 @@ public void initialize(UimaContext aContext) modelProvider = new ModelProviderBase<File>(this, "rftagger", "morph") { + { + setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/rftagger"); + } + @Override protected File produceResource(URL aUrl) throws IOException @@ -196,8 +233,8 @@ private String readZeroTerminatedString(DataInput aIn, String aEncoding) } }; - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); + mappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); featuresParser = new MorphologicalFeaturesParser(this, modelProvider); } diff --git a/dkpro-core-rftagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/rftagger/RfTaggerTest.java b/dkpro-core-rftagger-asl/src/test/java/org/dkpro/core/rftagger/RfTaggerTest.java similarity index 99% rename from dkpro-core-rftagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/rftagger/RfTaggerTest.java rename to dkpro-core-rftagger-asl/src/test/java/org/dkpro/core/rftagger/RfTaggerTest.java index 086c0077de..2899eb1e52 100644 --- a/dkpro-core-rftagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/rftagger/RfTaggerTest.java +++ b/dkpro-core-rftagger-asl/src/test/java/org/dkpro/core/rftagger/RfTaggerTest.java @@ -15,15 +15,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.rftagger; +package org.dkpro.core.rftagger; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertMorph; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertPOS; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTagset; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTagsetParser; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; import static java.util.Arrays.asList; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertMorph; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetParser; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; import java.util.Arrays; import java.util.HashSet; @@ -32,14 +32,14 @@ import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class RfTaggerTest { @@ -71,7 +71,8 @@ public void testCzech() "[ 30, 32] - - - - - - - - - - - - - - - - - na (R.R.-.-.6.-.-.-.-.-.-.-.-.-.-)", "[ 33, 40] - - - - - - - - - - - - - - - - - medvěda (A.A.F.S.6.-.-.-.-.1.A.-.-.-.-)", "[ 41, 46] - - - - - - - - - - - - - - - - - tváři (N.N.F.S.6.-.-.-.-.-.A.-.-.-.-)", - "[ 47, 48] - - - - - - - - - - - - - - - - - . (Z.:.-.-.-.-.-.-.-.-.-.-.-.-.-)" }; + "[ 47, 48] - - - - - - - - - - - - - - - - - . (Z.:.-.-.-.-.-.-.-.-.-.-.-.-.-)" + }; String[] morphTags = { "A.2.-.-.-.-.-.-.-.-.A.-.-.-.-", "A.A.F.D.7.-.-.-.-.1.A.-.-.-.-", "A.A.F.P.1.-.-.-.-.1.A.-.-.-.-", "A.A.F.P.1.-.-.-.-.1.A.-.-.-.6", @@ -721,7 +722,8 @@ public void testGerman() String[] tokens = { "Er", "nahm", "meine", "Fackel", "und", "schlug", "sie", "dem", "Bär", "ins", "Gesicht", "." }; - String[] posOrig = { "PRO", "VFIN", "PRO", "N", "CONJ", "VFIN", "PRO", "ART", "N", "APPRART", "N", "SYM" }; + String[] posOrig = { "PRO", "VFIN", "PRO", "N", "CONJ", "VFIN", "PRO", "ART", "N", + "APPRART", "N", "SYM" }; String[] posMapped = { "POS_PRON", "POS_VERB", "POS_PRON", "POS_NOUN", "POS_CONJ", "POS_VERB", "POS_PRON", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_PUNCT" }; @@ -738,7 +740,8 @@ public void testGerman() "[ 40, 43] - - Dat - - Masc - - Sing - - - - - - - - Bär (N.Reg.Dat.Sg.Masc)", "[ 44, 47] - - Acc - - Neut - - Sing - - - - - - - - ins (APPRART.Acc.Sg.Neut)", "[ 48, 55] - - Acc - - Neut - - Sing - - - - - - - - Gesicht (N.Reg.Acc.Sg.Neut)", - "[ 56, 57] - - - - - - - - - - - - - - - - - . (SYM.Pun.Sent)" }; + "[ 56, 57] - - - - - - - - - - - - - - - - - . (SYM.Pun.Sent)" + }; String[] morphTags = { "ADJA.Comp.*.Pl.Fem", "ADJA.Comp.*.Sg.Fem", "ADJA.Comp.Acc.Pl.*", "ADJA.Comp.Acc.Pl.Fem", "ADJA.Comp.Acc.Pl.Masc", "ADJA.Comp.Acc.Pl.Neut", @@ -1128,7 +1131,8 @@ public void testHungarian() "[ 43, 44] - - - - - - - - - - - - - - - - - a (T.f)", "[ 45, 50] - - - - - - - - - - - - - - - - - medve (N.c.s.n)", "[ 51, 57] - - - - - - - - - - - - - - - - - arcára (N.c.s.s)", - "[ 58, 59] - - - - - - - - - - - - - - - - - . (IP.sent.period)" }; + "[ 58, 59] - - - - - - - - - - - - - - - - - . (IP.sent.period)" + }; String[] morphTags = { "A.f.c.p.2", "A.f.c.p.3", "A.f.c.p.a", "A.f.c.p.b", "A.f.c.p.c", "A.f.c.p.d", "A.f.c.p.g", "A.f.c.p.i", "A.f.c.p.n", "A.f.c.p.s", "A.f.c.p.t", @@ -1302,7 +1306,8 @@ public void testRussian() "[ 31, 32] - - - - - - - - - - - - - - - - - в (S.p.-.a)", "[ 33, 37] - - - - - - - - - - - - - - - - - лицо (N.c.n.s.a.n.-)", "[ 38, 45] - - - - - - - - - - - - - - - - - медведя (N.c.m.s.g.y.-)", - "[ 46, 47] - - - - - - - - - - - - - - - - - . (SENT)" }; + "[ 46, 47] - - - - - - - - - - - - - - - - - . (SENT)" + }; String[] morphTags = { ",", "-", "A.f.c.m.s.n.f", "A.f.p.f.p.g.f", "A.f.p.f.s.a.f", "A.f.p.f.s.a.s", "A.f.p.f.s.d.f", "A.f.p.f.s.g.f", "A.f.p.f.s.i.f", @@ -1600,7 +1605,8 @@ public void testSlovene() String[] posOrig = { "V", "P", "N", "C", "P", "V", "S", "N", "N", "Z" }; - String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS" }; + String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", + "POS" }; String[] morph = { "[ 0, 4] - - - - - - - - - - - - - - - - - Vzel (V.m.e.p.-.s.m)", @@ -1612,7 +1618,8 @@ public void testSlovene() "[ 31, 33] - - - - - - - - - - - - - - - - - na (S.l)", "[ 34, 40] - - - - - - - - - - - - - - - - - obrazu (N.c.m.s.l)", "[ 41, 48] - - - - - - - - - - - - - - - - - medveda (N.c.m.s.g)", - "[ 49, 50] - - - - - - - - - - - - - - - - - . (Z.p.-)" }; + "[ 49, 50] - - - - - - - - - - - - - - - - - . (Z.p.-)" + }; String[] morphTags = { "A.g.c.f.d.a", "A.g.c.f.d.i", "A.g.c.f.d.n", "A.g.c.f.p.a", "A.g.c.f.p.d", "A.g.c.f.p.g", "A.g.c.f.p.i", "A.g.c.f.p.l", "A.g.c.f.p.n", @@ -1865,7 +1872,8 @@ public void testSlovak() String[] posOrig = { "VL", "PP", "SS", "O", "VI", "PF", "E", "SS", "VK", "Z" }; - String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS" }; + String[] posMapped = { "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", "POS", + "POS" }; String[] morph = { "[ 0, 4] - - - - - - - - - - - - - - - - - Vzal (VL.d.s.c.m.+.:-)", @@ -1877,7 +1885,8 @@ public void testSlovak() "[ 31, 33] - - - - - - - - - - - - - - - - - na (E.u.4.-.:-)", "[ 34, 41] - - - - - - - - - - - - - - - - - medveďa (SS.m.s.4.-.:-)", "[ 42, 47] - - - - - - - - - - - - - - - - - tvári (VK.e.s.c.+.:-)", - "[ 48, 49] - - - - - - - - - - - - - - - - - . (Z.:-)" }; + "[ 48, 49] - - - - - - - - - - - - - - - - - . (Z.:-)" + }; String[] morphTags = { "#", "%.:-", "%.:r", "0.:-", "0.:q", "AA.f.p.1.x.:-", "AA.f.p.1.x.:q", "AA.f.p.1.x.:r", "AA.f.p.1.x.:rq", "AA.f.p.1.y.:-", "AA.f.p.1.z.:-", diff --git a/dkpro-core-rftagger-asl/src/test/resources/log4j.properties b/dkpro-core-rftagger-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-rftagger-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-rftagger-asl/src/test/resources/log4j2.xml b/dkpro-core-rftagger-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-rftagger-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-sfst-gpl/.license-header.txt b/dkpro-core-sfst-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-sfst-gpl/.license-header.txt +++ b/dkpro-core-sfst-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-sfst-gpl/LICENSE.txt b/dkpro-core-sfst-gpl/LICENSE.txt index 6e22a15c3c..99ace43661 100644 --- a/dkpro-core-sfst-gpl/LICENSE.txt +++ b/dkpro-core-sfst-gpl/LICENSE.txt @@ -654,7 +654,7 @@ the "copyright" line and a pointer to where the full notice is found. GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. Also add information on how to contact you by electronic and paper mail. diff --git a/dkpro-core-sfst-gpl/pom.xml b/dkpro-core-sfst-gpl/pom.xml index 257b67a28d..defa89fc46 100644 --- a/dkpro-core-sfst-gpl/pom.xml +++ b/dkpro-core-sfst-gpl/pom.xml @@ -1,6 +1,6 @@ <!-- - Copyright 2007-2017 + Copyright 2007-2019 Ubiquitous Knowledge Processing (UKP) Lab Technische Universität Darmstadt @@ -15,20 +15,21 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-gpl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-gpl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-gpl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.sfst-gpl</artifactId> + <artifactId>dkpro-core-sfst-gpl</artifactId> <packaging>jar</packaging> <name>DKPro Core GPL - SFST</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -39,37 +40,41 @@ <artifactId>uimafit-core</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.sfst-bin</artifactId> </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> diff --git a/dkpro-core-sfst-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/sfst/SfstAnnotator.java b/dkpro-core-sfst-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/sfst/SfstAnnotator.java deleted file mode 100644 index 9c6d06ed8e..0000000000 --- a/dkpro-core-sfst-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/sfst/SfstAnnotator.java +++ /dev/null @@ -1,343 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.sfst; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.BufferedReader; -import java.io.ByteArrayOutputStream; -import java.io.DataInput; -import java.io.File; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.lang.ProcessBuilder.Redirect; -import java.net.URL; -import java.util.List; -import java.util.Properties; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.LittleEndianDataInputStream; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * SFST morphological analyzer. - */ -@ResourceMetaData(name="SFST Morphological Analyzer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures"}) -public class SfstAnnotator - extends JCasAnnotator_ImplBase -{ - private static final String FLUSH_TOKEN = "-= FLUSH =-"; - - public static enum Mode { - FIRST, - ALL - } - - /** - * Write part-of-speech information. - * - * Default: {@code true} - */ - public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; - @ConfigurationParameter(name=PARAM_WRITE_POS, mandatory=true, defaultValue="true") - private boolean writePos; - - /** - * Write lemma information. - * - * Default: {@code true} - */ - public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; - @ConfigurationParameter(name=PARAM_WRITE_LEMMA, mandatory=true, defaultValue="true") - private boolean writeLemma; - - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - private String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - private String variant; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - private String modelLocation; - - /** - * Write the tag set(s) to the log when a model is loaded. - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - /** - * Specifies the model encoding. - */ - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue="UTF-8") - private String modelEncoding; - - public static final String PARAM_MODE = "mode"; - @ConfigurationParameter(name = PARAM_MODE, mandatory = true, defaultValue="FIRST") - private Mode mode; - - public static final String PARAM_MORPH_MAPPING_LOCATION = ComponentParameters.PARAM_MORPH_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_MORPH_MAPPING_LOCATION, mandatory = false) - private String morphMappingLocation; - - private ModelProviderBase<File> modelProvider; - private MorphologicalFeaturesParser featuresParser; - private RuntimeProvider runtimeProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - // Returns FST automaton for specified language, which is then passed to fst-infl from SFST. - // Currently available for Turkish and German. - modelProvider = new ModelProviderBase<File>(this, "sfst", "morph") - { - @Override - protected File produceResource(URL aUrl) - throws IOException - { - Properties metadata = getResourceMetaData(); - - SingletonTagset morphFeats = new SingletonTagset( - MorphologicalFeatures.class, metadata.getProperty("morph.tagset")); - - try (LittleEndianDataInputStream is = new LittleEndianDataInputStream( - aUrl.openStream())) { - byte type = is.readByte(); // "c" for "compact" - if (type != 0x63) { - throw new IOException("Incompatible model. Must be a compact model."); - } - byte enc = is.readByte(); // "0" for ??? - "1" for UTF-8 - getLogger().info("Model encoding: " + (enc == 0 ? "unknown" : "UTF-8")); - short n = is.readShort(); // alphabet size - for (int i = 0; i < n; i++) { - @SuppressWarnings("unused") - int idx = is.readShort(); // need to read index - String symbol = readZeroTerminatedString(is, "UTF-8"); - if (symbol.startsWith("<") && symbol.endsWith(">") && symbol.length() > 2) { - morphFeats.add(symbol); - } - } - } - addTagset(morphFeats); - - if (printTagSet) { - getLogger().info(getTagset().toString()); - } - - return ResourceUtils.getUrlAsFile(aUrl, true); - } - - private String readZeroTerminatedString(DataInput aIn, String aEncoding) - throws IOException - { - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - byte b = aIn.readByte(); - while (b != 0) { - bos.write(b); - b = aIn.readByte(); - } - return new String(bos.toByteArray(), aEncoding); - } - }; - - featuresParser = new MorphologicalFeaturesParser(this, modelProvider); - - // provider for the sfst binary - runtimeProvider = new RuntimeProvider("classpath:/de/tudarmstadt/ukp/dkpro/core/sfst/bin/"); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - featuresParser.configure(cas); - - String modelEncoding = (String) modelProvider.getResourceMetaData().get("model.encoding"); - if (modelEncoding == null) { - throw new AnalysisEngineProcessException( - new Throwable("Model should contain encoding metadata")); - } - File model = modelProvider.getResource(); - File executable; - - try { - executable = runtimeProvider.getFile("fst-infl2"); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - - ProcessBuilder pb = new ProcessBuilder(executable.getAbsolutePath(), "-s", "-q", - model.getAbsolutePath()); - pb.redirectError(Redirect.INHERIT); - - StringBuffer lastOut = new StringBuffer(); - String lastIn = null; - boolean success = false; - Process proc = null; - try { - proc = pb.start(); - - PrintWriter out = new PrintWriter(new OutputStreamWriter(proc.getOutputStream(), - modelEncoding)); - BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), - modelEncoding)); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List<Token> tokens = selectCovered(Token.class, sentence); - - // Skip empty sentences - if (tokens.isEmpty()) { - continue; - } - - // Send full sentence - for (Token token : tokens) { - lastOut.append(token.getText()).append(' '); - out.printf("%s%n", token.getText()); - out.printf("%s%n", FLUSH_TOKEN); - } - out.flush(); - - // Read sentence tags - tokenLoop: for (Token token : tokens) { - boolean skip = false; - analysisLoop: while ((lastIn = in.readLine()) != null) { - // Analysis line - if (lastIn.startsWith(">")) { - // Echo line, ignore. - continue analysisLoop; - } - - if (lastIn.contains(FLUSH_TOKEN)) { - // End of analysis - continue tokenLoop; - } - - if (lastIn.startsWith("no result for")) { - // No analysis for this token - MorphologicalFeatures morph = new MorphologicalFeatures(aJCas, - token.getBegin(), token.getEnd()); - morph.setValue(""); - morph.addToIndexes(); - - if (token.getMorph() == null) { - token.setMorph(morph); - } - - // We need to continue the inner loop because we still need to consume - // the flush marker. - continue analysisLoop; - } - - // Analysis line - if (!skip) { - MorphologicalFeatures morph = featuresParser - .parse(aJCas, token, lastIn); - - if (token.getMorph() == null) { - token.setMorph(morph); - } - } - - switch (mode) { - case FIRST: - // Go to next token after reading first analysis - skip = true; - break; - case ALL: - // We record all analyses - break; - } - } - } - - lastOut.setLength(0); - } - - success = true; - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - finally { - if (!success) { - getLogger().error("Sent before error: [" + lastOut + "]"); - getLogger().error("Last response before error: [" + lastIn + "]"); - } - if (proc != null) { - proc.destroy(); - } - } - } - - @Override - public void destroy() - { - runtimeProvider.uninstall(); - super.destroy(); - } -} diff --git a/dkpro-core-sfst-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/sfst/package-info.java b/dkpro-core-sfst-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/sfst/package-info.java deleted file mode 100644 index 0c02a32671..0000000000 --- a/dkpro-core-sfst-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/sfst/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -/** - * SFST-based morphological analysis. - * - * @since 1.5.0 - */ -package de.tudarmstadt.ukp.dkpro.core.sfst; diff --git a/dkpro-core-sfst-gpl/src/main/java/org/dkpro/core/sfst/SfstAnnotator.java b/dkpro-core-sfst-gpl/src/main/java/org/dkpro/core/sfst/SfstAnnotator.java new file mode 100644 index 0000000000..256b44c383 --- /dev/null +++ b/dkpro-core-sfst-gpl/src/main/java/org/dkpro/core/sfst/SfstAnnotator.java @@ -0,0 +1,405 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.sfst; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.DataInput; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.lang.ProcessBuilder.Redirect; +import java.net.URL; +import java.util.List; +import java.util.Locale; +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.LittleEndianDataInputStream; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.api.resources.RuntimeProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * SFST morphological analyzer. + */ +@Component(OperationType.MORPHOLOGICAL_TAGGER) +@ResourceMetaData(name = "SFST Morphological Analyzer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures"}) +public class SfstAnnotator + extends JCasAnnotator_ImplBase +{ + private static final String FLUSH_TOKEN = "-= FLUSH =-"; + + public static enum Mode { + FIRST, + ALL + } + + /** + * Write part-of-speech information. + */ + public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; + @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") + private boolean writePos; + + /** + * Write lemma information. + */ + public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; + @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") + private boolean writeLemma; + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + private String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + private String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + private String modelLocation; + + /** + * Write the tag set(s) to the log when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Specifies the model encoding. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") + private String modelEncoding; + + /** + * Whether to record only the first ({@code FIRST}) or all possible analyses ({@code ALL}). + */ + public static final String PARAM_MODE = "mode"; + @ConfigurationParameter(name = PARAM_MODE, mandatory = true, defaultValue = "FIRST") + private Mode mode; + + /** + * Load the morphological features mapping from this location instead of locating the + * mapping automatically. + */ + public static final String PARAM_MORPH_MAPPING_LOCATION = + ComponentParameters.PARAM_MORPH_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_MORPH_MAPPING_LOCATION, mandatory = false) + private String morphMappingLocation; + + /** + * Whether to lookup the first word of a sentence in lowercase, useful if the employed model + * does not handle lowercasing. + */ + public static final String PARAM_LOWERCASE_FIRST_WORD = "lowercaseFirstWord"; + @ConfigurationParameter(name = PARAM_LOWERCASE_FIRST_WORD, mandatory = false, defaultValue = "false") + private boolean lowercaseFirstWord; + + private ModelProviderBase<File> modelProvider; + private MorphologicalFeaturesParser featuresParser; + private RuntimeProvider runtimeProvider; + private Locale locale; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + // Returns FST automaton for specified language, which is then passed to fst-infl from SFST. + // Currently available for Turkish and German. + modelProvider = new ModelProviderBase<File>(this, "sfst", "morph") + { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/sfst/lib/morph-${language}-${variant}.properties"); + } + + @Override + protected File produceResource(URL aUrl) + throws IOException + { + Properties metadata = getResourceMetaData(); + + SingletonTagset morphFeats = new SingletonTagset( + MorphologicalFeatures.class, metadata.getProperty("morph.tagset")); + + try (LittleEndianDataInputStream is = new LittleEndianDataInputStream( + aUrl.openStream())) { + byte type = is.readByte(); // "c" for "compact" + if (type != 0x63) { + throw new IOException("Incompatible model. Must be a compact model."); + } + byte enc = is.readByte(); // "0" for ??? - "1" for UTF-8 + getLogger().info("Model encoding: " + (enc == 0 ? "unknown" : "UTF-8")); + short n = is.readShort(); // alphabet size + for (int i = 0; i < n; i++) { + @SuppressWarnings("unused") + int idx = is.readShort(); // need to read index + String symbol = readZeroTerminatedString(is, "UTF-8"); + if (symbol.startsWith("<") && symbol.endsWith(">") && symbol.length() > 2) { + morphFeats.add(symbol); + } + } + } + addTagset(morphFeats); + + if (printTagSet) { + getLogger().info(getTagset().toString()); + } + + return ResourceUtils.getUrlAsFile(aUrl, true); + } + + private String readZeroTerminatedString(DataInput aIn, String aEncoding) + throws IOException + { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + byte b = aIn.readByte(); + while (b != 0) { + bos.write(b); + b = aIn.readByte(); + } + return new String(bos.toByteArray(), aEncoding); + } + }; + + featuresParser = new MorphologicalFeaturesParser(this, modelProvider); + + // provider for the sfst binary + runtimeProvider = new RuntimeProvider("classpath:/de/tudarmstadt/ukp/dkpro/core/sfst/bin/"); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + modelProvider.configure(cas); + featuresParser.configure(cas); + + if (lowercaseFirstWord) { + // locale for lowercasing + locale = new Locale( + PARAM_LANGUAGE != null ? PARAM_LANGUAGE : cas.getDocumentLanguage()); + } + + String modelEncoding = (String) modelProvider.getResourceMetaData().get("model.encoding"); + if (modelEncoding == null) { + throw new AnalysisEngineProcessException( + new Throwable("Model should contain encoding metadata")); + } + File model = modelProvider.getResource(); + File executable; + + try { + executable = runtimeProvider.getFile("fst-infl2"); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + + ProcessBuilder pb = new ProcessBuilder(executable.getAbsolutePath(), "-s", "-q", + model.getAbsolutePath()); + pb.redirectError(Redirect.INHERIT); + + StringBuffer lastOut = new StringBuffer(); + String lastIn = null; + boolean success = false; + Process proc = null; + try { + proc = pb.start(); + + PrintWriter out = new PrintWriter(new OutputStreamWriter(proc.getOutputStream(), + modelEncoding)); + BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), + modelEncoding)); + + for (Sentence sentence : select(aJCas, Sentence.class)) { + List<Token> tokens = selectCovered(Token.class, sentence); + + // Skip empty sentences + if (tokens.isEmpty()) { + continue; + } + + // Send full sentence + boolean first = true; + for (Token token : tokens) { + lastOut.append(token.getCoveredText()).append(' '); + out.printf("%s%n", token.getCoveredText()); + // treat first token differently if parameter is set + if (first && lowercaseFirstWord) { + String lcToken = token.getCoveredText().toLowerCase(locale); + if (!lcToken.equals(token.getCoveredText())) { + out.printf("%s%n", token.getCoveredText().toLowerCase(locale)); + } + first = false; + } + out.printf("%s%n", FLUSH_TOKEN); + } + out.flush(); + + first = true; + // Read sentence tags + tokenLoop: for (Token token : tokens) { + boolean skip = false; + analysisLoop: while ((lastIn = in.readLine()) != null) { + // Analysis line + if (lastIn.startsWith(">")) { + // Echo line, ignore. + continue analysisLoop; + } + + if (lastIn.contains(FLUSH_TOKEN)) { + // End of analysis + continue tokenLoop; + } + + if (lastIn.startsWith("no result for")) { + // if we're treating sentence-initial tokens specially, + // don't create an empty analysis just yet + if (first && lowercaseFirstWord) { + first = false; + continue analysisLoop; + } + + // No analysis for this token + MorphologicalFeatures morph = new MorphologicalFeatures(aJCas, + token.getBegin(), token.getEnd()); + morph.setValue(""); + morph.addToIndexes(); + + if (token.getMorph() == null) { + token.setMorph(morph); + } + + // We need to continue the inner loop because we still need to consume + // the flush marker. + continue analysisLoop; + } + + // Analysis line + if (!skip) { + MorphologicalFeatures morph = featuresParser + .parse(aJCas, token, lastIn); + + if (token.getMorph() == null) { + token.setMorph(morph); + } + } + + switch (mode) { + case FIRST: + // Go to next token after reading first analysis + skip = true; + break; + case ALL: + // We record all analyses + break; + } + } + } + + lastOut.setLength(0); + } + + success = true; + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + finally { + if (!success) { + getLogger().error("Sent before error: [" + lastOut + "]"); + getLogger().error("Last response before error: [" + lastIn + "]"); + } + if (proc != null) { + proc.destroy(); + } + } + } + + @Override + public void destroy() + { + runtimeProvider.uninstall(); + super.destroy(); + } +} diff --git a/dkpro-core-sfst-gpl/src/main/java/org/dkpro/core/sfst/package-info.java b/dkpro-core-sfst-gpl/src/main/java/org/dkpro/core/sfst/package-info.java new file mode 100644 index 0000000000..4a10d64900 --- /dev/null +++ b/dkpro-core-sfst-gpl/src/main/java/org/dkpro/core/sfst/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +/** + * SFST-based morphological analysis. + * + * @since 1.5.0 + */ +package org.dkpro.core.sfst; diff --git a/dkpro-core-sfst-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/sfst/lib/morph-default-variants.map b/dkpro-core-sfst-gpl/src/main/resources/org/dkpro/core/sfst/lib/morph-default-variants.map similarity index 100% rename from dkpro-core-sfst-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/sfst/lib/morph-default-variants.map rename to dkpro-core-sfst-gpl/src/main/resources/org/dkpro/core/sfst/lib/morph-default-variants.map diff --git a/dkpro-core-sfst-gpl/src/scripts/build.xml b/dkpro-core-sfst-gpl/src/scripts/build.xml index 0866033a96..5358c50827 100644 --- a/dkpro-core-sfst-gpl/src/scripts/build.xml +++ b/dkpro-core-sfst-gpl/src/scripts/build.xml @@ -1,6 +1,6 @@ <!-- - Copyright 2007-2017 + Copyright 2007-2019 Ubiquitous Knowledge Processing (UKP) Lab Technische Universität Darmstadt @@ -15,7 +15,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. --> <project basedir="../.." default="separate-jars"> diff --git a/dkpro-core-sfst-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/sfst/SfstAnnotatorTest.java b/dkpro-core-sfst-gpl/src/test/java/org/dkpro/core/sfst/SfstAnnotatorTest.java similarity index 98% rename from dkpro-core-sfst-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/sfst/SfstAnnotatorTest.java rename to dkpro-core-sfst-gpl/src/test/java/org/dkpro/core/sfst/SfstAnnotatorTest.java index bb9ebe8aae..f14c7f9da7 100644 --- a/dkpro-core-sfst-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/sfst/SfstAnnotatorTest.java +++ b/dkpro-core-sfst-gpl/src/test/java/org/dkpro/core/sfst/SfstAnnotatorTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,22 +14,25 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.sfst; +package org.dkpro.core.sfst; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertMorph; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetParser; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.sfst.SfstAnnotator; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class SfstAnnotatorTest { @@ -52,7 +55,8 @@ public void testTurkish() "[ 15, 20] - - - - - - - - Plur - 3 - - - - - - çalış (çal<v><D_yIS><n><3p>)", "[ 15, 20] - - - - - - - - Plur - 3 - - - - - - çalış (çalış<v><t_imp><3p>)", "[ 15, 20] - - - - - - - - Sing - 2 - - - - - - çalış (çalış<v><t_imp><2s>)", - "[ 21, 22] - - - - - - - - - - - - - - - - - . (.<pnct>)" }; + "[ 21, 22] - - - - - - - - - - - - - - - - - . (.<pnct>)" + }; String[] tags = { "<1p>", "<1s>", "<2p>", "<2s>", "<3p>", "<3s>", "<D_AcIK>", "<D_CA>", "<D_CAK>", "<D_CAgIz>", "<D_CI>", "<D_CIK>", "<D_IcIK>", "<D_IncI>", "<D_ca>", @@ -133,7 +137,8 @@ public void testGermanMorphisto() "[ 21, 32] - - Nom - - Neut - - Sing - - - - - - - - Krankenhaus (krank<ADJ><NN><SUFF>Haus<+NN><Neut><Nom><Sg>)", "[ 21, 32] - - Dat - - Neut - - Sing - - - - - - - - Krankenhaus (krank<ADJ><NN><SUFF>Haus<+NN><Neut><Dat><Sg>)", "[ 21, 32] - - Acc - - Neut - - Sing - - - - - - - - Krankenhaus (krank<ADJ><NN><SUFF>Haus<+NN><Neut><Akk><Sg>)", - "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<+IP><Norm>)" }; + "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<+IP><Norm>)" + }; String[] tags = { "<+ADJ>", "<+ADV>", "<+ART>", "<+CARD>", "<+CHAR>", "<+CIRCP>", "<+DEM>", "<+DEMPRO>", "<+INDEF>", "<+INTJ>", "<+IP>", "<+KONJ>", "<+NE>", "<+NN>", "<+ORD>", @@ -192,7 +197,8 @@ public void testGermanSmor() "[ 21, 32] - - Nom - - Neut - - Sing - - - - - - - - Krankenhaus (Krankenhaus<+NN><Neut><Nom><Sg>)", "[ 21, 32] - - Dat - - Neut - - Sing - - - - - - - - Krankenhaus (Krankenhaus<+NN><Neut><Dat><Sg>)", "[ 21, 32] - - Acc - - Neut - - Sing - - - - - - - - Krankenhaus (Krankenhaus<+NN><Neut><Acc><Sg>)", - "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<+PUNCT><Norm>)" }; + "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<+PUNCT><Norm>)" + }; String[] tags = { "<+ADJ>", "<+ADV>", "<+ART>", "<+CARD>", "<+CIRCP>", "<+CONJ>", "<+DEM>", "<+INDEF>", "<+INTJ>", "<+NN>", "<+NPROP>", "<+ORD>", "<+POSS>", "<+POSTP>", @@ -255,7 +261,8 @@ public void testGermanZmorgeOrig() "[ 33, 34] - - Dat - - Fem - - Sing - - - - - - - - . (.<^ABBR><+NN><Fem><Dat><Sg>)", "[ 33, 34] - - Gen - - Fem - - Sing - - - - - - - - . (.<^ABBR><+NN><Fem><Gen><Sg>)", "[ 33, 34] - - Nom - - Fem - - Sing - - - - - - - - . (.<^ABBR><+NN><Fem><Nom><Sg>)", - "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<+PUNCT><Norm>)" }; + "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<+PUNCT><Norm>)" + }; String[] tags = { "<+ADJ>", "<+ADV>", "<+ART>", "<+CARD>", "<+CONJ>", "<+DEM>", "<+INDEF>", "<+INTJ>", "<+NN>", "<+NPROP>", "<+ORD>", "<+POSS>", "<+POSTP>", "<+PPRO>", @@ -317,7 +324,8 @@ public void testGermanZmorgeNewlemma() "[ 33, 34] - - Dat - - Fem - - Sing - - - - - - - - . (.<+NN><Fem><Dat><Sg>)", "[ 33, 34] - - Gen - - Fem - - Sing - - - - - - - - . (.<+NN><Fem><Gen><Sg>)", "[ 33, 34] - - Nom - - Fem - - Sing - - - - - - - - . (.<+NN><Fem><Nom><Sg>)", - "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<+PUNCT><Norm>)" }; + "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<+PUNCT><Norm>)" + }; String[] tags = { "<#>", "<+ADJ>", "<+ADV>", "<+ART>", "<+CARD>", "<+CONJ>", "<+DEM>", "<+INDEF>", "<+INTJ>", "<+NN>", "<+NPROP>", "<+ORD>", "<+POSS>", "<+POSTP>", @@ -359,7 +367,8 @@ public void testItalian() "[ 14, 20] - - - - - - Ind - Sing - 3 - - - Pres - - lavora (lavorare<VER><ind><pres><3><s>)", "[ 21, 23] - - - - - - - - - - - - - - - - - in (in<PRE>)", "[ 24, 32] - - - - - - - - Sing - - - - - - - - ospedale (ospedale<NOUN><M><s>)", - "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<SENT>)" }; + "[ 33, 34] - - - - - - - - - - - - - - - - - . (.<SENT>)" + }; String[] tags = { "<1>", "<2>", "<3>", "<ABL>", "<ADJ>", "<ADV>", "<ART>", "<ARTPRE>", "<ASP>", "<AUX>", "<CARD>", "<CAU>", "<CE>", "<CHE>", "<CI>", "<CLI>", "<COM>", diff --git a/dkpro-core-sfst-gpl/src/test/resources/log4j.properties b/dkpro-core-sfst-gpl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-sfst-gpl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-sfst-gpl/src/test/resources/log4j2.xml b/dkpro-core-sfst-gpl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-sfst-gpl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-smile-asl/LICENSE.txt b/dkpro-core-smile-asl/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/dkpro-core-smile-asl/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dkpro-core-smile-asl/pom.xml b/dkpro-core-smile-asl/pom.xml new file mode 100644 index 0000000000..a6ad257d1e --- /dev/null +++ b/dkpro-core-smile-asl/pom.xml @@ -0,0 +1,125 @@ +<!-- + Licensed to the Technische Universität Darmstadt under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The Technische Universität Darmstadt + licenses this file to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> + <relativePath>../dkpro-core-asl</relativePath> + </parent> + <artifactId>dkpro-core-smile-asl</artifactId> + <packaging>jar</packaging> + <name>DKPro Core ASL - Smile (v ${smile.version}) (ASL)</name> + <url>https://dkpro.github.io/dkpro-core/</url> + <description>http://haifengl.github.io/smile</description> + <properties> + <smile.version>2.6.0</smile.version> + </properties> + <dependencies> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimaj-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + </dependency> + <dependency> + <groupId>com.github.haifengl</groupId> + <artifactId>smile-nlp</artifactId> + <version>${smile.version}</version> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-featurepath-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> + <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + <dependencyManagement> + <dependencies> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> + <type>pom</type> + <scope>import</scope> + </dependency> + </dependencies> + </dependencyManagement> + <build> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <configuration> + <usedDependencies> + <!-- Models not detected by byte-code analysis --> + <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-en-maxent</usedDependency> + </usedDependencies> + </configuration> + </plugin> + </plugins> + </pluginManagement> + </build> +</project> diff --git a/dkpro-core-smile-asl/src/main/java/org/dkpro/core/smile/SmileLancasterStemmer.java b/dkpro-core-smile-asl/src/main/java/org/dkpro/core/smile/SmileLancasterStemmer.java new file mode 100644 index 0000000000..62b208bf2a --- /dev/null +++ b/dkpro-core-smile-asl/src/main/java/org/dkpro/core/smile/SmileLancasterStemmer.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.smile; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collections; +import java.util.Locale; +import java.util.Set; + +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ResourceUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * This Paice/Husk Lancaster stemmer implementation only works with the English language so far. + */ +@Component(OperationType.STEMMER) +@ResourceMetaData(name = "Lancaster Stemmer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("en") +@TypeCapability( + inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, + outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem" }) +public class SmileLancasterStemmer + extends FeaturePathAnnotatorBase +{ + private static final String MESSAGE_DIGEST = SmileLancasterStemmer.class.getName() + "_Messages"; + + /** + * True if the stemmer will strip prefix such as kilo, micro, milli, intra, ultra, mega, nano, + * pico, pseudo. + */ + public static final String PARAM_STRIP_PREFIXES = "stripPrefix"; + @ConfigurationParameter(name = PARAM_STRIP_PREFIXES, mandatory = true, defaultValue = "false") + private boolean stripPrefix; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Specifies an URL that should resolve to a location from where to load custom rules. If the + * location starts with {@code classpath:} the location is interpreted as a classpath location, + * e.g. "classpath:my/path/to/the/rules". Otherwise it is tried as an URL, file and at last UIMA + * resource. + * + * @see ResourceUtils + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + private String modelLocation; + + /** + * Specifies the language supported by the stemming model. Default value is "en" (English). + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true, defaultValue = "en") + protected String language; + + /** + * The stemmer only has to be initialized once since it's used like a pure function with the + * given configuration parameters. + */ + private smile.nlp.stemmer.LancasterStemmer stemmer; + + @Override + protected Set<String> getDefaultPaths() + { + return Collections.singleton(Token.class.getName()); + } + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + language = language.toLowerCase(); + + if (modelLocation != null) { + try { + + URL url = ResourceUtils.resolveLocation(modelLocation, this, aContext); + stemmer = new smile.nlp.stemmer.LancasterStemmer(url.openStream(), stripPrefix); + } catch (MalformedURLException e) { + throw new ResourceInitializationException(e); + } catch (IOException e) { + throw new ResourceInitializationException(e); + } + } else { + stemmer = new smile.nlp.stemmer.LancasterStemmer(stripPrefix); + } + } + + @Override + protected void generateAnnotations(JCas jcas) + throws FeaturePathException, AnalysisEngineProcessException + { + // CAS is necessary to retrieve values + CAS currCAS = jcas.getCas(); + + // Try language set in CAS. + String lang = jcas.getDocumentLanguage(); + + if (StringUtils.isBlank(lang)) { + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null); + } + + lang = lang.toLowerCase(Locale.US); + + if (!language.equals(lang)) { // Only specified language is supported + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "unsupported_language_error", + new Object[] { lang }); + } + + + for (String path : paths) { + // Separate Typename and featurepath + String[] segments = path.split("/", 2); + String typeName = segments[0]; + + // Try to get the type from the typesystem of the CAS + Type t = CasUtil.getType(currCAS, typeName); + if (t == null) { + throw new IllegalStateException("Type [" + typeName + "] not found in type system"); + } + + // get an fpi object and initialize it + // initialize the FeaturePathInfo with the corresponding part + initializeFeaturePathInfoFrom(fp, segments); + + // get the annotations + AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t); + FSIterator<?> iterator = idx.iterator(); + + while (iterator.hasNext()) { + AnnotationFS fs = (AnnotationFS) iterator.next(); + + if (this.filterFeaturePath != null) { + // check annotation filter condition + if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { + createStemAnnotation(jcas, stemmer, fs); + } + } + else { // no annotation filter specified + createStemAnnotation(jcas, stemmer, fs); + } + } + } + + } + + private void createStemAnnotation(JCas jcas, smile.nlp.stemmer.LancasterStemmer stemmer, + AnnotationFS fs) + throws AnalysisEngineProcessException + { + // Check for blank text, it makes no sense to add a stem then (and raised an exception) + String value = fp.getValue(fs); + if (!StringUtils.isBlank(value)) { + Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); + + stemAnnot.setValue(stemmer.stem(value)); + stemAnnot.addToIndexes(jcas); + + // Try setting the "stem" feature on Tokens. + Feature feat = fs.getType().getFeatureByBaseName("stem"); + if (feat != null && feat.getRange() != null + && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { + fs.setFeatureValue(feat, stemAnnot); + } + } + } + +} diff --git a/dkpro-core-smile-asl/src/main/java/org/dkpro/core/smile/package-info.java b/dkpro-core-smile-asl/src/main/java/org/dkpro/core/smile/package-info.java new file mode 100644 index 0000000000..0af91a9160 --- /dev/null +++ b/dkpro-core-smile-asl/src/main/java/org/dkpro/core/smile/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Lancaster stemmer based on the <a href="http://haifengl.github.io/smile">Smile</a> machine learning package. + * + * @since 1.9.0 + */ +package org.dkpro.core.smile; diff --git a/dkpro-core-lancaster-asl/src/main/resources/org/dkpro/core/lancaster/LancasterStemmer_Messages.properties b/dkpro-core-smile-asl/src/main/resources/org/dkpro/core/lancaster/LancasterStemmer_Messages.properties similarity index 100% rename from dkpro-core-lancaster-asl/src/main/resources/org/dkpro/core/lancaster/LancasterStemmer_Messages.properties rename to dkpro-core-smile-asl/src/main/resources/org/dkpro/core/lancaster/LancasterStemmer_Messages.properties diff --git a/dkpro-core-smile-asl/src/test/java/org/dkpro/core/smile/SmileLancasterStemmerTest.java b/dkpro-core-smile-asl/src/test/java/org/dkpro/core/smile/SmileLancasterStemmerTest.java new file mode 100644 index 0000000000..4c37b9232f --- /dev/null +++ b/dkpro-core-smile-asl/src/test/java/org/dkpro/core/smile/SmileLancasterStemmerTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.smile; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; + +public class SmileLancasterStemmerTest +{ + @Test + public void testEnglish() + throws Exception + { + runTest("en", "computers Computers deliberately", + new String[] {"comput", "comput", "delib"} ); + + runTest("en", + "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible .", + new String[] { "we", "need", "a", "very", "comply", "exampl", "sent", "", "which", + "contain", "as", "many", "constitu", "and", "depend", "as", "poss", "" }); + } + + @Test + public void testEnglishWithDefaultRulesConfiguration() + throws Exception + { + runTest("en", "proceed", + new String[] {"process"} + ); + } + + @Test + public void testEnglishWithClassPathRulesConfiguration() + throws Exception + { + runTest("en", "proceed", + new String[] {"procee"}, // using default rules the expected would be process + SmileLancasterStemmer.PARAM_MODEL_LOCATION, "classpath:Lancaster_test_rules.txt" + ); + } + + @Test + public void testEnglishWithFilePathRulesConfiguration() + throws Exception + { + runTest("en", "proceed", + new String[] {"procee"}, // using default rules the expected would be process + SmileLancasterStemmer.PARAM_MODEL_LOCATION, "file:src/test/resources/Lancaster_test_rules.txt" + ); + } + + @Test + public void testAlternativeLanguageConfiguration() + throws Exception + { + runTest("dl", "proceed", + new String[] {"procee"}, // using default rules the expected would be process + SmileLancasterStemmer.PARAM_MODEL_LOCATION, "classpath:Lancaster_test_rules.txt", + SmileLancasterStemmer.PARAM_LANGUAGE, "dl" + ); + } + + @Test + public void testEnglishCaseInsensitive() + throws Exception + { + runTest("en", "EDUCATIONAL Educational educational", + new String[] {"educ", "educ", "educ"}); + } + + @Test + public void testEnglishCaseFiltered() + throws Exception + { + String[] stems = { "educ" }; + String[] pos = { "NNS", "JJ", "NN", "NNS" }; + + AnalysisEngineDescription aggregate = createEngineDescription( + createEngineDescription(OpenNlpPosTagger.class), + createEngineDescription(SmileLancasterStemmer.class, + SmileLancasterStemmer.PARAM_FILTER_FEATUREPATH, "pos/PosValue", + SmileLancasterStemmer.PARAM_FILTER_CONDITION_OPERATOR, "EQUALS", + SmileLancasterStemmer.PARAM_FILTER_CONDITION_VALUE, "JJ")); + + JCas result = TestRunner.runTest(aggregate, "en", "Babies educational sleep .s"); + + AssertAnnotations.assertStem(stems, select(result, Stem.class)); + AssertAnnotations.assertPOS(null, pos, select(result, POS.class)); + } + + private JCas runTest(String aLanguage, String aText, String[] aStems, Object... aParams) + throws Exception + { + JCas result = TestRunner.runTest( + createEngineDescription(SmileLancasterStemmer.class, aParams), aLanguage, aText); + + AssertAnnotations.assertStem(aStems, select(result, Stem.class)); + + return result; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-lancaster-asl/src/test/resources/Lancaster_test_rules.txt b/dkpro-core-smile-asl/src/test/resources/Lancaster_test_rules.txt similarity index 100% rename from dkpro-core-lancaster-asl/src/test/resources/Lancaster_test_rules.txt rename to dkpro-core-smile-asl/src/test/resources/Lancaster_test_rules.txt diff --git a/dkpro-core-smile-asl/src/test/resources/log4j2.xml b/dkpro-core-smile-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-smile-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-snowball-asl/pom.xml b/dkpro-core-snowball-asl/pom.xml index edc7e7d48e..e5bbaa9e0a 100644 --- a/dkpro-core-snowball-asl/pom.xml +++ b/dkpro-core-snowball-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.snowball-asl</artifactId> + <artifactId>dkpro-core-snowball-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Snowball (BSD)</name> + <url>https://dkpro.github.io/dkpro-core/</url> <description>http://snowball.tartarus.org/</description> <dependencies> <dependency> @@ -45,16 +46,20 @@ <artifactId>lucene-analyzers-common</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-featurepath-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -62,18 +67,18 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -85,9 +90,9 @@ <dependencyManagement> <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <type>pom</type> <scope>import</scope> </dependency> diff --git a/dkpro-core-snowball-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/snowball/SnowballStemmer.java b/dkpro-core-snowball-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/snowball/SnowballStemmer.java deleted file mode 100644 index 93edbc0e65..0000000000 --- a/dkpro-core-snowball-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/snowball/SnowballStemmer.java +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.snowball; - -import static org.apache.commons.lang3.StringUtils.isBlank; - -import java.util.Collections; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; -import java.util.Set; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.reflect.MethodUtils; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIterator; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.FeaturePath; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.cas.text.AnnotationIndex; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.tartarus.snowball.SnowballProgram; - -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * <p>UIMA wrapper for the Snowball stemmer. Annotation types to be stemmed can be configured by a - * {@link FeaturePath}.</p> - * <p>If you use this component in a pipeline which uses stop word removal, make sure that it - * runs after the stop word removal step, so only words that are no stop words are stemmed.</p> - * - * @see <a href="http://snowball.tartarus.org/">Snowball stemmer homepage</a> - * @see FeaturePathAnnotatorBase - * @since 1.1.0 - */ -@ResourceMetaData(name="Snowball Stemmer") -@LanguageCapability({ "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", "ro", "ru", "es", - "sv", "tr" }) -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem"}) -public class SnowballStemmer - extends FeaturePathAnnotatorBase -{ - private static final String MESSAGE_DIGEST = SnowballStemmer.class.getName()+"_Messages"; - private static final String SNOWBALL_PACKAGE = "org.tartarus.snowball.ext."; - - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Per default the stemmer runs in case-sensitive mode. If this parameter is enabled, tokens - * are lower-cased before being passed to the stemmer. - * - * <table border="1" cellspacing="0"> - * <caption>Examples</caption> - * <tr><th></th><th>false (default)</th><th>true</th></tr> - * <tr><td>EDUCATIONAL</td><td>EDUCATIONAL</td><td>educ</td></tr> - * <tr><td>Educational</td><td>Educat</td><td>educ</td></tr> - * <tr><td>educational</td><td>educ</td><td>educ</td></tr> - * </table> - */ - public static final String PARAM_LOWER_CASE = "lowerCase"; - @ConfigurationParameter(name = PARAM_LOWER_CASE, mandatory = false, defaultValue="false") - protected boolean lowerCase; - - public static final Map<String, String> languages = new HashMap<String, String>(); - static { - languages.put("da", "Danish"); - languages.put("nl", "Dutch"); - languages.put("en", "English"); - languages.put("fi", "Finnish"); - languages.put("fr", "French"); - languages.put("de", "German"); - languages.put("hu", "Hungarian"); - languages.put("it", "Italian"); - languages.put("no", "Norwegian"); - languages.put("pt", "Portuguese"); - languages.put("ro", "Romanian"); - languages.put("ru", "Russian"); - languages.put("es", "Spanish"); - languages.put("sv", "Swedish"); - languages.put("tr", "Turkish"); - } - - private SnowballProgram snowballProgram; - private String snowballProgramLanguage; - - @Override - protected Set<String> getDefaultPaths() - { - return Collections.singleton(Token.class.getName()); - } - - @Override - protected void generateAnnotations(JCas jcas) - throws AnalysisEngineProcessException, FeaturePathException - { - // CAS is necessary to retrieve values - CAS currCAS = jcas.getCas(); - - for (String path : paths) { - - // Separate Typename and featurepath - String[] segments = path.split("/", 2); - String typeName = segments[0]; - - // Try to get the type from the typesystem of the CAS - Type t = currCAS.getTypeSystem().getType(typeName); - if (t == null) { - throw new IllegalStateException("Type [" + typeName + "] not found in type system"); - } - - // get an fpi object and initialize it - // initialize the FeaturePathInfo with the corresponding part - initializeFeaturePathInfoFrom(fp, segments); - - // get the annotations - AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t); - FSIterator<?> iterator = idx.iterator(); - - while (iterator.hasNext()) { - AnnotationFS fs = (AnnotationFS) iterator.next(); - - try { - if (this.filterFeaturePath != null) { - // check annotation filter condition - if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { - createStemAnnotation(jcas, fs); - } - } - else { // no annotation filter specified - createStemAnnotation(jcas, fs); - } - } - catch (AnalysisEngineProcessException e) { - // TODO Auto-generated catch block - throw new IllegalStateException( - "error occured while creating a stem annotation", e); - } - } - } - } - - private SnowballProgram getSnowballProgram(JCas aCas) - throws AnalysisEngineProcessException - { - // Try language set on analysis engine - String lang = language; - if (isBlank(lang)) { - lang = aCas.getDocumentLanguage(); - } - - // Try language set in CAS. - if (isBlank(lang)) { - throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null); - } - - lang = lang.toLowerCase(Locale.US); - - if (!lang.equals(snowballProgramLanguage)) { - try { - String langPart = languages.get(lang); - if (langPart == null) { - throw new AnalysisEngineProcessException(MESSAGE_DIGEST, - "unsupported_language_error", new Object[] { lang }); - } - String snowballStemmerClass = SNOWBALL_PACKAGE + languages.get(lang) + "Stemmer"; - @SuppressWarnings("unchecked") - Class<SnowballProgram> stemClass = (Class<SnowballProgram>) Class - .forName(snowballStemmerClass); - snowballProgram = stemClass.newInstance(); - snowballProgramLanguage = lang; - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } - - return snowballProgram; - } - - /** - * Creates a Stem annotation with same begin and end as the AnnotationFS fs, the value is the - * stemmed value derived by applying the featurepath. - * - * @param jcas - * the JCas - * @param fs - * the AnnotationFS where the Stem annotation is created - * @throws AnalysisEngineProcessException - * if the {@code stem} method from the snowball stemmer cannot be invoked. - */ - private void createStemAnnotation(JCas jcas, AnnotationFS fs) - throws AnalysisEngineProcessException - { - // Check for blank text, it makes no sense to add a stem then (and raised an exception) - String value = fp.getValue(fs); - if (!StringUtils.isBlank(value)) { - if (lowerCase) { - // Fixme - should use locale/language defined in CAS. - value = value.toLowerCase(Locale.US); - } - - Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); - SnowballProgram programm = getSnowballProgram(jcas); - programm.setCurrent(value); - - try { - // The patched snowball from Lucene has this as a method on SnowballProgram - // but if we have some other snowball also in the classpath, Java might - // choose to use the other. So to be safe, we use a reflection here. - // -- REC, 2011-04-17 - MethodUtils.invokeMethod(programm, "stem", null); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - - stemAnnot.setValue(programm.getCurrent()); - stemAnnot.addToIndexes(jcas); - - // Try setting the "stem" feature on Tokens. - Feature feat = fs.getType().getFeatureByBaseName("stem"); - if (feat != null && feat.getRange() != null - && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { - fs.setFeatureValue(feat, stemAnnot); - } - } - } -} diff --git a/dkpro-core-snowball-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/snowball/package-info.java b/dkpro-core-snowball-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/snowball/package-info.java deleted file mode 100644 index 19de126762..0000000000 --- a/dkpro-core-snowball-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/snowball/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Stemmer based on the <a href="http://snowball.tartarus.org/">Snowball</a> stemmer package. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.snowball; diff --git a/dkpro-core-snowball-asl/src/main/java/org/dkpro/core/snowball/SnowballStemmer.java b/dkpro-core-snowball-asl/src/main/java/org/dkpro/core/snowball/SnowballStemmer.java new file mode 100644 index 0000000000..1a936d9c9c --- /dev/null +++ b/dkpro-core-snowball-asl/src/main/java/org/dkpro/core/snowball/SnowballStemmer.java @@ -0,0 +1,265 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.snowball; + +import static org.apache.commons.lang3.StringUtils.isBlank; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.reflect.MethodUtils; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeaturePath; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.tartarus.snowball.SnowballProgram; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * <p>UIMA wrapper for the Snowball stemmer. Annotation types to be stemmed can be configured by a + * {@link FeaturePath}.</p> + * <p>If you use this component in a pipeline which uses stop word removal, make sure that it + * runs after the stop word removal step, so only words that are no stop words are stemmed.</p> + * + * @see <a href="http://snowball.tartarus.org/">Snowball stemmer homepage</a> + * @see FeaturePathAnnotatorBase + * @since 1.1.0 + */ +@Component(OperationType.STEMMER) +@ResourceMetaData(name = "Snowball Stemmer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability({ "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", "ro", "ru", "es", + "sv", "tr" }) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem"}) +public class SnowballStemmer + extends FeaturePathAnnotatorBase +{ + private static final String MESSAGE_DIGEST = SnowballStemmer.class.getName() + "_Messages"; + private static final String SNOWBALL_PACKAGE = "org.tartarus.snowball.ext."; + + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Per default the stemmer runs in case-sensitive mode. If this parameter is enabled, tokens + * are lower-cased before being passed to the stemmer. + * + * <table border="1"> + * <caption>Examples</caption> + * <tr><th></th><th>false (default)</th><th>true</th></tr> + * <tr><td>EDUCATIONAL</td><td>EDUCATIONAL</td><td>educ</td></tr> + * <tr><td>Educational</td><td>Educat</td><td>educ</td></tr> + * <tr><td>educational</td><td>educ</td><td>educ</td></tr> + * </table> + */ + public static final String PARAM_LOWER_CASE = "lowerCase"; + @ConfigurationParameter(name = PARAM_LOWER_CASE, mandatory = false, defaultValue = "false") + protected boolean lowerCase; + + public static final Map<String, String> languages = new HashMap<String, String>(); + static { + languages.put("da", "Danish"); + languages.put("nl", "Dutch"); + languages.put("en", "English"); + languages.put("fi", "Finnish"); + languages.put("fr", "French"); + languages.put("de", "German"); + languages.put("hu", "Hungarian"); + languages.put("it", "Italian"); + languages.put("no", "Norwegian"); + languages.put("pt", "Portuguese"); + languages.put("ro", "Romanian"); + languages.put("ru", "Russian"); + languages.put("es", "Spanish"); + languages.put("sv", "Swedish"); + languages.put("tr", "Turkish"); + } + + private SnowballProgram snowballProgram; + private String snowballProgramLanguage; + + @Override + protected Set<String> getDefaultPaths() + { + return Collections.singleton(Token.class.getName()); + } + + @Override + protected void generateAnnotations(JCas jcas) + throws AnalysisEngineProcessException, FeaturePathException + { + // CAS is necessary to retrieve values + CAS currCAS = jcas.getCas(); + + for (String path : paths) { + + // Separate Typename and featurepath + String[] segments = path.split("/", 2); + String typeName = segments[0]; + + // Try to get the type from the typesystem of the CAS + Type t = currCAS.getTypeSystem().getType(typeName); + if (t == null) { + throw new IllegalStateException("Type [" + typeName + "] not found in type system"); + } + + // get an fpi object and initialize it + // initialize the FeaturePathInfo with the corresponding part + initializeFeaturePathInfoFrom(fp, segments); + + // get the annotations + AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t); + FSIterator<?> iterator = idx.iterator(); + + while (iterator.hasNext()) { + AnnotationFS fs = (AnnotationFS) iterator.next(); + + try { + if (this.filterFeaturePath != null) { + // check annotation filter condition + if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { + createStemAnnotation(jcas, fs); + } + } + else { // no annotation filter specified + createStemAnnotation(jcas, fs); + } + } + catch (AnalysisEngineProcessException e) { + // TODO Auto-generated catch block + throw new IllegalStateException( + "error occured while creating a stem annotation", e); + } + } + } + } + + private SnowballProgram getSnowballProgram(JCas aCas) + throws AnalysisEngineProcessException + { + // Try language set on analysis engine + String lang = language; + if (isBlank(lang)) { + lang = aCas.getDocumentLanguage(); + } + + // Try language set in CAS. + if (isBlank(lang)) { + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null); + } + + lang = lang.toLowerCase(Locale.US); + + if (!lang.equals(snowballProgramLanguage)) { + try { + String langPart = languages.get(lang); + if (langPart == null) { + throw new AnalysisEngineProcessException(MESSAGE_DIGEST, + "unsupported_language_error", new Object[] { lang }); + } + String snowballStemmerClass = SNOWBALL_PACKAGE + languages.get(lang) + "Stemmer"; + @SuppressWarnings("unchecked") + Class<SnowballProgram> stemClass = (Class<SnowballProgram>) Class + .forName(snowballStemmerClass); + snowballProgram = stemClass.newInstance(); + snowballProgramLanguage = lang; + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + + return snowballProgram; + } + + /** + * Creates a Stem annotation with same begin and end as the AnnotationFS fs, the value is the + * stemmed value derived by applying the featurepath. + * + * @param jcas + * the JCas + * @param fs + * the AnnotationFS where the Stem annotation is created + * @throws AnalysisEngineProcessException + * if the {@code stem} method from the snowball stemmer cannot be invoked. + */ + private void createStemAnnotation(JCas jcas, AnnotationFS fs) + throws AnalysisEngineProcessException + { + // Check for blank text, it makes no sense to add a stem then (and raised an exception) + String value = fp.getValue(fs); + if (!StringUtils.isBlank(value)) { + if (lowerCase) { + // Fixme - should use locale/language defined in CAS. + value = value.toLowerCase(Locale.US); + } + + Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); + SnowballProgram programm = getSnowballProgram(jcas); + programm.setCurrent(value); + + try { + // The patched snowball from Lucene has this as a method on SnowballProgram + // but if we have some other snowball also in the classpath, Java might + // choose to use the other. So to be safe, we use a reflection here. + // -- REC, 2011-04-17 + MethodUtils.invokeMethod(programm, "stem", null); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + + stemAnnot.setValue(programm.getCurrent()); + stemAnnot.addToIndexes(jcas); + + // Try setting the "stem" feature on Tokens. + Feature feat = fs.getType().getFeatureByBaseName("stem"); + if (feat != null && feat.getRange() != null + && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { + fs.setFeatureValue(feat, stemAnnot); + } + } + } +} diff --git a/dkpro-core-snowball-asl/src/main/java/org/dkpro/core/snowball/package-info.java b/dkpro-core-snowball-asl/src/main/java/org/dkpro/core/snowball/package-info.java new file mode 100644 index 0000000000..320ab8b60a --- /dev/null +++ b/dkpro-core-snowball-asl/src/main/java/org/dkpro/core/snowball/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Stemmer based on the <a href="http://snowball.tartarus.org/">Snowball</a> stemmer package. + * + * @since 1.1.0 + */ +package org.dkpro.core.snowball; diff --git a/dkpro-core-snowball-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/snowball/SnowballStemmer_Messages.properties b/dkpro-core-snowball-asl/src/main/resources/org/dkpro/core/snowball/SnowballStemmer_Messages.properties similarity index 100% rename from dkpro-core-snowball-asl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/snowball/SnowballStemmer_Messages.properties rename to dkpro-core-snowball-asl/src/main/resources/org/dkpro/core/snowball/SnowballStemmer_Messages.properties diff --git a/dkpro-core-snowball-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/snowball/SnowballStemmerTest.java b/dkpro-core-snowball-asl/src/test/java/org/dkpro/core/snowball/SnowballStemmerTest.java similarity index 89% rename from dkpro-core-snowball-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/snowball/SnowballStemmerTest.java rename to dkpro-core-snowball-asl/src/test/java/org/dkpro/core/snowball/SnowballStemmerTest.java index 55903e0b9e..db2a84f5a6 100644 --- a/dkpro-core-snowball-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/snowball/SnowballStemmerTest.java +++ b/dkpro-core-snowball-asl/src/test/java/org/dkpro/core/snowball/SnowballStemmerTest.java @@ -15,22 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.snowball; +package org.dkpro.core.snowball; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.opennlp.OpenNlpPosTagger; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class SnowballStemmerTest { @@ -51,9 +51,9 @@ public void testEnglish() runTest("en", "We need a very complicated example sentence , which " + "contains as many constituents and dependencies as possible .", - new String[] { "We", "need", "a", "veri", "complic", "exampl", "sentenc", ",", - "which", "contain", "as", "mani", "constitu", "and", "depend", "as", "possibl", - "." }); + new String[] { "We", "need", "a", "veri", "complic", "exampl", "sentenc", ",", + "which", "contain", "as", "mani", "constitu", "and", "depend", "as", + "possibl", "." }); } @Test diff --git a/dkpro-core-snowball-asl/src/test/resources/log4j.properties b/dkpro-core-snowball-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-snowball-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-snowball-asl/src/test/resources/log4j2.xml b/dkpro-core-snowball-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-snowball-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-stanfordnlp-gpl/.license-header.txt b/dkpro-core-stanfordnlp-gpl/.license-header.txt index ab08133a17..bbaf6e0e56 100644 --- a/dkpro-core-stanfordnlp-gpl/.license-header.txt +++ b/dkpro-core-stanfordnlp-gpl/.license-header.txt @@ -13,4 +13,4 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/. +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/dkpro-core-stanfordnlp-gpl/LICENSE.txt b/dkpro-core-stanfordnlp-gpl/LICENSE.txt index 6e22a15c3c..99ace43661 100644 --- a/dkpro-core-stanfordnlp-gpl/LICENSE.txt +++ b/dkpro-core-stanfordnlp-gpl/LICENSE.txt @@ -654,7 +654,7 @@ the "copyright" line and a pointer to where the full notice is found. GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. Also add information on how to contact you by electronic and paper mail. diff --git a/dkpro-core-stanfordnlp-gpl/pom.xml b/dkpro-core-stanfordnlp-gpl/pom.xml index 59acbf6e94..2e6b5d828f 100644 --- a/dkpro-core-stanfordnlp-gpl/pom.xml +++ b/dkpro-core-stanfordnlp-gpl/pom.xml @@ -1,6 +1,6 @@ <!-- - Copyright 2007-2017 + Copyright 2007-2019 Ubiquitous Knowledge Processing (UKP) Lab Technische Universität Darmstadt @@ -15,22 +15,23 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-gpl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-gpl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-gpl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl</artifactId> + <artifactId>dkpro-core-stanfordnlp-gpl</artifactId> <packaging>jar</packaging> - <name>DKPro Core GPL - Stanford CoreNLP Suite (v ${corenlp.version}) (GPL)</name> + <name>DKPro Core GPL - Stanford CoreNLP Suite (v ${corenlp.version}) - Classic API (GPL)</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> - <corenlp.version>3.8.0</corenlp.version> + <corenlp.version>3.9.2</corenlp.version> </properties> <dependencies> <dependency> @@ -52,6 +53,7 @@ <dependency> <groupId>com.googlecode.efficient-java-matrix-library</groupId> <artifactId>ejml</artifactId> + <!-- CAUTION: Upgrade to newer version breaks deserialization of coref models --> <version>0.23</version> </dependency> <dependency> @@ -60,56 +62,70 @@ <version>${corenlp.version}</version> </dependency> <!-- https://github.com/dkpro/dkpro-core/issues/779 - <dependency> + <dependency> <groupId>edu.stanford.nlp</groupId> <artifactId>stanford-corenlp</artifactId> <version>${corenlp.version}</version> <classifier>models</classifier> - </dependency> + </dependency> --> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.io-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-io-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.transform-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-transform-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.ner-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-ner-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.coref-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-coref-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.sentiment-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-sentiment-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <!-- + - Xerces 2.12.0 no longer draws in the XML APIs causing ClassNotFoundExceptions on JREs + - not including the XML APIs either. However, there is no direct code dependency on the + - artifact in this module. + --> + <groupId>xml-apis</groupId> + <artifactId>xml-apis</artifactId> + <scope>runtime</scope> </dependency> <dependency> <groupId>junit</groupId> @@ -117,8 +133,8 @@ <scope>test</scope> </dependency> <dependency> - <groupId>log4j</groupId> - <artifactId>log4j</artifactId> + <groupId>org.assertj</groupId> + <artifactId>assertj-core</artifactId> <scope>test</scope> </dependency> <dependency> @@ -127,38 +143,38 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.eval-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-eval-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.datasets-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-datasets-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.text-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-text-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.conll-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-conll-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.penntree-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-penntree-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.languagetool-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-languagetool-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -187,7 +203,7 @@ <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-coref-en-default</artifactId> - <version>20170609.1</version> + <version>20181005.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> @@ -196,13 +212,8 @@ </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-dewac_175m_600.crf</artifactId> - <version>20150130.1</version> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-hgc_175m_600.crf</artifactId> - <version>20161213.1</version> + <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-germeval2014.hgc_175m_600.crf</artifactId> + <version>20180227.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> @@ -267,7 +278,7 @@ <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-ar-sr</artifactId> - <version>20141031.1</version> + <version>20180227.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> @@ -346,19 +357,14 @@ </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-accurate</artifactId> - <version>20131112.1</version> + <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-default</artifactId> + <version>20180103.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-ud</artifactId> <version>20161213.1</version> </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-dewac</artifactId> - <version>20140827.1</version> - </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-fast</artifactId> @@ -377,17 +383,17 @@ <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-bidirectional-distsim</artifactId> - <version>20140616.1</version> + <version>20181002.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-left3words-distsim</artifactId> - <version>20140616.1</version> + <version>20181002.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-caseless-left3words-distsim</artifactId> - <version>20140827.0</version> + <version>20181002.0</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> @@ -402,7 +408,7 @@ <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-wsj-0-18-left3words-distsim</artifactId> - <version>20131112.1</version> + <version>20140616.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> @@ -439,11 +445,6 @@ <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-zh-distsim</artifactId> <version>20140616.1</version> </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-zh-nodistsim</artifactId> - <version>20140616.1</version> - </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-es-ancora.distsim.s512.crf</artifactId> @@ -534,11 +535,21 @@ <build> <pluginManagement> <plugins> + <plugin> + <groupId>com.mycila</groupId> + <artifactId>license-maven-plugin</artifactId> + <configuration> + <excludes combine.children="append"> + <exclude>src/main/java/org/dkpro/core/stanfordnlp/util/CasCopier.java</exclude> + </excludes> + </configuration> + </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-dependency-plugin</artifactId> <configuration> <usedDependencies> + <usedDependency>xml-apis:xml-apis</usedDependency> <!-- Models not detected by byte-code analysis --> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-coref-en-default</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-all.3class.distsim.crf</usedDependency> @@ -646,12 +657,7 @@ </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-dewac_175m_600.crf</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-hgc_175m_600.crf</artifactId> + <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-germeval2014.hgc_175m_600.crf</artifactId> <scope>test</scope> </dependency> <dependency> @@ -674,11 +680,6 @@ <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-hgc</artifactId> <scope>test</scope> </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-dewac</artifactId> - <scope>test</scope> - </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-caseless-left3words-distsim</artifactId> @@ -696,7 +697,7 @@ </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-accurate</artifactId> + <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-default</artifactId> <scope>test</scope> </dependency> <dependency> @@ -809,12 +810,9 @@ <configuration> <usedDependencies combine.children="append"> <!-- Models not detected by byte-code analysis --> - <!-- usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-coref-en-default</usedDependency --> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-nemgp</usedDependency> - <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-dewac_175m_600.crf</usedDependency> - <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-hgc_175m_600.crf</usedDependency> + <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-de-germeval2014.hgc_175m_600.crf</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-all.3class.caseless.distsim.crf</usedDependency> - <!-- usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-all.3class.distsim.crf</usedDependency --> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-conll.4class.caseless.distsim.crf</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-conll.4class.distsim.crf</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-ner-en-muc.7class.caseless.distsim.crf</usedDependency> @@ -834,7 +832,6 @@ <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-de-sr</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-factored</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-pcfg</usedDependency> - <!-- usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-rnn</usedDependency --> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-sr-beam</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-sr</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-en-wsj-rnn</usedDependency> @@ -846,13 +843,11 @@ <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-zh-factored</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-zh-sr</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-parser-zh-xinhua-factored</usedDependency> - <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-accurate</usedDependency> + <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-ar-default</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-ud</usedDependency> - <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-dewac</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-fast-caseless</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-fast</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-de-hgc</usedDependency> - <!-- usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-bidirectional-distsim</usedDependency --> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-caseless-left3words-distsim</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-fast.41</usedDependency> <usedDependency>de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.stanfordnlp-model-tagger-en-twitter-fast</usedDependency> diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTagger.java b/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTagger.java deleted file mode 100644 index b7806b33be..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTagger.java +++ /dev/null @@ -1,235 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils; -import edu.stanford.nlp.ling.HasWord; -import edu.stanford.nlp.ling.TaggedWord; -import edu.stanford.nlp.ling.Word; -import edu.stanford.nlp.process.PTBEscapingProcessor; -import edu.stanford.nlp.tagger.maxent.MaxentTagger; -import edu.stanford.nlp.util.StringUtils; - -/** - * Stanford Part-of-Speech tagger component. - */ -@ResourceMetaData(name="CoreNLP POS-Tagger (old API)") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = {"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) -public class StanfordPosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - /** - * Use this language instead of the document language to resolve the model and tag set mapping. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Variant of a model the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Location from which the model is read. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - @ResourceParameter(MimeTypes.APPLICATION_X_STANFORDNLP_TAGGER) - protected String modelLocation; - - /** - * Location of the mapping file for part-of-speech tags to UIMA types. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code false} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internStrings; - - /** - * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-). - * - * @see PTBEscapingProcessor - */ - public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping"; - @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true") - private boolean ptb3Escaping; - - /** - * List of extra token texts (usually single character strings) that should be treated like - * opening quotes and escaped accordingly before being sent to the parser. - */ - public static final String PARAM_QUOTE_BEGIN = "quoteBegin"; - @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false) - private List<String> quoteBegin; - - /** - * List of extra token texts (usually single character strings) that should be treated like - * closing quotes and escaped accordingly before being sent to the parser. - */ - public static final String PARAM_QUOTE_END = "quoteEnd"; - @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false) - private List<String> quoteEnd; - - /** - * Sentences with more tokens than the specified max amount will be ignored if this parameter - * is set to a value larger than zero. The default value zero will allow all sentences to be - * POS tagged. - */ - public static final String PARAM_MAX_SENTENCE_LENGTH = ComponentParameters.PARAM_MAX_SENTENCE_LENGTH;; - @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = false) - private int maxSentenceTokens = 0; - - private CasConfigurableProviderBase<MaxentTagger> modelProvider; - private MappingProvider posMappingProvider; - - private final PTBEscapingProcessor<HasWord, String, Word> escaper = new PTBEscapingProcessor<HasWord, String, Word>(); - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<MaxentTagger>(this, "stanfordnlp", "tagger") { - @Override - protected MaxentTagger produceResource(URL aUrl) throws IOException - { - String modelFile = aUrl.toString(); - - MaxentTagger tagger = new MaxentTagger(modelFile, - StringUtils.argsToProperties(new String[] { "-model", modelFile }), - false); - - SingletonTagset tags = new SingletonTagset(POS.class, getResourceMetaData() - .getProperty(("pos.tagset"))); - tags.addAll(tagger.tagSet()); - addTagset(tags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return tagger; - } - }; - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - posMappingProvider.setDefaultVariantsLocation( - "de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/tagger-default-variants.map"); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - posMappingProvider.configure(cas); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List<Token> tokens = selectCovered(aJCas, Token.class, sentence); - - if(maxSentenceTokens > 0 && tokens.size() > maxSentenceTokens) { - continue; - } - - List<HasWord> words = new ArrayList<HasWord>(tokens.size()); - for (Token t : tokens) { - words.add(new TaggedWord(t.getText())); - } - - if (ptb3Escaping) { - words = CoreNlpUtils.applyPtbEscaping(words, quoteBegin, quoteEnd); - } - - List<TaggedWord> taggedWords = modelProvider.getResource().tagSentence(words); - - int i = 0; - for (Token t : tokens) { - TaggedWord tt = taggedWords.get(i); - Type posTag = posMappingProvider.getTagType(tt.tag()); - POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); - posAnno.setStringValue(posTag.getFeatureByBaseName("PosValue"), - internStrings ? tt.tag().intern() : tt.tag()); - posAnno.addToIndexes(); - t.setPos(posAnno); - i++; - } - } - } -} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter.java b/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter.java deleted file mode 100644 index aa9acec5cf..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter.java +++ /dev/null @@ -1,372 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.StringReader; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.Messages; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.stanford.nlp.international.arabic.process.ArabicTokenizer; -import edu.stanford.nlp.international.french.process.FrenchTokenizer; -import edu.stanford.nlp.international.spanish.process.SpanishTokenizer; -import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.process.CoreLabelTokenFactory; -import edu.stanford.nlp.process.PTBEscapingProcessor; -import edu.stanford.nlp.process.PTBTokenizer; -import edu.stanford.nlp.process.Tokenizer; -import edu.stanford.nlp.process.WordToSentenceProcessor; -import edu.stanford.nlp.process.WordToSentenceProcessor.NewlineIsSentenceBreak; - -/** - * Stanford sentence splitter and tokenizer. - */ -@ResourceMetaData(name="CoreNLP Segmenter (old API)") -@LanguageCapability({"en", "es", "fr"}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class StanfordSegmenter - extends SegmenterBase -{ - private static final Map<String, InternalTokenizerFactory> tokenizerFactories; -// private static final Map<String, TreebankLanguagePack> languagePacks; - - static { - tokenizerFactories = new HashMap<String, InternalTokenizerFactory>(); -// tokenizerFactories.put("ar", new InternalArabicTokenizerFactory()); - tokenizerFactories.put("en", new InternalPTBTokenizerFactory()); - tokenizerFactories.put("es", new InternalSpanishTokenizerFactory()); - tokenizerFactories.put("fr", new InternalFrenchTokenizerFactory()); - // The Negra tokenizer is not really a full tokenizer. -// tokenizerFactories.put("de", new InternalNegraPennTokenizerFactory()); - // Not sure if those really work - don't know how to test -// tokenizerFactories.put("zh", new InternalCHTBTokenizerFactory()); - -// languagePacks = new HashMap<String, TreebankLanguagePack>(); -// languagePacks.put("en", new PennTreebankLanguagePack()); -// languagePacks.put("zh", new ChineseTreebankLanguagePack()); -// languagePacks.put("en", new ArabicTreebankLanguagePack()); -// languagePacks.put("de", new NegraPennLanguagePack()); - } - - /** - * If this component is not configured for a specific language and if the language stored in - * the document metadata is not supported, use the given language as a fallback. - */ - public static final String PARAM_LANGUAGE_FALLBACK = "languageFallback"; - @ConfigurationParameter(name = PARAM_LANGUAGE_FALLBACK, mandatory = false) - private String languageFallback; - - /** - * The set of boundary tokens. If null, use default. - * - * @see WordToSentenceProcessor#WordToSentenceProcessor - */ - public static final String PARAM_BOUNDARY_TOKEN_REGEX = "boundaryTokenRegex"; - @ConfigurationParameter(name = PARAM_BOUNDARY_TOKEN_REGEX, mandatory = false, defaultValue = WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX) - private String boundaryTokenRegex; - - /** - * This is a Set of String that are matched with .equals() which are allowed to be tacked onto - * the end of a sentence after a sentence boundary token, for example ")". - * - * @see WordToSentenceProcessor#DEFAULT_BOUNDARY_FOLLOWERS_REGEX - */ - public static final String PARAM_BOUNDARY_FOLLOWERS_REGEX = "boundaryFollowersRegex"; - @ConfigurationParameter(name = PARAM_BOUNDARY_FOLLOWERS_REGEX, mandatory = false, defaultValue = - WordToSentenceProcessor.DEFAULT_BOUNDARY_FOLLOWERS_REGEX) - private String boundaryFollowersRegex; - - /** - * These are elements like "p" or "sent", which will be wrapped into regex for approximate XML - * matching. They will be deleted in the output, and will always trigger a sentence boundary. - */ - public static final String PARAM_XML_BREAK_ELEMENTS_TO_DISCARD = "xmlBreakElementsToDiscard"; - @ConfigurationParameter(name = PARAM_XML_BREAK_ELEMENTS_TO_DISCARD, mandatory = false) - private Set<String> xmlBreakElementsToDiscard; - - /** - * The set of regex for sentence boundary tokens that should be discarded. - * - * @see WordToSentenceProcessor#DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD - */ - public static final String PARAM_BOUNDARIES_TO_DISCARD = "boundaryToDiscard"; - @ConfigurationParameter(name = PARAM_BOUNDARIES_TO_DISCARD, mandatory = false, defaultValue = { - "\n", "*NL*" }) - private Set<String> boundariesToDiscard; - - /** - * A regular expression for element names containing a sentence region. Only tokens in such - * elements will be included in sentences. The start and end tags themselves are not included in - * the sentence. - */ - public static final String PARAM_REGION_ELEMENT_REGEX = "regionElementRegex"; - @ConfigurationParameter(name = PARAM_REGION_ELEMENT_REGEX, mandatory = false) - private String regionElementRegex; - - /** - * Strategy for treating newlines as paragraph breaks. - */ - public static final String PARAM_NEWLINE_IS_SENTENCE_BREAK = "newlineIsSentenceBreak"; - @ConfigurationParameter(name = PARAM_NEWLINE_IS_SENTENCE_BREAK, mandatory = false, defaultValue = "TWO_CONSECUTIVE") - private NewlineIsSentenceBreak newlineIsSentenceBreak; - - /** - * The set of regex for sentence boundary tokens that should be discarded. - */ - public static final String PARAM_TOKEN_REGEXES_TO_DISCARD = "tokenRegexesToDiscard"; - @ConfigurationParameter(name = PARAM_TOKEN_REGEXES_TO_DISCARD, mandatory = false, defaultValue = {}) - private Set<String> tokenRegexesToDiscard; - - /** - * Whether to treat all input as one sentence. - */ - public static final String PARAM_IS_ONE_SENTENCE = "isOneSentence"; - @ConfigurationParameter(name = PARAM_IS_ONE_SENTENCE, mandatory = true, defaultValue = "false") - private boolean isOneSentence; - - /** - * Whether to generate empty sentences. - */ - public static final String PARAM_ALLOW_EMPTY_SENTENCES = "allowEmptySentences"; - @ConfigurationParameter(name = PARAM_ALLOW_EMPTY_SENTENCES, mandatory = true, defaultValue = "false") - private boolean allowEmptySentences; - - /** - * Additional options that should be passed to the tokenizers. The available options depend on - * the language-specific tokenizer being used. - */ - private String[] additionalOptions; - - @Override - protected void process(JCas aJCas, String aText, int aZoneBegin) - throws AnalysisEngineProcessException - { - List<Token> casTokens = null; - - // Use value from language parameter, document language or fallback language - whatever - // is available - String language = getLanguage(aJCas); - - if (isWriteToken()) { - casTokens = new ArrayList<Token>(); - final Tokenizer<?> tokenizer = getTokenizer(language, aText); - - List<?> tokens = tokenizer.tokenize(); - for (int i = 0; i < tokens.size(); i++) { - final Object token = tokens.get(i); - // System.out.println("Token class: "+token.getClass()); - CoreLabel l = (CoreLabel) token; - String t = l.word(); - int begin = l.get(CharacterOffsetBeginAnnotation.class); - int end = l.get(CharacterOffsetEndAnnotation.class); - - casTokens.add(createToken(aJCas, t, aZoneBegin + begin, aZoneBegin + end)); - } - } - - if (isWriteSentence()) { - if (casTokens == null) { - casTokens = selectCovered(aJCas, Token.class, aZoneBegin, - aZoneBegin + aText.length()); - } - - // Prepare the tokens for processing by WordToSentenceProcessor - List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>(); - Pattern nlPattern = Pattern.compile(".*(\r\n|\n|\r).*"); - Matcher nlMatcher = nlPattern.matcher(""); - int lastTokenEnd = 0; - for (Token token : casTokens) { - if (!NewlineIsSentenceBreak.NEVER.equals(newlineIsSentenceBreak)) { - // add newline as token for newlineIsSentenceBreak parameter - nlMatcher.reset(aJCas.getDocumentText().subSequence(lastTokenEnd, token.getBegin())); - if (nlMatcher.matches()) { - CoreLabel l = new CoreLabel(); - l.set(CharacterOffsetBeginAnnotation.class, lastTokenEnd + nlMatcher.start(1)); - l.set(CharacterOffsetEndAnnotation.class, lastTokenEnd + nlMatcher.end(1)); - l.setWord("\n"); - tokensInDocument.add(l); - } - } - lastTokenEnd = token.getEnd(); - // add regular token - CoreLabel l = new CoreLabel(); - l.set(CharacterOffsetBeginAnnotation.class, token.getBegin()); - l.set(CharacterOffsetEndAnnotation.class, token.getEnd()); - l.setWord(token.getText()); - tokensInDocument.add(l); - } - - // The sentence splitter (probably) requires the escaped text, so we prepare it here - PTBEscapingProcessor escaper = new PTBEscapingProcessor(); - escaper.apply(tokensInDocument); - - // Apply the WordToSentenceProcessor to find the sentence boundaries - WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>( - boundaryTokenRegex, boundaryFollowersRegex, boundariesToDiscard, - xmlBreakElementsToDiscard, regionElementRegex, newlineIsSentenceBreak, null, - tokenRegexesToDiscard, isOneSentence, allowEmptySentences); - - List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument); - for (List<CoreLabel> sentence : sentencesInDocument) { - int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class); - int end = sentence.get(sentence.size()-1).get(CharacterOffsetEndAnnotation.class); - - createSentence(aJCas, begin, end); - } - } - } - - private - Tokenizer getTokenizer( - final String aLanguage, - final String aText) throws AnalysisEngineProcessException - { - InternalTokenizerFactory tk = tokenizerFactories.get(aLanguage); - if (tk == null) { - if (languageFallback == null) { - throw new AnalysisEngineProcessException(Messages.BUNDLE, - Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { aLanguage }); - } - else { - tk = tokenizerFactories.get(languageFallback); - if (tk == null) { - throw new AnalysisEngineProcessException(Messages.BUNDLE, - Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { languageFallback }); - } - } - } - - - return tk.create(aText); - } - - private static - interface InternalTokenizerFactory - { - Tokenizer<?> create(String s); - } - - private static - class InternalPTBTokenizerFactory - implements InternalTokenizerFactory - { - @Override - public - Tokenizer<?> create( - final String s) - { -// TokenizerFactory<CoreLabel> f = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible,ptb3Escaping=false"); - return new PTBTokenizer<CoreLabel>(new StringReader(s),new CoreLabelTokenFactory(),"invertible"); - } - } - - // The InternalNegraPennTokenizer is not meant for German text. It - // is for parsing a particular corpus format. -// private static -// class InternalNegraPennTokenizerFactory -// implements InternalTokenizerFactory -// { -// @Override -// public -// Tokenizer<?> create( -// final String s) -// { -// return new NegraPennTokenizer(new StringReader(s)); -// } -// } - - private static - class InternalArabicTokenizerFactory - implements InternalTokenizerFactory - { - @Override - public - Tokenizer<?> create( - final String s) - { - return ArabicTokenizer.newArabicTokenizer(new StringReader(s), new Properties()); - } - } - - private static - class InternalFrenchTokenizerFactory - implements InternalTokenizerFactory - { - @Override - public - Tokenizer<?> create( - final String s) - { - return FrenchTokenizer.factory().getTokenizer(new StringReader(s), "tokenizeNLs=false"); - } - } - - private static - class InternalSpanishTokenizerFactory - implements InternalTokenizerFactory - { - @Override - public - Tokenizer<?> create( - final String s) - { - return SpanishTokenizer.factory(new CoreLabelTokenFactory(), null).getTokenizer( - new StringReader(s)); - } - } - - // While the stanford parser should come with a proper tokenizer for - // Chinese (because it can parse chinese text), this does not seem to be - // the right one or I am using it wrong. The associated test cases do not - // work. -// private static -// class InternalCHTBTokenizerFactory -// implements InternalTokenizerFactory -// { -// @Override -// public -// Tokenizer<?> create( -// final String s) -// { -// return new CHTBTokenizer(new StringReader(s)); -// } -// } -} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSentimentAnalyzer.java b/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSentimentAnalyzer.java deleted file mode 100644 index c538d4cc02..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSentimentAnalyzer.java +++ /dev/null @@ -1,101 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.sentiment.type.StanfordSentimentAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations; -import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.StanfordCoreNLP; -import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; -import edu.stanford.nlp.trees.Tree; -import edu.stanford.nlp.util.CoreMap; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.ejml.simple.SimpleMatrix; - -import java.util.Properties; - -/** - * Experimental wrapper for {@link edu.stanford.nlp.pipeline.SentimentAnnotator} which assigns - * 5 scores to each sentence. NOTE: Is very slow in the current state as it runs full Stanford - * pipeline and does not take into account any existing DKPro annotations. - */ -@ResourceMetaData(name="CoreNLP Sentiment Analyzer") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" - }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.sentiment.type.StanfordSentimentAnnotation" - } -) -public class StanfordSentimentAnalyzer - extends JCasAnnotator_ImplBase { - - private StanfordCoreNLP pipeline; - - @Override - public void initialize(UimaContext context) throws ResourceInitializationException { - super.initialize(context); - - Properties props = new Properties(); - props.setProperty("annotators", "tokenize, ssplit, pos, parse, sentiment"); - pipeline = new StanfordCoreNLP(props); - } - - @Override - public void process(JCas jCas) - throws AnalysisEngineProcessException { - for (Sentence sentenceDKPro : JCasUtil.select(jCas, Sentence.class)) { - String sentenceText = sentenceDKPro.getCoveredText(); - - Annotation annotation = pipeline.process(sentenceText); - - for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { - Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class); - SimpleMatrix sentimentCoefficients = RNNCoreAnnotations.getPredictions(tree); - - double veryNegative = sentimentCoefficients.get(0); - double negative = sentimentCoefficients.get(1); - double neutral = sentimentCoefficients.get(2); - double positive = sentimentCoefficients.get(3); - double veryPositive = sentimentCoefficients.get(4); - - StanfordSentimentAnnotation sentimentAnnotation = new StanfordSentimentAnnotation(jCas); - sentimentAnnotation.setBegin(sentenceDKPro.getBegin()); - sentimentAnnotation.setEnd(sentenceDKPro.getEnd()); - sentimentAnnotation.setVeryNegative(veryNegative); - sentimentAnnotation.setNegative(negative); - sentimentAnnotation.setNeutral(neutral); - sentimentAnnotation.setPositive(positive); - sentimentAnnotation.setVeryPositive(veryPositive); - sentimentAnnotation.addToIndexes(); - } - } - } -} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/internal/TokenKey.java b/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/internal/TokenKey.java deleted file mode 100644 index b818b44a85..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/internal/TokenKey.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.stanford.nlp.util.TypesafeMap.Key; - -public class TokenKey - implements Key<Token> -{ -}; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/package-info.java b/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/package-info.java deleted file mode 100644 index e82269d2e1..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -/** - * Integration of NLP components from the <a href="http://nlp.stanford.edu/software/corenlp.shtml"> - * Stanford CoreNLP suite</a>. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/TreeWithTokens.java b/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/TreeWithTokens.java deleted file mode 100644 index fbf4bdd0a0..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/TreeWithTokens.java +++ /dev/null @@ -1,188 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util; - -import java.util.Iterator; -import java.util.List; - -import org.apache.uima.jcas.tcas.Annotation; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.trees.Tree; -import edu.stanford.nlp.util.IntPair; - -/** - * A wrapper object that manages a tree object together with the respective - * Token annotations for the leafs of the tree. This is needed for being able to - * map the leaves of the tree to words in a CAS. - * - * Trees in TreeWithToken-object are always converted to trees with - * CoreLabel-type labels. - * - * - */ -public class TreeWithTokens -{ - private Tree tree; - private List<Token> tokens; - - public TreeWithTokens(Tree tree, List<Token> tokens) - { - setTree(tree); - setTokens(tokens); - } - - public void setTree(Tree tree) - { - if (!(tree.label() instanceof CoreLabel)) { - tree = tree.deepCopy(tree.treeFactory(), CoreLabel.factory()); - } - - tree.indexLeaves(); - - this.tree = tree; - } - - public Tree getTree() - { - return tree; - } - - public void setTokens(List<Token> tokens) - { - this.tokens = tokens; - } - - public List<Token> getTokens() - { - return tokens; - } - - /** - * Returns the span of the documentText that is covered by this - * TreeWithTokens. - * - * @return an IntPair describing the span of the documentText that is - * covered by this tree - */ - public IntPair getSpan() - { - return getSpan(getTree()); - } - - /** - * Returns the span of the documentText that is covered by a given subtree, - * that has to be taken directly from the original tree. - * <p> - * NOTE: Possibly we could make this more general to also support general - * trees that are contained in the original tree, but are not directly taken - * from it (i.e. with different leaf-numbering). In order to do so, we would - * have to make a Tregex-Matching of the given subtree in the original tree - * to identify the positition of the given subtree. - * <p> - * This could be achieved by translating the subtree into a Tregex pattern - * and then matching this pattern against the original tree. - * - * @param subtree - * a subtree of this TreeWithTokens (it has to be a real - * subtree(!), because index numbering of subtree has to fit to - * the numbering of the original tree) - * @return an IntPair describing the span of the documentText that is - * covered by this tree - */ - public IntPair getSpan(Tree subtree) - { - // TODO check if subtree is a real subtree of tokenTree.getTree() - - int nodeIndexLeft = ((CoreLabel) getLeftmostLeaf(subtree).label()) - .index(); - int nodeIndexRight = ((CoreLabel) getRightmostLeaf(subtree).label()) - .index(); - int a = tokens.get(nodeIndexLeft - 1).getBegin(); - int b = tokens.get(nodeIndexRight - 1).getEnd(); - - return new IntPair(a, b); - } - - private Tree getLeftmostLeaf(Tree t) - { - if (t.isLeaf()) { - return t; - } - else { - return getLeftmostLeaf(t.firstChild()); - } - } - - private Tree getRightmostLeaf(Tree t) - { - if (t.isLeaf()) { - return t; - } - else { - return getRightmostLeaf(t.lastChild()); - } - } - - /** - * Finds the best-fitting node in the tree for a given annotation. - * - * The best-fitting node for an annotation is the deepest node in the tree - * that still completely contains the span of the given annotation. - * - * TODO Could be done more efficiently, I think. In a recursive method, for - * example, recursion could be stopped as soon as overlap becomes -1 - * - * @param anno - * the annotation to find a best fit for - * - * @return the node of the tree that is the best fit for <code>anno</code> - */ - public Tree getBestFit(Annotation anno) - { - Tree curBestFit = null; - int curBestOverlap = Integer.MAX_VALUE; - - Iterator<Tree> treeIterator = getTree().iterator(); - while (treeIterator.hasNext()) { - Tree curTree = treeIterator.next(); - IntPair span = getSpan(curTree); - - // calc overlap: if annotation not completely contained in span of - // subtree, overlap will be -1, otherwise it will be >0 - // Our goal is to find the node with minimal positive overlap - int overlap = -1; - int leftBorder = anno.getBegin() - span.getSource(); - int rightBorder = span.getTarget() - anno.getEnd(); - if (!(leftBorder < 0) && !(rightBorder < 0)) { - overlap = leftBorder + rightBorder; - } - - // determine whether node is better than the temporary best fit - if ((overlap > -1) && overlap < curBestOverlap) { - curBestFit = curTree; - curBestOverlap = overlap; - } - } - - return curBestFit; - } - -} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/UIMAAnnotations.java b/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/UIMAAnnotations.java deleted file mode 100644 index 1bd9fc6c38..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/UIMAAnnotations.java +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util; - -import java.util.Collection; - -import org.apache.uima.jcas.tcas.Annotation; - -import edu.stanford.nlp.ling.CoreAnnotation; - -/** - * Annotations of the type "UIMAAnnotations" should contain a Collection - * of org.apache.uima.jcas.tcas.Annotation objects. - * - */ -@SuppressWarnings("rawtypes") -public class UIMAAnnotations - implements CoreAnnotation<Collection<Annotation>> -{ - @SuppressWarnings("unchecked") - @Override - public Class<Collection<Annotation>> getType() - { - return (Class<Collection<Annotation>>) (Class) Collection.class; - } -} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/package-info.java b/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/package-info.java deleted file mode 100644 index 407ac579d0..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -/** - * Utility classes shared between different components. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordCoreferenceResolver.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordCoreferenceResolver.java similarity index 90% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordCoreferenceResolver.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordCoreferenceResolver.java index 8bba564dd3..8fbd63fd2e 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordCoreferenceResolver.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordCoreferenceResolver.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static java.util.Arrays.asList; import static org.apache.uima.fit.util.JCasUtil.select; @@ -38,18 +38,18 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.stanfordnlp.internal.RootKey; +import org.dkpro.core.stanfordnlp.internal.TokenKey; +import org.dkpro.core.stanfordnlp.util.CoreNlpUtils; +import org.dkpro.core.stanfordnlp.util.TreeUtils; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.RootKey; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.TokenKey; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.TreeUtils; import edu.stanford.nlp.dcoref.Constants; import edu.stanford.nlp.dcoref.CorefChain; import edu.stanford.nlp.dcoref.CorefChain.CorefMention; @@ -78,10 +78,15 @@ import edu.stanford.nlp.trees.TreeFactory; import edu.stanford.nlp.trees.TreebankLanguagePack; import edu.stanford.nlp.util.CoreMap; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** */ -@ResourceMetaData(name="CoreNLP Coreference Resolver (old API)") +@Component(OperationType.CO_REFERENCE_ANNOTATOR) +@ResourceMetaData(name = "CoreNLP Coreference Resolver (old API)") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", @@ -100,7 +105,8 @@ public class StanfordCoreferenceResolver * DCoRef parameter: Sieve passes - each class is defined in dcoref/sievepasses/. */ public static final String PARAM_SIEVES = "sieves"; - @ConfigurationParameter(name = PARAM_SIEVES, defaultValue = Constants.SIEVEPASSES, mandatory = true) + @ConfigurationParameter(name = PARAM_SIEVES, defaultValue = Constants.SIEVEPASSES, + mandatory = true) private String sieves; /** @@ -144,8 +150,10 @@ public void initialize(UimaContext aContext) { setContextObject(StanfordCoreferenceResolver.this); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "${groupId}.stanfordnlp-model-coref-${language}-${variant}"); - setDefault(LOCATION, "classpath:/${package}/lib/coref/${language}/${variant}/countries"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/coref/${language}/${variant}/countries"); setDefault(VARIANT, "default"); // setOverride(LOCATION, modelLocation); @@ -157,7 +165,7 @@ public void initialize(UimaContext aContext) protected Coreferencer produceResource(URL aUrl) throws IOException { - String base = FilenameUtils.getFullPathNoEndSeparator(aUrl.toString())+"/"; + String base = FilenameUtils.getFullPathNoEndSeparator(aUrl.toString()) + "/"; Properties props = new Properties(); props.setProperty(Constants.SIEVES_PROP, sieves); @@ -176,7 +184,8 @@ protected Coreferencer produceResource(URL aUrl) props.setProperty(Constants.DEMONYM_PROP, base + "demonyms.txt"); // props.getProperty(Constants.ANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_ANIMATE), props.setProperty(Constants.ANIMATE_PROP, base + "animate.unigrams.txt"); - // props.getProperty(Constants.INANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_INANIMATE), + // props.getProperty(Constants.INANIMATE_PROP, + // DefaultPaths.DEFAULT_DCOREF_INANIMATE), props.setProperty(Constants.INANIMATE_PROP, base + "inanimate.unigrams.txt"); // props.getProperty(Constants.MALE_PROP), props.setProperty(Constants.MALE_PROP, base + "male.unigrams.txt"); @@ -190,24 +199,29 @@ protected Coreferencer produceResource(URL aUrl) props.setProperty(Constants.SINGULAR_PROP, base + "singular.unigrams.txt"); // props.getProperty(Constants.STATES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES), props.setProperty(Constants.STATES_PROP, base + "state-abbreviations.txt"); - //props.getProperty(Constants.GENDER_NUMBER_PROP, DefaultPaths.DEFAULT_DCOREF_GENDER_NUMBER); + // props.getProperty(Constants.GENDER_NUMBER_PROP, + // DefaultPaths.DEFAULT_DCOREF_GENDER_NUMBER); props.setProperty(Constants.GENDER_NUMBER_PROP, base + "gender.map.ser.gz"); - // props.getProperty(Constants.COUNTRIES_PROP, DefaultPaths.DEFAULT_DCOREF_COUNTRIES), + // props.getProperty(Constants.COUNTRIES_PROP, + // DefaultPaths.DEFAULT_DCOREF_COUNTRIES), props.setProperty(Constants.COUNTRIES_PROP, base + "countries"); - // props.getProperty(Constants.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES), + // props.getProperty(Constants.STATES_PROVINCES_PROP, + // DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES), props.setProperty(Constants.STATES_PROVINCES_PROP, base + "statesandprovinces"); // The following properties are only relevant if the "CorefDictionaryMatch" sieve // is enabled. // PropertiesUtils.getStringArray(props, Constants.DICT_LIST_PROP, - // new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2, + // new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, + // DefaultPaths.DEFAULT_DCOREF_DICT2, // DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}), props.put(Constants.DICT_LIST_PROP, '[' + base + "coref.dict1.tsv" + ',' + base + "coref.dict2.tsv" + ',' + base + "coref.dict3.tsv" + ',' + base + "coref.dict4.tsv" + ']'); // props.getProperty(Constants.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1), props.put(Constants.DICT_PMI_PROP, base + "coref.dict1.tsv"); - // props.getProperty(Constants.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES)); + // props.getProperty(Constants.SIGNATURES_PROP, + // DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES)); props.put(Constants.SIGNATURES_PROP, base + "ne.signatures.txt"); try { @@ -281,7 +295,7 @@ public Tree newTreeNode(String aParent, List<Tree> aChildren) SemanticGraph deps = sentence .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); for (IndexedWord vertex : deps.vertexSet()) { - vertex.setWord(vertex.value()); + vertex.setWord(vertex.value()); } // These lines are necessary since CoreNLP 3.5.2 - without them the mentions lack diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordDependencyConverter.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordDependencyConverter.java similarity index 90% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordDependencyConverter.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordDependencyConverter.java index bc5f3f0824..6881594cc2 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordDependencyConverter.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordDependencyConverter.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -33,19 +33,19 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.stanfordnlp.StanfordParser.DependenciesMode; +import org.dkpro.core.stanfordnlp.internal.RootKey; +import org.dkpro.core.stanfordnlp.internal.TokenKey; +import org.dkpro.core.stanfordnlp.util.CoreNlpUtils; +import org.dkpro.core.stanfordnlp.util.StanfordAnnotator; +import org.dkpro.core.stanfordnlp.util.TreeUtils; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser.DependenciesMode; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.RootKey; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.TokenKey; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.StanfordAnnotator; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.TreeUtils; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.trees.GrammaticalStructure; @@ -58,11 +58,16 @@ import edu.stanford.nlp.trees.Trees; import edu.stanford.nlp.trees.TypedDependency; import edu.stanford.nlp.util.CoreMap; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Converts a constituency structure into a dependency structure. */ -@ResourceMetaData(name="CoreNLP Dependency Converter") +@Component(OperationType.DEPENDENCY_CONVERTER) +@ResourceMetaData(name = "CoreNLP Dependency Converter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -85,7 +90,6 @@ public class StanfordDependencyConverter /** * Sets the kind of dependencies being created. * - * <p>Default: {@link DependenciesMode#COLLAPSED TREE} * @see DependenciesMode */ public static final String PARAM_MODE = "mode"; @@ -241,7 +245,8 @@ protected void doCreateDependencyTags(JCas aJCas, TreebankLanguagePack aLP, Tree dep.addToIndexes(); } - dep.setFlavor(currTypedDep.extra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); + dep.setFlavor( + currTypedDep.extra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); } } diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordLemmatizer.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordLemmatizer.java similarity index 90% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordLemmatizer.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordLemmatizer.java index 2244aa82cf..05925aa568 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordLemmatizer.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordLemmatizer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -24,7 +24,6 @@ import java.util.ArrayList; import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; @@ -34,14 +33,14 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.Messages; +import org.dkpro.core.stanfordnlp.internal.TokenKey; +import org.dkpro.core.stanfordnlp.util.CoreNlpUtils; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.Messages; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.TokenKey; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils; -import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.IndexAnnotation; @@ -50,11 +49,15 @@ import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.MorphaAnnotator; import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.process.PTBEscapingProcessor; import edu.stanford.nlp.util.CoreMap; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Stanford Lemmatizer component. The Stanford Morphology-class computes the base form of English @@ -66,7 +69,9 @@ * * <p>This only works for ENGLISH.</p> */ -@ResourceMetaData(name="CoreNLP Lemmatizer (old API)") +@Component(OperationType.LEMMATIZER) +@ResourceMetaData(name = "CoreNLP Lemmatizer (old API)") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @LanguageCapability("en") @TypeCapability( inputs = { @@ -119,7 +124,8 @@ public void process(JCas aJCas) { if (!"en".equals(aJCas.getDocumentLanguage())) { throw new AnalysisEngineProcessException(Messages.BUNDLE, - Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { aJCas.getDocumentLanguage() }); + Messages.ERR_UNSUPPORTED_LANGUAGE, + new String[] { aJCas.getDocumentLanguage() }); } Annotation document = new Annotation(aJCas.getDocumentText()); diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizer.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizer.java similarity index 81% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizer.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizer.java index 8ab5550c02..fdd1745625 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizer.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,14 +14,15 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createNerMappingProvider; import java.io.IOException; import java.io.InputStream; @@ -41,28 +42,33 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.stanfordnlp.util.CoreNlpUtils; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils; import edu.stanford.nlp.ie.AbstractSequenceClassifier; import edu.stanford.nlp.ie.crf.CRFClassifier; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.process.PTBEscapingProcessor; import edu.stanford.nlp.util.CoreMap; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Stanford Named Entity Recognizer component. */ -@ResourceMetaData(name="CoreNLP Named Entity Recogizer (old API)") +@Component(OperationType.NAMED_ENTITITY_RECOGNIZER) +@ResourceMetaData(name = "CoreNLP Named Entity Recogizer (old API)") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -94,6 +100,20 @@ public class StanfordNamedEntityRecognizer @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -105,7 +125,8 @@ public class StanfordNamedEntityRecognizer /** * Location of the mapping file for named entity tags to UIMA types. */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; + public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = + ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) protected String mappingLocation; @@ -145,16 +166,8 @@ public void initialize(UimaContext aContext) modelProvider = new StanfordNlpNamedEntityRecognizerModelProvider(this); - mappingProvider = new MappingProvider(); - mappingProvider - .setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-default-variants.map"); - mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/stanfordnlp/lib/ner-${language}-${variant}.map"); - mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName()); - mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation); - mappingProvider.setOverride(MappingProvider.LANGUAGE, language); - mappingProvider.setOverride(MappingProvider.VARIANT, variant); - mappingProvider.addTagMappingImport("ner", modelProvider); + mappingProvider = createNerMappingProvider(this, mappingLocation, language, variant, + modelProvider); } @Override @@ -191,7 +204,8 @@ public void process(JCas aJCas) if ("O".equals(tokenType) || !tokenType.equals(entityType)) { if (entityType != null) { Type type = mappingProvider.getTagType(entityType); - NamedEntity neAnno = (NamedEntity) cas.createAnnotation(type, entityBegin, entityEnd); + NamedEntity neAnno = (NamedEntity) cas.createAnnotation(type, entityBegin, + entityEnd); neAnno.setValue(entityType); neAnno.addToIndexes(); entityType = null; @@ -210,7 +224,8 @@ public void process(JCas aJCas) // If the last entity is still open, then close it if (entityType != null) { Type type = mappingProvider.getTagType(entityType); - NamedEntity neAnno = (NamedEntity) cas.createAnnotation(type, entityBegin, entityEnd); + NamedEntity neAnno = (NamedEntity) cas.createAnnotation(type, entityBegin, + entityEnd); neAnno.setValue(entityType); neAnno.addToIndexes(); } @@ -223,7 +238,7 @@ private class StanfordNlpNamedEntityRecognizerModelProvider public StanfordNlpNamedEntityRecognizerModelProvider(Object aObject) { super(aObject, "stanfordnlp", "ner"); - // setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/stanfordnlp"); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-${language}-${variant}.properties"); } @@ -242,8 +257,8 @@ protected AbstractSequenceClassifier<CoreMap> produceResource(URL aUrl) is = new GZIPInputStream(is); } - AbstractSequenceClassifier<CoreMap> classifier = (AbstractSequenceClassifier<CoreMap>) - CRFClassifier.getClassifier(is); + AbstractSequenceClassifier<CoreMap> classifier = + (AbstractSequenceClassifier<CoreMap>) CRFClassifier.getClassifier(is); String tagsetName = metadata.getProperty("ner.tagset"); if (tagsetName == null) { @@ -252,7 +267,7 @@ protected AbstractSequenceClassifier<CoreMap> produceResource(URL aUrl) SingletonTagset tsdp = new SingletonTagset(NamedEntity.class, tagsetName); for (String tag : classifier.classIndex) { - String mapped = metadata.getProperty("ner.tag.map."+tag); + String mapped = metadata.getProperty("ner.tag.map." + tag); String finalTag = mapped != null ? mapped : tag; // "O" has a special meaning in the CRF-NER: not a named entity diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainer.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainer.java similarity index 77% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainer.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainer.java index 44b3a88c7b..64432402fb 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainer.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,31 +14,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; -import de.tudarmstadt.ukp.dkpro.core.api.io.IobEncoder; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.stanford.nlp.ie.crf.CRFClassifier; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.sequences.SeqClassifierFlags; -import org.apache.commons.io.IOUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasConsumer_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; +import static org.apache.uima.fit.util.JCasUtil.indexCovered; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.io.File; import java.io.FileInputStream; @@ -57,15 +39,49 @@ import java.util.Properties; import java.util.regex.Pattern; -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasConsumer_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.io.IobEncoder; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.stanford.nlp.ie.crf.CRFClassifier; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Train a NER model for Stanford CoreNLP Named Entity Recognizer. */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) @MimeTypeCapability(MimeTypes.APPLICATION_X_STANFORDNLP_NER) +@Parameters( + exclude = { + StanfordNamedEntityRecognizerTrainer.PARAM_TARGET_LOCATION }) @ResourceMetaData(name = "CoreNLP Named Entity Recognizer Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) public class StanfordNamedEntityRecognizerTrainer extends JCasConsumer_ImplBase { @@ -86,18 +102,18 @@ public class StanfordNamedEntityRecognizerTrainer private File propertiesFile; /** - * Regex to filter the {@link de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity#getValue() named entity} by - * type. + * Regex to filter the {@link de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity#getValue() + * named entity} by type. */ - public static final String PARAM_ACCEPTED_TAGS_REGEX = ComponentParameters.PARAM_ACCEPTED_TAGS_REGEX; + public static final String PARAM_ACCEPTED_TAGS_REGEX = + ComponentParameters.PARAM_ACCEPTED_TAGS_REGEX; @ConfigurationParameter(name = PARAM_ACCEPTED_TAGS_REGEX, mandatory = false) protected String acceptedTagsRegex; - /* - * Label set to use for training. Options: IOB1, IOB2, IOE1, IOE2, SBIEO, IO, BIO, BILOU, - * noprefix - * - * Default: noprefix + /** + * Label set to use for training. + * <p> + * Options: IOB1, IOB2, IOE1, IOE2, SBIEO, IO, BIO, BILOU, noprefix */ public static final String PARAM_LABEL_SET = "entitySubClassification"; @ConfigurationParameter(name = PARAM_LABEL_SET, mandatory = false, defaultValue = "noprefix") @@ -105,7 +121,7 @@ public class StanfordNamedEntityRecognizerTrainer /** * Flag to keep the label set specified by PARAM_LABEL_SET. If set to false, representation is - * mapped to IOB1 on output. Default: true + * mapped to IOB1 on output. */ public static final String PARAM_RETAIN_CLASS = "retainClassification"; @ConfigurationParameter(name = PARAM_RETAIN_CLASS, mandatory = false, defaultValue = "true") @@ -121,8 +137,8 @@ public void initialize(UimaContext aContext) } @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException { + public void process(JCas aJCas) throws AnalysisEngineProcessException + { if (tempData == null) { try { tempData = File.createTempFile("dkpro-stanford-ner-trainer", ".tsv"); @@ -130,7 +146,8 @@ public void process(JCas aJCas) .info(String.format("Created temp file: %s", tempData.getAbsolutePath())); out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(tempData), StandardCharsets.UTF_8)); - } catch (IOException e) { + } + catch (IOException e) { throw new AnalysisEngineProcessException(e); } } @@ -141,14 +158,15 @@ public void process(JCas aJCas) /* * Taken from Conll2003Writer and modified for the task at hand. */ - private void convert(JCas aJCas, PrintWriter aOut) { + private void convert(JCas aJCas, PrintWriter aOut) + { Type neType = JCasUtil.getType(aJCas, NamedEntity.class); Feature neValue = neType.getFeatureByBaseName("value"); // Named Entities IobEncoder neEncoder = new IobEncoder(aJCas.getCas(), neType, neValue, false); - Map<Sentence, Collection<NamedEntity>> idx = getNamedEntityIndex(aJCas); + Map<Sentence, List<NamedEntity>> idx = getNamedEntityIndex(aJCas); Collection<NamedEntity> coveredNEs; for (Sentence sentence : select(aJCas, Sentence.class)) { @@ -183,15 +201,17 @@ private void convert(JCas aJCas, PrintWriter aOut) { } } - private Map<Sentence, Collection<NamedEntity>> getNamedEntityIndex(JCas aJCas) { - Map<Sentence, Collection<NamedEntity>> idx = indexCovered(aJCas, Sentence.class, NamedEntity.class); + private Map<Sentence, List<NamedEntity>> getNamedEntityIndex(JCas aJCas) + { + Map<Sentence, List<NamedEntity>> idx = indexCovered(aJCas, Sentence.class, + NamedEntity.class); if (acceptedTagsRegex != null) { Pattern pattern = Pattern.compile(acceptedTagsRegex); - Map<Sentence, Collection<NamedEntity>> filteredIdx = new HashMap<>(); + Map<Sentence, List<NamedEntity>> filteredIdx = new HashMap<>(); for (Sentence sentence : select(aJCas, Sentence.class)) { - Collection<NamedEntity> nes = new ArrayList<>(); + List<NamedEntity> nes = new ArrayList<>(); for (NamedEntity ne : idx.get(sentence)) { if (pattern.matcher(ne.getValue()).matches()) { @@ -208,14 +228,20 @@ private Map<Sentence, Collection<NamedEntity>> getNamedEntityIndex(JCas aJCas) { return idx; } - private static final class Row { + private static final class Row + { Token token; String ne; } @Override - public void collectionProcessComplete() - throws AnalysisEngineProcessException { + public void collectionProcessComplete() throws AnalysisEngineProcessException + { + if (tempData == null) { + throw new AnalysisEngineProcessException( + new IllegalStateException("Trainer did not receive any training data.")); + } + IOUtils.closeQuietly(out); // Load user-provided configuration @@ -258,7 +284,8 @@ public void collectionProcessComplete() } @Override - public void destroy() { + public void destroy() + { super.destroy(); // Clean up temporary data file diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordParser.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordParser.java similarity index 88% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordParser.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordParser.java index b60e048cef..a8239331d1 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordParser.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordParser.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,14 +14,16 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.util.Level.FINE; import static org.apache.uima.util.Level.INFO; import static org.apache.uima.util.Level.WARNING; +import static org.dkpro.core.api.resources.MappingProviderFactory.createConstituentMappingProvider; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; import java.io.BufferedInputStream; import java.io.IOException; @@ -49,23 +51,22 @@ import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.stanfordnlp.util.CoreNlpUtils; +import org.dkpro.core.stanfordnlp.util.StanfordAnnotator; +import org.dkpro.core.stanfordnlp.util.TreeWithTokens; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.StanfordAnnotator; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.TreeWithTokens; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.parser.common.ParserGrammar; @@ -88,11 +89,16 @@ import edu.stanford.nlp.trees.UniversalEnglishGrammaticalStructureFactory; import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalRelations; import edu.stanford.nlp.util.Filters; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Stanford Parser component. */ -@ResourceMetaData(name="CoreNLP Parser (old API)") +@Component(OperationType.CONSTITUENCY_PARSER) +@ResourceMetaData(name = "CoreNLP Parser (old API)") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -152,7 +158,7 @@ public static enum DependenciesMode { * </ol> * Corresponding parser option: {@code tree} */ - TREE, // tree - typedDependencies(false) + collapseDependenciesTree(tdl) + TREE, // tree - typedDependencies(false) + collapseDependenciesTree(tdl) ENHANCED, // ENHANCED_PLUS_PLUS // } @@ -179,6 +185,20 @@ public static enum DependenciesMode { @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Location from which the model is read. */ @@ -186,24 +206,32 @@ public static enum DependenciesMode { @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Location of the mapping file for part-of-speech tags to UIMA types. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** * Location of the mapping file for constituent tags to UIMA types. */ - public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; + public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = + ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) protected String constituentMappingLocation; /** * Sets whether to create or not to create dependency annotations. - * - * <p>Default: {@code true} */ public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") @@ -212,7 +240,6 @@ public static enum DependenciesMode { /** * Sets the kind of dependencies being created. * - * <p>Default: {@link DependenciesMode#TREE TREE} * @see DependenciesMode */ public static final String PARAM_MODE = "mode"; @@ -222,18 +249,15 @@ public static enum DependenciesMode { /** * Sets whether to create or not to create constituent tags. This is required for POS-tagging * and lemmatization. - * <p> - * Default: {@code true} */ - public static final String PARAM_WRITE_CONSTITUENT = ComponentParameters.PARAM_WRITE_CONSTITUENT; + public static final String PARAM_WRITE_CONSTITUENT = + ComponentParameters.PARAM_WRITE_CONSTITUENT; @ConfigurationParameter(name = PARAM_WRITE_CONSTITUENT, mandatory = true, defaultValue = "true") private boolean writeConstituent; /** * If this parameter is set to true, each sentence is annotated with a PennTree-Annotation, * containing the whole parse tree in Penn Treebank style format. - * <p> - * Default: {@code false} */ public static final String PARAM_WRITE_PENN_TREE = ComponentParameters.PARAM_WRITE_PENN_TREE; @ConfigurationParameter(name = PARAM_WRITE_PENN_TREE, mandatory = true, defaultValue = "false") @@ -244,7 +268,6 @@ public static enum DependenciesMode { * annotation as the basic unit for parsing. * <p>If the parameter is set with the name of an annotation type <i>x</i>, the parser will no * longer parse <i>Sentence</i>-annotations, but <i>x</i>-Annotations.</p> - * <p>Default: {@code null} */ public static final String PARAM_ANNOTATIONTYPE_TO_PARSE = "annotationTypeToParse"; @ConfigurationParameter(name = PARAM_ANNOTATIONTYPE_TO_PARSE, mandatory = false) @@ -253,8 +276,6 @@ public static enum DependenciesMode { /** * Sets whether to create or not to create POS tags. The creation of constituent tags must be * turned on for this to work. - * <p> - * Default: {@code false} */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "false") @@ -263,8 +284,6 @@ public static enum DependenciesMode { /** * Sets whether to use or not to use already existing POS tags from another annotator for the * parsing process. - * <p> - * Default: {@code true} */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") @@ -273,20 +292,17 @@ public static enum DependenciesMode { /** * Maximum number of tokens in a sentence. Longer sentences are not parsed. This is to avoid out * of memory exceptions. - * <p> - * Default: {@code 130} * * @see TestOptions#maxLength */ - public static final String PARAM_MAX_SENTENCE_LENGTH = ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; + public static final String PARAM_MAX_SENTENCE_LENGTH = + ComponentParameters.PARAM_MAX_SENTENCE_LENGTH; @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = true, defaultValue = "130") private int maxTokens; /** * Controls when the factored parser considers a sentence to be too complex and falls back to * the PCFG parser. - * <p> - * Default: {@code 200000} * * @see TestOptions#MAX_ITEMS */ @@ -319,6 +335,9 @@ public static enum DependenciesMode { @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false) private List<String> quoteEnd; + /** + * Whether to keep the punctuation as part of the parse tree. + */ public static final String PARAM_KEEP_PUNCTUATION = "keepPunctuation"; @ConfigurationParameter(name = PARAM_KEEP_PUNCTUATION, mandatory = true, defaultValue = "false") private boolean keepPunctuation; @@ -348,20 +367,13 @@ public void initialize(UimaContext context) modelProvider = new StanfordParserModelProvider(); - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - - constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider( + posMappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); + + constituentMappingProvider = createConstituentMappingProvider(this, constituentMappingLocation, language, modelProvider); } - /** - * Processes the given text using the StanfordParser. - * - * @param aJCas - * the {@link JCas} to process - * @see org.apache.uima.analysis_component.JCasAnnotator_ImplBase#process(org.apache.uima.jcas.JCas) - */ @Override public void process(JCas aJCas) throws AnalysisEngineProcessException @@ -453,8 +465,8 @@ public void process(JCas aJCas) } } - protected void doCreateDependencyTags(ParserGrammar aParser, StanfordAnnotator sfAnnotator, Tree parseTree, - List<Token> tokens) + protected void doCreateDependencyTags(ParserGrammar aParser, StanfordAnnotator sfAnnotator, + Tree parseTree, List<Token> tokens) { GrammaticalStructure gs; try { @@ -512,7 +524,8 @@ protected void doCreateDependencyTags(ParserGrammar aParser, StanfordAnnotator s Token govToken = tokens.get(govIndex - 1); Token depToken = tokens.get(depIndex - 1); - dep = sfAnnotator.createDependencyAnnotation(currTypedDep.reln(), govToken, depToken); + dep = sfAnnotator.createDependencyAnnotation(currTypedDep.reln(), govToken, + depToken); } else { Token depToken = tokens.get(depIndex - 1); @@ -526,7 +539,8 @@ protected void doCreateDependencyTags(ParserGrammar aParser, StanfordAnnotator s dep.addToIndexes(); } - dep.setFlavor(currTypedDep.extra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); + dep.setFlavor( + currTypedDep.extra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); } } @@ -546,8 +560,10 @@ private class StanfordParserModelProvider { setContextObject(StanfordParser.this); + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "${groupId}.stanfordnlp-model-parser-${language}-${variant}"); - setDefault(LOCATION, "classpath:/${package}/lib/parser-${language}-${variant}.properties"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/parser-${language}-${variant}.properties"); setDefaultVariantsLocation("${package}/lib/parser-default-variants.map"); setOverride(LOCATION, modelLocation); @@ -671,7 +687,8 @@ else if (pd instanceof ShiftReduceParser) { "Current model does not seem to support " + "dependencies."); } - if (gsf != null && EnglishGrammaticalStructureFactory.class.equals(gsf.getClass())) { + if (gsf != null + && EnglishGrammaticalStructureFactory.class.equals(gsf.getClass())) { SingletonTagset depTags = new SingletonTagset(Dependency.class, "stanford341"); for (GrammaticalRelation r : EnglishGrammaticalRelations.values()) { depTags.add(r.getShortName()); @@ -680,7 +697,8 @@ else if (pd instanceof ShiftReduceParser) { addTagset(depTags); } } - else if (gsf != null && UniversalEnglishGrammaticalStructureFactory.class.equals(gsf.getClass())) { + else if (gsf != null && UniversalEnglishGrammaticalStructureFactory.class + .equals(gsf.getClass())) { SingletonTagset depTags = new SingletonTagset(Dependency.class, "universal"); for (GrammaticalRelation r : UniversalEnglishGrammaticalRelations.values()) { depTags.add(r.getShortName()); diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPosTagger.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPosTagger.java new file mode 100644 index 0000000000..0f4a48cd11 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPosTagger.java @@ -0,0 +1,258 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; +import org.dkpro.core.api.parameter.ResourceParameter; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.stanfordnlp.util.CoreNlpUtils; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.process.PTBEscapingProcessor; +import edu.stanford.nlp.tagger.maxent.MaxentTagger; +import edu.stanford.nlp.util.StringUtils; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Stanford Part-of-Speech tagger component. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "CoreNLP POS-Tagger (old API)") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, + outputs = {"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) +public class StanfordPosTagger + extends JCasAnnotator_ImplBase +{ + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * Use this language instead of the document language to resolve the model and tag set mapping. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Variant of a model the model. Used to address a specific model if here are multiple models + * for one language. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Location from which the model is read. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + @ResourceParameter(MimeTypes.APPLICATION_X_STANFORDNLP_TAGGER) + protected String modelLocation; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Location of the mapping file for part-of-speech tags to UIMA types. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-). + * + * @see PTBEscapingProcessor + */ + public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping"; + @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true") + private boolean ptb3Escaping; + + /** + * List of extra token texts (usually single character strings) that should be treated like + * opening quotes and escaped accordingly before being sent to the parser. + */ + public static final String PARAM_QUOTE_BEGIN = "quoteBegin"; + @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false) + private List<String> quoteBegin; + + /** + * List of extra token texts (usually single character strings) that should be treated like + * closing quotes and escaped accordingly before being sent to the parser. + */ + public static final String PARAM_QUOTE_END = "quoteEnd"; + @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false) + private List<String> quoteEnd; + + /** + * Sentences with more tokens than the specified max amount will be ignored if this parameter + * is set to a value larger than zero. The default value zero will allow all sentences to be + * POS tagged. + */ + public static final String PARAM_MAX_SENTENCE_LENGTH = + ComponentParameters.PARAM_MAX_SENTENCE_LENGTH;; + @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = false) + private int maxSentenceTokens = 0; + + private CasConfigurableProviderBase<MaxentTagger> modelProvider; + private MappingProvider posMappingProvider; + + private final PTBEscapingProcessor<HasWord, String, Word> escaper = + new PTBEscapingProcessor<HasWord, String, Word>(); + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<MaxentTagger>(this, "stanfordnlp", "tagger") { + { + setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/tagger-${language}-${variant}.properties"); + } + + @Override + protected MaxentTagger produceResource(URL aUrl) throws IOException + { + String modelFile = aUrl.toString(); + + MaxentTagger tagger = new MaxentTagger(modelFile, + StringUtils.argsToProperties(new String[] { "-model", modelFile }), + false); + + SingletonTagset tags = new SingletonTagset(POS.class, getResourceMetaData() + .getProperty(("pos.tagset"))); + tags.addAll(tagger.tagSet()); + addTagset(tags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return tagger; + } + }; + + posMappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); + posMappingProvider.setDefaultVariantsLocation("${package}/lib/tagger-default-variants.map"); + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + modelProvider.configure(cas); + posMappingProvider.configure(cas); + + for (Sentence sentence : select(aJCas, Sentence.class)) { + List<Token> tokens = selectCovered(aJCas, Token.class, sentence); + + if (maxSentenceTokens > 0 && tokens.size() > maxSentenceTokens) { + continue; + } + + List<HasWord> words = new ArrayList<HasWord>(tokens.size()); + for (Token t : tokens) { + words.add(new TaggedWord(t.getText())); + } + + if (ptb3Escaping) { + words = CoreNlpUtils.applyPtbEscaping(words, quoteBegin, quoteEnd); + } + + List<TaggedWord> taggedWords = modelProvider.getResource().tagSentence(words); + + int i = 0; + for (Token t : tokens) { + TaggedWord tt = taggedWords.get(i); + Type posTag = posMappingProvider.getTagType(tt.tag()); + POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); + posAnno.setStringValue(posTag.getFeatureByBaseName("PosValue"), + tt.tag() != null ? tt.tag().intern() : null); + posAnno.addToIndexes(); + t.setPos(posAnno); + i++; + } + } + } +} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTrainer.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTrainer.java similarity index 78% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTrainer.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTrainer.java index 7f61bc9d6d..e82bd14459 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTrainer.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTrainer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.uima.fit.util.JCasUtil.indexCovered; import static org.apache.uima.fit.util.JCasUtil.select; @@ -31,6 +31,7 @@ import java.io.PrintWriter; import java.nio.charset.StandardCharsets; import java.util.Collection; +import java.util.List; import java.util.Map; import java.util.Properties; import java.util.regex.Matcher; @@ -43,32 +44,50 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.parameter.MimeTypes; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import edu.stanford.nlp.tagger.maxent.MaxentTagger; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.Parameters; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Train a POS tagging model for the Stanford POS tagger. */ +@Component(OperationType.TRAINER_OF_MACHINE_LEARNING_MODELS) @MimeTypeCapability(MimeTypes.APPLICATION_X_STANFORDNLP_TAGGER) -@ResourceMetaData(name="CoreNLP POS-Tagger Trainer") +@Parameters( + exclude = { + StanfordPosTaggerTrainer.PARAM_TARGET_LOCATION }) +@ResourceMetaData(name = "CoreNLP POS-Tagger Trainer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) public class StanfordPosTaggerTrainer extends JCasConsumer_ImplBase { + /** + * Location to which the output is written. + */ public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION; @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true) private File targetLocation; /** * Training file containing the parameters. The <code>trainFile</code>, <code>model</code> and - * <code>encoding</code> parameters in this file are ignored/overwritten. In the <code>arch</code> - * parameter, the string <code>${distsimCluster}</code> is replaced with the path to the cluster - * files if {@link #PARAM_CLUSTER_FILE} is specified. + * <code>encoding</code> parameters in this file are ignored/overwritten. In the + * <code>arch</code> parameter, the string <code>${distsimCluster}</code> is replaced with the + * path to the cluster files if {@link #PARAM_CLUSTER_FILE} is specified. */ public static final String PARAM_PARAMETER_FILE = "trainFile"; @ConfigurationParameter(name = PARAM_PARAMETER_FILE, mandatory = false) @@ -97,9 +116,10 @@ public void initialize(UimaContext aContext) if (clusterFile != null) { String p = clusterFile.getAbsolutePath(); if (p.contains("(") || p.contains(")") || p.contains(",")) { - // The Stanford POS tagger trainer does not support these characters in the cluster - // files path. If we have those, try to copy the clusters somewhere save before - // training. See: https://github.com/stanfordnlp/CoreNLP/issues/255 + // The Stanford POS tagger trainer does not support these characters in the + // cluster files path. If we have those, try to copy the clusters somewhere + // save before training. + // See: https://github.com/stanfordnlp/CoreNLP/issues/255 File tempClusterFile = File.createTempFile("dkpro-stanford-pos-trainer", ".cluster"); FileUtils.copyFile(clusterFile, tempClusterFile); @@ -120,15 +140,15 @@ public void process(JCas aJCas) if (tempData == null) { try { tempData = File.createTempFile("dkpro-stanford-pos-trainer", ".tsv"); - out = new PrintWriter( - new OutputStreamWriter(new FileOutputStream(tempData), StandardCharsets.UTF_8)); + out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(tempData), + StandardCharsets.UTF_8)); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } } - Map<Sentence, Collection<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); + Map<Sentence, List<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); for (Sentence sentence : select(aJCas, Sentence.class)) { Collection<Token> tokens = index.get(sentence); for (Token token : tokens) { diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPtbTransformer.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPtbTransformer.java similarity index 75% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPtbTransformer.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPtbTransformer.java index 3467a7379d..52eeea15dd 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPtbTransformer.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordPtbTransformer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,27 +14,32 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import java.io.StringReader; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.process.PTBTokenizer; import edu.stanford.nlp.process.Tokenizer; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Uses the normalizing tokenizer of the Stanford CoreNLP tools to escape the text PTB-style. This * component operates directly on the text and does not require prior segmentation. */ -@ResourceMetaData(name="Stanford Penn Treebank Normalizer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Stanford Penn Treebank Normalizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class StanfordPtbTransformer extends JCasTransformerChangeBased_ImplBase { diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordSegmenter.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordSegmenter.java new file mode 100644 index 0000000000..16d2a6c94a --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordSegmenter.java @@ -0,0 +1,363 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static org.apache.uima.fit.util.JCasUtil.selectCovered; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.parameter.Messages; +import org.dkpro.core.api.segmentation.SegmenterBase; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.stanford.nlp.international.arabic.process.ArabicTokenizer; +import edu.stanford.nlp.international.french.process.FrenchTokenizer; +import edu.stanford.nlp.international.spanish.process.SpanishTokenizer; +import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.process.CoreLabelTokenFactory; +import edu.stanford.nlp.process.PTBEscapingProcessor; +import edu.stanford.nlp.process.PTBTokenizer; +import edu.stanford.nlp.process.Tokenizer; +import edu.stanford.nlp.process.WordToSentenceProcessor; +import edu.stanford.nlp.process.WordToSentenceProcessor.NewlineIsSentenceBreak; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Stanford sentence splitter and tokenizer. + */ +@ResourceMetaData(name = "CoreNLP Segmenter (old API)") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability({"en", "es", "fr"}) +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) +public class StanfordSegmenter + extends SegmenterBase +{ + private static final Map<String, InternalTokenizerFactory> tokenizerFactories; +// private static final Map<String, TreebankLanguagePack> languagePacks; + + static { + tokenizerFactories = new HashMap<String, InternalTokenizerFactory>(); +// tokenizerFactories.put("ar", new InternalArabicTokenizerFactory()); + tokenizerFactories.put("en", new InternalPTBTokenizerFactory()); + tokenizerFactories.put("es", new InternalSpanishTokenizerFactory()); + tokenizerFactories.put("fr", new InternalFrenchTokenizerFactory()); + // The Negra tokenizer is not really a full tokenizer. +// tokenizerFactories.put("de", new InternalNegraPennTokenizerFactory()); + // Not sure if those really work - don't know how to test +// tokenizerFactories.put("zh", new InternalCHTBTokenizerFactory()); + +// languagePacks = new HashMap<String, TreebankLanguagePack>(); +// languagePacks.put("en", new PennTreebankLanguagePack()); +// languagePacks.put("zh", new ChineseTreebankLanguagePack()); +// languagePacks.put("en", new ArabicTreebankLanguagePack()); +// languagePacks.put("de", new NegraPennLanguagePack()); + } + + /** + * If this component is not configured for a specific language and if the language stored in + * the document metadata is not supported, use the given language as a fallback. + */ + public static final String PARAM_LANGUAGE_FALLBACK = "languageFallback"; + @ConfigurationParameter(name = PARAM_LANGUAGE_FALLBACK, mandatory = false) + private String languageFallback; + + /** + * The set of boundary tokens. If null, use default. + * + * @see WordToSentenceProcessor#WordToSentenceProcessor + */ + public static final String PARAM_BOUNDARY_TOKEN_REGEX = "boundaryTokenRegex"; + @ConfigurationParameter(name = PARAM_BOUNDARY_TOKEN_REGEX, mandatory = false, + defaultValue = WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX) + private String boundaryTokenRegex; + + /** + * This is a Set of String that are matched with .equals() which are allowed to be tacked onto + * the end of a sentence after a sentence boundary token, for example ")". + * + * @see WordToSentenceProcessor#DEFAULT_BOUNDARY_FOLLOWERS_REGEX + */ + public static final String PARAM_BOUNDARY_FOLLOWERS_REGEX = "boundaryFollowersRegex"; + @ConfigurationParameter(name = PARAM_BOUNDARY_FOLLOWERS_REGEX, mandatory = false, defaultValue = + WordToSentenceProcessor.DEFAULT_BOUNDARY_FOLLOWERS_REGEX) + private String boundaryFollowersRegex; + + /** + * These are elements like "p" or "sent", which will be wrapped into regex for approximate XML + * matching. They will be deleted in the output, and will always trigger a sentence boundary. + */ + public static final String PARAM_XML_BREAK_ELEMENTS_TO_DISCARD = "xmlBreakElementsToDiscard"; + @ConfigurationParameter(name = PARAM_XML_BREAK_ELEMENTS_TO_DISCARD, mandatory = false) + private Set<String> xmlBreakElementsToDiscard; + + /** + * The set of regex for sentence boundary tokens that should be discarded. + * + * @see WordToSentenceProcessor#DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD + */ + public static final String PARAM_BOUNDARIES_TO_DISCARD = "boundaryToDiscard"; + @ConfigurationParameter(name = PARAM_BOUNDARIES_TO_DISCARD, mandatory = false, defaultValue = { + "\n", "*NL*" }) + private Set<String> boundariesToDiscard; + + /** + * A regular expression for element names containing a sentence region. Only tokens in such + * elements will be included in sentences. The start and end tags themselves are not included in + * the sentence. + */ + public static final String PARAM_REGION_ELEMENT_REGEX = "regionElementRegex"; + @ConfigurationParameter(name = PARAM_REGION_ELEMENT_REGEX, mandatory = false) + private String regionElementRegex; + + /** + * Strategy for treating newlines as paragraph breaks. + */ + public static final String PARAM_NEWLINE_IS_SENTENCE_BREAK = "newlineIsSentenceBreak"; + @ConfigurationParameter(name = PARAM_NEWLINE_IS_SENTENCE_BREAK, mandatory = false, defaultValue = "TWO_CONSECUTIVE") + private NewlineIsSentenceBreak newlineIsSentenceBreak; + + /** + * The set of regex for sentence boundary tokens that should be discarded. + */ + public static final String PARAM_TOKEN_REGEXES_TO_DISCARD = "tokenRegexesToDiscard"; + @ConfigurationParameter(name = PARAM_TOKEN_REGEXES_TO_DISCARD, mandatory = false, + defaultValue = {}) + private Set<String> tokenRegexesToDiscard; + + /** + * Whether to treat all input as one sentence. + */ + public static final String PARAM_IS_ONE_SENTENCE = "isOneSentence"; + @ConfigurationParameter(name = PARAM_IS_ONE_SENTENCE, mandatory = true, defaultValue = "false") + private boolean isOneSentence; + + /** + * Whether to generate empty sentences. + */ + public static final String PARAM_ALLOW_EMPTY_SENTENCES = "allowEmptySentences"; + @ConfigurationParameter(name = PARAM_ALLOW_EMPTY_SENTENCES, mandatory = true, defaultValue = "false") + private boolean allowEmptySentences; + + /** + * Additional options that should be passed to the tokenizers. The available options depend on + * the language-specific tokenizer being used. + */ + private String[] additionalOptions; + + @Override + protected void process(JCas aJCas, String aText, int aZoneBegin) + throws AnalysisEngineProcessException + { + List<Token> casTokens = null; + + // Use value from language parameter, document language or fallback language - whatever + // is available + String language = getLanguage(aJCas); + + if (isWriteToken()) { + casTokens = new ArrayList<Token>(); + final Tokenizer<?> tokenizer = getTokenizer(language, aText); + + List<?> tokens = tokenizer.tokenize(); + for (int i = 0; i < tokens.size(); i++) { + final Object token = tokens.get(i); + // System.out.println("Token class: "+token.getClass()); + CoreLabel l = (CoreLabel) token; + String t = l.word(); + int begin = l.get(CharacterOffsetBeginAnnotation.class); + int end = l.get(CharacterOffsetEndAnnotation.class); + + casTokens.add(createToken(aJCas, t, aZoneBegin + begin, aZoneBegin + end)); + } + } + + if (isWriteSentence()) { + if (casTokens == null) { + casTokens = selectCovered(aJCas, Token.class, aZoneBegin, + aZoneBegin + aText.length()); + } + + // Prepare the tokens for processing by WordToSentenceProcessor + List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>(); + Pattern nlPattern = Pattern.compile(".*(\r\n|\n|\r).*"); + Matcher nlMatcher = nlPattern.matcher(""); + int lastTokenEnd = 0; + for (Token token : casTokens) { + if (!NewlineIsSentenceBreak.NEVER.equals(newlineIsSentenceBreak)) { + // add newline as token for newlineIsSentenceBreak parameter + nlMatcher.reset( + aJCas.getDocumentText().subSequence(lastTokenEnd, token.getBegin())); + if (nlMatcher.matches()) { + CoreLabel l = new CoreLabel(); + l.set(CharacterOffsetBeginAnnotation.class, + lastTokenEnd + nlMatcher.start(1)); + l.set(CharacterOffsetEndAnnotation.class, lastTokenEnd + nlMatcher.end(1)); + l.setWord("\n"); + tokensInDocument.add(l); + } + } + lastTokenEnd = token.getEnd(); + // add regular token + CoreLabel l = new CoreLabel(); + l.set(CharacterOffsetBeginAnnotation.class, token.getBegin()); + l.set(CharacterOffsetEndAnnotation.class, token.getEnd()); + l.setWord(token.getText()); + tokensInDocument.add(l); + } + + // The sentence splitter (probably) requires the escaped text, so we prepare it here + PTBEscapingProcessor escaper = new PTBEscapingProcessor(); + escaper.apply(tokensInDocument); + + // Apply the WordToSentenceProcessor to find the sentence boundaries + WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>( + boundaryTokenRegex, boundaryFollowersRegex, boundariesToDiscard, + xmlBreakElementsToDiscard, regionElementRegex, newlineIsSentenceBreak, null, + tokenRegexesToDiscard, isOneSentence, allowEmptySentences); + + List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument); + for (List<CoreLabel> sentence : sentencesInDocument) { + int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class); + int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class); + + createSentence(aJCas, begin, end); + } + } + } + + private Tokenizer getTokenizer(final String aLanguage, final String aText) + throws AnalysisEngineProcessException + { + InternalTokenizerFactory tk = tokenizerFactories.get(aLanguage); + if (tk == null) { + if (languageFallback == null) { + throw new AnalysisEngineProcessException(Messages.BUNDLE, + Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { aLanguage }); + } + else { + tk = tokenizerFactories.get(languageFallback); + if (tk == null) { + throw new AnalysisEngineProcessException(Messages.BUNDLE, + Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { languageFallback }); + } + } + } + + + return tk.create(aText); + } + + private static interface InternalTokenizerFactory + { + Tokenizer<?> create(String s); + } + + private static class InternalPTBTokenizerFactory + implements InternalTokenizerFactory + { + @Override + public Tokenizer<?> create(final String s) + { +// TokenizerFactory<CoreLabel> f = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible,ptb3Escaping=false"); + return new PTBTokenizer<CoreLabel>(new StringReader(s),new CoreLabelTokenFactory(),"invertible"); + } + } + + // The InternalNegraPennTokenizer is not meant for German text. It + // is for parsing a particular corpus format. +// private static +// class InternalNegraPennTokenizerFactory +// implements InternalTokenizerFactory +// { +// @Override +// public +// Tokenizer<?> create( +// final String s) +// { +// return new NegraPennTokenizer(new StringReader(s)); +// } +// } + + private static class InternalArabicTokenizerFactory + implements InternalTokenizerFactory + { + @Override + public Tokenizer<?> create(final String s) + { + return ArabicTokenizer.newArabicTokenizer(new StringReader(s), new Properties()); + } + } + + private static class InternalFrenchTokenizerFactory + implements InternalTokenizerFactory + { + @Override + public Tokenizer<?> create(final String s) + { + return FrenchTokenizer.factory().getTokenizer(new StringReader(s), "tokenizeNLs=false"); + } + } + + private static class InternalSpanishTokenizerFactory + implements InternalTokenizerFactory + { + @Override + public Tokenizer<?> create(final String s) + { + return SpanishTokenizer.factory(new CoreLabelTokenFactory(), null) + .getTokenizer(new StringReader(s)); + } + } + + // While the stanford parser should come with a proper tokenizer for + // Chinese (because it can parse chinese text), this does not seem to be + // the right one or I am using it wrong. The associated test cases do not + // work. +// private static +// class InternalCHTBTokenizerFactory +// implements InternalTokenizerFactory +// { +// @Override +// public +// Tokenizer<?> create( +// final String s) +// { +// return new CHTBTokenizer(new StringReader(s)); +// } +// } +} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordSentimentAnalyzer.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordSentimentAnalyzer.java new file mode 100644 index 0000000000..758ac95712 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/StanfordSentimentAnalyzer.java @@ -0,0 +1,105 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import java.util.Properties; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.ejml.simple.SimpleMatrix; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.sentiment.type.StanfordSentimentAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.CoreMap; + +/** + * Experimental wrapper for {@link edu.stanford.nlp.pipeline.SentimentAnnotator} which assigns + * 5 scores to each sentence. NOTE: Is very slow in the current state as it runs full Stanford + * pipeline and does not take into account any existing DKPro annotations. + */ +@ResourceMetaData(name = "CoreNLP Sentiment Analyzer") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" + }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.sentiment.type.StanfordSentimentAnnotation" + } +) +public class StanfordSentimentAnalyzer + extends JCasAnnotator_ImplBase +{ + + private StanfordCoreNLP pipeline; + + @Override + public void initialize(UimaContext context) throws ResourceInitializationException + { + super.initialize(context); + + Properties props = new Properties(); + props.setProperty("annotators", "tokenize, ssplit, pos, parse, sentiment"); + pipeline = new StanfordCoreNLP(props); + } + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException + { + for (Sentence sentenceDKPro : JCasUtil.select(jCas, Sentence.class)) { + String sentenceText = sentenceDKPro.getCoveredText(); + + Annotation annotation = pipeline.process(sentenceText); + + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class); + SimpleMatrix sentimentCoefficients = RNNCoreAnnotations.getPredictions(tree); + + double veryNegative = sentimentCoefficients.get(0); + double negative = sentimentCoefficients.get(1); + double neutral = sentimentCoefficients.get(2); + double positive = sentimentCoefficients.get(3); + double veryPositive = sentimentCoefficients.get(4); + + StanfordSentimentAnnotation sentimentAnnotation = new StanfordSentimentAnnotation( + jCas); + sentimentAnnotation.setBegin(sentenceDKPro.getBegin()); + sentimentAnnotation.setEnd(sentenceDKPro.getEnd()); + sentimentAnnotation.setVeryNegative(veryNegative); + sentimentAnnotation.setNegative(negative); + sentimentAnnotation.setNeutral(neutral); + sentimentAnnotation.setPositive(positive); + sentimentAnnotation.setVeryPositive(veryPositive); + sentimentAnnotation.addToIndexes(); + } + } + } +} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/internal/RootKey.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/internal/RootKey.java similarity index 84% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/internal/RootKey.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/internal/RootKey.java index 30ab08ead0..625ac908ce 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/internal/RootKey.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/internal/RootKey.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal; +package org.dkpro.core.stanfordnlp.internal; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; import edu.stanford.nlp.util.TypesafeMap.Key; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/internal/TokenKey.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/internal/TokenKey.java new file mode 100644 index 0000000000..f4449c181b --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/internal/TokenKey.java @@ -0,0 +1,27 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp.internal; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.stanford.nlp.util.TypesafeMap.Key; + +public class TokenKey + implements Key<Token> +{ +}; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/package-info.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/package-info.java new file mode 100644 index 0000000000..7f5aea7150 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +/** + * Integration of NLP components from the <a href="http://nlp.stanford.edu/software/corenlp.shtml"> + * Stanford CoreNLP suite</a>. + * + * @since 1.1.0 + */ +package org.dkpro.core.stanfordnlp; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/CasCopier.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/CasCopier.java similarity index 98% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/CasCopier.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/CasCopier.java index 83f1513a4d..b4c5fd5509 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/CasCopier.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/CasCopier.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -27,9 +27,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util; +package org.dkpro.core.stanfordnlp.util; import java.util.ArrayList; import java.util.HashMap; @@ -77,7 +77,8 @@ public class CasCopier private final LowLevelCAS mLowLevelDestCas; private final Feature mDestSofaFeature; - private final Map<FeatureStructure, FeatureStructure> mFsMap = new HashMap<FeatureStructure, FeatureStructure>(); + private final Map<FeatureStructure, FeatureStructure> mFsMap = + new HashMap<FeatureStructure, FeatureStructure>(); private List<Annotation> batchCopyAnnoList = null; @@ -319,8 +320,8 @@ private void copyFeatures(FeatureStructure aSrcFS, FeatureStructure aDestFS) // enumerate all possible primitive types. Maybe LowLevel CAS API // could help? if (srcFeat.getRange().isPrimitive()) { - - aDestFS.setFeatureValueFromString(destFeat, aSrcFS.getFeatureValueAsString(srcFeat)); + aDestFS.setFeatureValueFromString(destFeat, + aSrcFS.getFeatureValueAsString(srcFeat)); } else { // recursive copy diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/CoreNlpUtils.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/CoreNlpUtils.java similarity index 94% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/CoreNlpUtils.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/CoreNlpUtils.java index cee4b7ae6b..65ade5db54 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/CoreNlpUtils.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/CoreNlpUtils.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util; +package org.dkpro.core.stanfordnlp.util; import java.util.Collection; import java.util.List; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/StanfordAnnotator.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/StanfordAnnotator.java similarity index 90% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/StanfordAnnotator.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/StanfordAnnotator.java index 2263f2af73..0301c40879 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/StanfordAnnotator.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/StanfordAnnotator.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util; +package org.dkpro.core.stanfordnlp.util; import java.util.ArrayList; import java.util.Collection; @@ -31,13 +31,12 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.Tag; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import edu.stanford.nlp.ling.CoreLabel; @@ -153,15 +152,6 @@ private Annotation createConstituentAnnotationFromTree( // calculate span for the current subtree IntPair span = tokenTree.getSpan(aNode); - // Check if the node has been marked by a TSurgeon operation. - // If so, add a tag-annotation on the constituent - if (nodeLabelValue.contains(TAG_SEPARATOR) && !nodeLabelValue.equals(TAG_SEPARATOR)) { - int separatorIndex = nodeLabelValue.indexOf(TAG_SEPARATOR); - String tag = nodeLabelValue.substring(0, separatorIndex); - nodeLabelValue = nodeLabelValue.substring(separatorIndex + 1, nodeLabelValue.length()); - createTagAnnotation(span.getSource(), span.getTarget(), tag); - } - // Check if node is a constituent node on sentence or phrase-level if (aNode.isPhrasal()) { @@ -229,23 +219,6 @@ else if (aNode.isPreTerminal()) { } } - /** - * Creates a tag-annotation over a constituent - * - * @param aBegin - * start-index of the constituent span - * @param aEnd - * end-index of the constituent span - * @param aTag - * the tag value - */ - public void createTagAnnotation(int aBegin, int aEnd, String aTag) - { - Tag newTag = new Tag(jCas, aBegin, aEnd); - newTag.setValue(aTag); - jCas.addFsToIndexes(newTag); - } - /** * Creates a new Constituent annotation. Links to parent- and child-annotations are not yet * created here. @@ -266,11 +239,12 @@ public Constituent createConstituentAnnotation(int aBegin, int aEnd, String aCon // create the necessary objects and methods Type constType = constituentMappingProvider.getTagType(aConstituentType); - Constituent constAnno = (Constituent) jCas.getCas().createAnnotation(constType, aBegin, aEnd); + Constituent constAnno = (Constituent) jCas.getCas().createAnnotation(constType, aBegin, + aEnd); constAnno.setConstituentType(aConstituentType); - constAnno.setSyntacticFunction(aSyntacticFunction); - return constAnno; - } + constAnno.setSyntacticFunction(aSyntacticFunction); + return constAnno; + } /** * Creates a new Constituent annotation. Links to parent- and child-annotations are not yet @@ -318,8 +292,8 @@ public Dependency createDependencyAnnotation(GrammaticalRelation aDependencyType * the dependent-word * @return the newly created dependency annotation. */ - public static Dependency createDependencyAnnotation(JCas jCas, GrammaticalRelation aDependencyType, - Token aGovernor, Token aDependent) + public static Dependency createDependencyAnnotation(JCas jCas, + GrammaticalRelation aDependencyType, Token aGovernor, Token aDependent) { // create the necessary objects and methods String dependencyTypeName = DEPPACKAGE + aDependencyType.getShortName().toUpperCase(); diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/TreeUtils.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/TreeUtils.java similarity index 98% rename from dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/TreeUtils.java rename to dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/TreeUtils.java index 3a4fb6d4ef..b8751c07ae 100644 --- a/dkpro-core-stanfordnlp-gpl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/util/TreeUtils.java +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/TreeUtils.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util; +package org.dkpro.core.stanfordnlp.util; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.selectCovered; @@ -35,6 +35,7 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.stanfordnlp.internal.TokenKey; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; @@ -44,7 +45,6 @@ import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.TokenKey; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreLabel; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/TreeWithTokens.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/TreeWithTokens.java new file mode 100644 index 0000000000..afb0618252 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/TreeWithTokens.java @@ -0,0 +1,188 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp.util; + +import java.util.Iterator; +import java.util.List; + +import org.apache.uima.jcas.tcas.Annotation; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.IntPair; + +/** + * A wrapper object that manages a tree object together with the respective + * Token annotations for the leafs of the tree. This is needed for being able to + * map the leaves of the tree to words in a CAS. + * + * Trees in TreeWithToken-object are always converted to trees with + * CoreLabel-type labels. + * + * + */ +public class TreeWithTokens +{ + private Tree tree; + private List<Token> tokens; + + public TreeWithTokens(Tree tree, List<Token> tokens) + { + setTree(tree); + setTokens(tokens); + } + + public void setTree(Tree tree) + { + if (!(tree.label() instanceof CoreLabel)) { + tree = tree.deepCopy(tree.treeFactory(), CoreLabel.factory()); + } + + tree.indexLeaves(); + + this.tree = tree; + } + + public Tree getTree() + { + return tree; + } + + public void setTokens(List<Token> tokens) + { + this.tokens = tokens; + } + + public List<Token> getTokens() + { + return tokens; + } + + /** + * Returns the span of the documentText that is covered by this + * TreeWithTokens. + * + * @return an IntPair describing the span of the documentText that is + * covered by this tree + */ + public IntPair getSpan() + { + return getSpan(getTree()); + } + + /** + * Returns the span of the documentText that is covered by a given subtree, + * that has to be taken directly from the original tree. + * <p> + * NOTE: Possibly we could make this more general to also support general + * trees that are contained in the original tree, but are not directly taken + * from it (i.e. with different leaf-numbering). In order to do so, we would + * have to make a Tregex-Matching of the given subtree in the original tree + * to identify the positition of the given subtree. + * <p> + * This could be achieved by translating the subtree into a Tregex pattern + * and then matching this pattern against the original tree. + * + * @param subtree + * a subtree of this TreeWithTokens (it has to be a real + * subtree(!), because index numbering of subtree has to fit to + * the numbering of the original tree) + * @return an IntPair describing the span of the documentText that is + * covered by this tree + */ + public IntPair getSpan(Tree subtree) + { + // TODO check if subtree is a real subtree of tokenTree.getTree() + + int nodeIndexLeft = ((CoreLabel) getLeftmostLeaf(subtree).label()) + .index(); + int nodeIndexRight = ((CoreLabel) getRightmostLeaf(subtree).label()) + .index(); + int a = tokens.get(nodeIndexLeft - 1).getBegin(); + int b = tokens.get(nodeIndexRight - 1).getEnd(); + + return new IntPair(a, b); + } + + private Tree getLeftmostLeaf(Tree t) + { + if (t.isLeaf()) { + return t; + } + else { + return getLeftmostLeaf(t.firstChild()); + } + } + + private Tree getRightmostLeaf(Tree t) + { + if (t.isLeaf()) { + return t; + } + else { + return getRightmostLeaf(t.lastChild()); + } + } + + /** + * Finds the best-fitting node in the tree for a given annotation. + * + * The best-fitting node for an annotation is the deepest node in the tree + * that still completely contains the span of the given annotation. + * + * TODO Could be done more efficiently, I think. In a recursive method, for + * example, recursion could be stopped as soon as overlap becomes -1 + * + * @param anno + * the annotation to find a best fit for + * + * @return the node of the tree that is the best fit for <code>anno</code> + */ + public Tree getBestFit(Annotation anno) + { + Tree curBestFit = null; + int curBestOverlap = Integer.MAX_VALUE; + + Iterator<Tree> treeIterator = getTree().iterator(); + while (treeIterator.hasNext()) { + Tree curTree = treeIterator.next(); + IntPair span = getSpan(curTree); + + // calc overlap: if annotation not completely contained in span of + // subtree, overlap will be -1, otherwise it will be >0 + // Our goal is to find the node with minimal positive overlap + int overlap = -1; + int leftBorder = anno.getBegin() - span.getSource(); + int rightBorder = span.getTarget() - anno.getEnd(); + if (!(leftBorder < 0) && !(rightBorder < 0)) { + overlap = leftBorder + rightBorder; + } + + // determine whether node is better than the temporary best fit + if ((overlap > -1) && overlap < curBestOverlap) { + curBestFit = curTree; + curBestOverlap = overlap; + } + } + + return curBestFit; + } + +} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/UIMAAnnotations.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/UIMAAnnotations.java new file mode 100644 index 0000000000..1a5146cfbe --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/UIMAAnnotations.java @@ -0,0 +1,42 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp.util; + +import java.util.Collection; + +import org.apache.uima.jcas.tcas.Annotation; + +import edu.stanford.nlp.ling.CoreAnnotation; + +/** + * Annotations of the type "UIMAAnnotations" should contain a Collection of + * org.apache.uima.jcas.tcas.Annotation objects. + * + */ +@SuppressWarnings("rawtypes") +public class UIMAAnnotations + implements CoreAnnotation<Collection<Annotation>> +{ + @SuppressWarnings("unchecked") + @Override + public Class<Collection<Annotation>> getType() + { + return (Class<Collection<Annotation>>) (Class) Collection.class; + } +} diff --git a/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/package-info.java b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/package-info.java new file mode 100644 index 0000000000..44905faa34 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/java/org/dkpro/core/stanfordnlp/util/package-info.java @@ -0,0 +1,22 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +/** + * Utility classes shared between different components. + */ +package org.dkpro.core.stanfordnlp.util; diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-default-variants.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-default-variants.map deleted file mode 100644 index 22fa925b02..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-default-variants.map +++ /dev/null @@ -1,3 +0,0 @@ -de=dewac_175m_600.crf -en=all.3class.distsim.crf -es=ancora.distsim.s512.crf diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/tagger-default-variants.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/tagger-default-variants.map deleted file mode 100644 index 3942bc99a7..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/tagger-default-variants.map +++ /dev/null @@ -1,6 +0,0 @@ -ar=accurate -de=fast -fr=default -en=bidirectional-distsim -es=distsim -zh=distsim diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-de-dewac_175m_600.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-dewac_175m_600.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-de-dewac_175m_600.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-dewac_175m_600.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-germeval2014.hgc_175m_600.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-germeval2014.hgc_175m_600.crf.map new file mode 100644 index 0000000000..e7c53d13de --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-germeval2014.hgc_175m_600.crf.map @@ -0,0 +1,5 @@ +PERSON=de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person +LOCATION=de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location +ORGANIZATION=de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization +#MISC +*=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-de-hgc_175m_600.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-hgc_175m_600.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-de-hgc_175m_600.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-hgc_175m_600.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-de-nemgp.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-nemgp.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-de-nemgp.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-de-nemgp.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-default-variants.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-default-variants.map new file mode 100644 index 0000000000..1372b1e156 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-default-variants.map @@ -0,0 +1,3 @@ +de=germeval2014.hgc_175m_600.crf +en=all.3class.distsim.crf +es=ancora.distsim.s512.crf diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-all.3class.caseless.distsim.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-all.3class.caseless.distsim.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-all.3class.caseless.distsim.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-all.3class.caseless.distsim.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-all.3class.distsim.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-all.3class.distsim.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-all.3class.distsim.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-all.3class.distsim.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-conll.4class.caseless.distsim.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-conll.4class.caseless.distsim.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-conll.4class.caseless.distsim.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-conll.4class.caseless.distsim.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-conll.4class.distsim.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-conll.4class.distsim.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-conll.4class.distsim.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-conll.4class.distsim.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-freme-wikiner.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-freme-wikiner.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-freme-wikiner.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-freme-wikiner.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-muc.7class.caseless.distsim.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-muc.7class.caseless.distsim.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-muc.7class.caseless.distsim.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-muc.7class.caseless.distsim.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-muc.7class.distsim.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-muc.7class.distsim.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-muc.7class.distsim.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-muc.7class.distsim.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-nowiki.3class.caseless.distsim.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-nowiki.3class.caseless.distsim.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-en-nowiki.3class.caseless.distsim.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-en-nowiki.3class.caseless.distsim.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-es-ancora.distsim.s512.crf.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-es-ancora.distsim.s512.crf.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-es-ancora.distsim.s512.crf.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-es-ancora.distsim.s512.crf.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-es-freme-wikiner.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-es-freme-wikiner.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-es-freme-wikiner.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-es-freme-wikiner.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-fr-freme-wikiner.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-fr-freme-wikiner.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-fr-freme-wikiner.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-fr-freme-wikiner.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-it-freme-wikiner.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-it-freme-wikiner.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-it-freme-wikiner.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-it-freme-wikiner.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-nl-freme-wikiner.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-nl-freme-wikiner.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-nl-freme-wikiner.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-nl-freme-wikiner.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-ru-freme-wikiner.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-ru-freme-wikiner.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-ru-freme-wikiner.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/ner-ru-freme-wikiner.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/parser-default-variants.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/parser-default-variants.map similarity index 100% rename from dkpro-core-stanfordnlp-gpl/src/main/resources/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/parser-default-variants.map rename to dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/parser-default-variants.map diff --git a/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/tagger-default-variants.map b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/tagger-default-variants.map new file mode 100644 index 0000000000..aa4c942157 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/main/resources/org/dkpro/core/stanfordnlp/lib/tagger-default-variants.map @@ -0,0 +1,6 @@ +ar=default +de=fast +fr=default +en=bidirectional-distsim +es=distsim +zh=distsim diff --git a/dkpro-core-stanfordnlp-gpl/src/scripts/build.xml b/dkpro-core-stanfordnlp-gpl/src/scripts/build.xml index a8adfd80ff..631378d6e6 100644 --- a/dkpro-core-stanfordnlp-gpl/src/scripts/build.xml +++ b/dkpro-core-stanfordnlp-gpl/src/scripts/build.xml @@ -1,6 +1,6 @@ <!-- - Copyright 2007-2017 + Copyright 2007-2019 Ubiquitous Knowledge Processing (UKP) Lab Technische Universität Darmstadt @@ -15,7 +15,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. + along with this program. If not, see http://www.gnu.org/licenses/. --> <project basedir="../.." default="separate-jars"> @@ -25,27 +25,22 @@ <!-- - Upstream versions - meta data versions are maintained per model below - --> - <!-- http://nlp.stanford.edu/software/tagger.shtml --> - <property name="tagger.version.real" value="3.8.0"/> - <property name="tagger.date" value="2017-06-09"/> - + --> <!-- http://nlp.stanford.edu/software/lex-parser.shtml --> - <property name="parser.version.real" value="3.8.0"/> - <property name="parser.date" value="2017-06-09"/> - <property name="core.srparser.date" value="2014-10-23"/> + <property name="parser.version.real" value="3.9.2"/> + <property name="parser.date" value="20181017"/> <!-- http://nlp.stanford.edu/software/corenlp.shtml --> - <property name="core.version.real" value="3.8.0"/> - <property name="core.version" value="20170609"/> - <property name="core.date" value="2017-06-09"/> + <property name="core.version.real" value="3.9.2"/> + <property name="core.version" value="20181005"/> + <property name="core.date" value="2018-10-05"/> - <property name="core.arabic.date" value="2017-06-09"/> - <property name="core.chinese.date" value="2017-06-09"/> - <property name="core.english.date" value="2017-06-09"/> - <property name="core.french.date" value="2017-06-09"/> - <property name="core.german.date" value="2017-06-09"/> - <property name="core.spanish.date" value="2017-06-09"/> + <property name="core.arabic.date" value="2018-10-05"/> + <property name="core.chinese.date" value="2018-10-05"/> + <property name="core.english.date" value="2018-10-05"/> + <property name="core.french.date" value="2018-10-05"/> + <property name="core.german.date" value="2018-10-05"/> + <property name="core.spanish.date" value="2018-10-05"/> <!-- - Output package configuration @@ -69,22 +64,9 @@ <target name="newmodels"> <property name="install-artifact-mode" value="remote"/> - <!-- - <antcall target="tagger-de-ud"/> - - <antcall target="ner-de-hgc_175m_600.crf"/> - - <antcall target="ner-en-all.3class.distsim.crf"/> - <antcall target="ner-en-all.3class.caseless.distsim.crf"/> - <antcall target="ner-en-nowiki.3class.caseless.distsim.crf"/> - <antcall target="tagger-es-default"/> - <antcall target="tagger-es-distsim"/> - <antcall target="ner-es-ancora.distsim.s512.crf"/> - <antcall target="parser-es-pcfg"/> - <antcall target="parser-es-sr"/> - <antcall target="parser-es-sr-beam"/> - --> - + <antcall target="tagger-en-bidirectional-distsim"/> + <antcall target="tagger-en-left3words-distsim"/> + <antcall target="tagger-en-caseless-left3words-distsim"/> <antcall target="coref-en-default"/> </target> @@ -94,27 +76,34 @@ - We cannot use the one from the stanford-parser:models artifact, because the packages - collide with the corenlp:models. --> - <antcall target="tagger-ar-accurate"/> + <antcall target="tagger-ar-default"/> <antcall target="parser-ar-factored"/> <antcall target="parser-ar-sr"/> </target> - <target name="tagger-ar-accurate" depends="download-tagger"> + <target name="tagger-ar-default" depends="download-ar"> + <mkdir dir="target/download/tmp"/> + <unzip src="target/download/ar/ar-models.zip" dest="target/download/tmp"> + <patternset> + <include name="edu/stanford/nlp/models/pos-tagger/arabic/arabic.tagger*"/> + </patternset> + </unzip> <install-stub-and-upstream-file - file="target/download/tagger/arabic.tagger" - md5="f6c1a9a591e5a8210549feefb1cc8e1e" + file="target/download/tmp/edu/stanford/nlp/models/pos-tagger/arabic/arabic.tagger" + md5="bc8520298da1ca08ee307029ba15133e" groupId="de.tudarmstadt.ukp.dkpro.core" artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20131112" + upstreamVersion="20180103" metaDataVersion="1" tool="tagger" language="ar" - variant="accurate" + variant="default" extension="tagger"> <metadata> <entry key="pos.tagset" value="atb"/> </metadata> </install-stub-and-upstream-file> + <delete dir="target/download/tmp"/> </target> <target name="parser-ar-factored" depends="download-ar"> @@ -143,13 +132,19 @@ <delete dir="target/download/tmp"/> </target> - <target name="parser-ar-sr" depends="download-parser"> + <target name="parser-ar-sr" depends="download-ar"> + <mkdir dir="target/download/tmp"/> + <unzip src="target/download/ar/ar-models.zip" dest="target/download/tmp"> + <patternset> + <include name="edu/stanford/nlp/models/srparser/arabicSR.ser.gz"/> + </patternset> + </unzip> <install-stub-and-upstream-file - file="target/download/parser/arabicSR.ser.gz" - md5="7bf3eeb2ddd5dda934a88157f94b1c0a" + file="target/download/tmp/edu/stanford/nlp/models/srparser/arabicSR.ser.gz" + md5="9801cb03f75d80fb1c89f96db4170a56" groupId="de.tudarmstadt.ukp.dkpro.core" artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20141031" + upstreamVersion="20180227" metaDataVersion="1" tool="parser" language="ar" @@ -160,6 +155,7 @@ <entry key="constituent.tagset" value="atb"/> </metadata> </install-stub-and-upstream-file> + <delete dir="target/download/tmp"/> </target> <!-- @@ -199,44 +195,17 @@ - to repackage them. If upstream at some point provides smaller JARs (best per-model) - then we should reconsider referring to them directly instead of re-packaging. --> - <!-- NOT IN 3.7.0 antcall target="tagger-de-dewac"/ --> <antcall target="tagger-de-ud"/> <antcall target="tagger-de-hgc"/> <antcall target="tagger-de-fast"/> <antcall target="tagger-de-fast-caseless"/> <antcall target="ner-de-nemgp"/> - <antcall target="ner-de-hgc_175m_600.crf"/> - <!-- NOT IN 3.7.0 antcall target="ner-de-dewac_175m_600.crf"/ --> + <antcall target="ner-de-germeval2014.hgc_175m_600.crf"/> <antcall target="parser-de-pcfg"/> <antcall target="parser-de-factored"/> <antcall target="parser-de-sr"/> </target> - <target name="tagger-de-dewac" depends="download-de"> - <mkdir dir="target/download/tmp"/> - <unzip src="target/download/de/de-models.zip" dest="target/download/tmp"> - <patternset> - <include name="edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger"/> - </patternset> - </unzip> - <install-stub-and-upstream-file - file="target/download/tmp/edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger" - md5="a209ddb12142338762a7cb848733a916" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20140827" - metaDataVersion="1" - tool="tagger" - language="de" - variant="dewac" - extension="tagger"> - <metadata> - <entry key="pos.tagset" value="stts"/> - </metadata> - </install-stub-and-upstream-file> - <delete dir="target/download/tmp"/> - </target> - <target name="tagger-de-ud" depends="download-de"> <mkdir dir="target/download/tmp"/> <unzip src="target/download/de/de-models.zip" dest="target/download/tmp"> @@ -343,7 +312,7 @@ - 2014-10-24 | now | 0feec395d8db89909360a4a3e172b96a --> <get - src="http://www.thomas-zastrow.de/nlp/nemgp_stanford_01.gz" + src="https://www.thomas-zastrow.de/nlp/nemgp_stanford_01.gz" dest="target/download/nemgp_stanford_01.ser.gz" skipexisting="true"/> <install-stub-and-upstream-file @@ -371,46 +340,25 @@ </install-stub-and-upstream-file> </target> - <target name="ner-de-hgc_175m_600.crf" depends="download-de"> - <mkdir dir="target/download/tmp"/> - <unzip src="target/download/de/de-models.zip" dest="target/download/tmp"> - <patternset> - <include name="edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.*"/> - </patternset> - </unzip> + <target name="ner-de-germeval2014.hgc_175m_600.crf" depends="download-de"> + <mkdir dir="target/download/tmp"/> + <unzip src="target/download/de/de-models.zip" dest="target/download/tmp"> + <patternset> + <include name="edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.*"/> + </patternset> + </unzip> <install-stub-and-upstream-file - file="target/download/tmp/edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz" - md5="596e40c6b5a38a322ae9d66a73396c00" + file="target/download/tmp/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz" + md5="6071e39c77d6956498ae64557c24e82a" groupId="de.tudarmstadt.ukp.dkpro.core" artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20161213" - metaDataVersion="1" + upstreamVersion="20180227" + metaDataVersion="1" tool="ner" language="de" - variant="hgc_175m_600.crf" + variant="germeval2014.hgc_175m_600.crf" extension="ser.gz"/> - <delete dir="target/download/tmp"/> - </target> - - <target name="ner-de-dewac_175m_600.crf" depends="download-de"> - <mkdir dir="target/download/tmp"/> - <unzip src="target/download/de/de-models.zip" dest="target/download/tmp"> - <patternset> - <include name="edu/stanford/nlp/models/ner/german.dewac_175m_600.*"/> - </patternset> - </unzip> - <install-stub-and-upstream-file - file="target/download/tmp/edu/stanford/nlp/models/ner/german.dewac_175m_600.crf.ser.gz" - md5="7cfbf0f0464bee2105e9db193fe69097" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20150130" - metaDataVersion="1" - tool="ner" - language="de" - variant="dewac_175m_600.crf" - extension="ser.gz"/> - <delete dir="target/download/tmp"/> + <delete dir="target/download/tmp"/> </target> <target name="parser-de-pcfg" depends="download-de"> @@ -505,16 +453,16 @@ <antcall target="tagger-en-twitter"/> <antcall target="tagger-en-twitter-fast"/> <antcall target="ner-en-muc.7class.distsim.crf"/> - <antcall target="ner-en-muc.7class.nodistsim.crf"/> + <antcall target="ner-en-muc.7class.nodistsim.crf"/> <antcall target="ner-en-muc.7class.caseless.distsim.crf"/> <antcall target="ner-en-conll.4class.distsim.crf"/> - <antcall target="ner-en-conll.4class.nodistsim.crf"/> + <antcall target="ner-en-conll.4class.nodistsim.crf"/> <antcall target="ner-en-conll.4class.caseless.distsim.crf"/> <antcall target="ner-en-all.3class.distsim.crf"/> - <antcall target="ner-en-all.3class.nodistsim.crf"/> + <antcall target="ner-en-all.3class.nodistsim.crf"/> <antcall target="ner-en-all.3class.caseless.distsim.crf"/> <antcall target="ner-en-nowiki.3class.caseless.distsim.crf"/> - <antcall target="ner-en-nowiki.3class.nodistsim.crf"/> + <antcall target="ner-en-nowiki.3class.nodistsim.crf"/> <antcall target="ner-en-freme-wikiner"/> <antcall target="parser-en-pcfg"/> <antcall target="parser-en-pcfg.caseless"/> @@ -538,10 +486,10 @@ <install-stub-and-upstream-file file="target/download/tmp/edu/stanford/nlp/models/pos-tagger/english-bidirectional/english-bidirectional-distsim.tagger" - md5="a56c39cc598cea170fc6902652552e25" + md5="fa2cbb839d97329a78ea47b3895339cc" groupId="de.tudarmstadt.ukp.dkpro.core" artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20140616" + upstreamVersion="20181002" metaDataVersion="1" tool="tagger" language="en" @@ -565,10 +513,10 @@ <install-stub-and-upstream-file file="target/download/tmp/edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger" - md5="38a80a1c6b0fb99bb233b470989135e0" + md5="5309d92af51bcbdf71d2c160c644a79c" groupId="de.tudarmstadt.ukp.dkpro.core" artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20140616" + upstreamVersion="20181002" metaDataVersion="1" tool="tagger" language="en" @@ -592,10 +540,10 @@ <install-stub-and-upstream-file file="target/download/tmp/edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger" - md5="26444b971e910cff226be355f39b687f" + md5="7b2bdb109c60237948ecfa7fca507dfe" groupId="de.tudarmstadt.ukp.dkpro.core" artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20140827" + upstreamVersion="20181002" metaDataVersion="0" tool="tagger" language="en" @@ -1377,21 +1325,21 @@ <target name="coref-en-default" depends="download-core"> <install-stub-and-upstream-folder - folder="target/download/core/coref-en" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" + folder="target/download/core/coref-en" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" upstreamVersion="${core.version}" metaDataVersion="1" - tool="coref" - language="en" - variant="default"/> + tool="coref" + language="en" + variant="default"/> </target> <target name="es"> <antcall target="tagger-es-default"/> <antcall target="tagger-es-distsim"/> <antcall target="ner-es-ancora.distsim.s512.crf"/> - <antcall target="ner-es-freme-wikiner"/> + <antcall target="ner-es-freme-wikiner"/> <antcall target="parser-es-pcfg"/> <antcall target="parser-es-sr"/> <antcall target="parser-es-sr-beam"/> @@ -1918,7 +1866,6 @@ <target name="zh"> <antcall target="tagger-zh-distsim"/> - <antcall target="tagger-zh-nodistsim"/> <antcall target="parser-zh-factored"/> <antcall target="parser-zh-pcfg"/> <antcall target="parser-zh-xinhua-factored"/> @@ -1952,29 +1899,6 @@ <delete dir="target/download/tmp"/> </target> - <target name="tagger-zh-nodistsim" depends="download-tagger"> - <!-- - - Non-english models are not contained in the CoreNLP models (yet) - - We cannot use the one from the stanford-parser:models artifact, because the packages - - collide with the corenlp:models. - --> - <install-stub-and-upstream-file - file="target/download/tagger/chinese-nodistsim.tagger" - md5="dc7cee311eaeeb8ff799f0d6574416f5" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.stanfordnlp" - upstreamVersion="20140616" - metaDataVersion="1" - tool="tagger" - language="zh" - variant="nodistsim" - extension="tagger" > - <metadata> - <entry key="pos.tagset" value="ctb"/> - </metadata> - </install-stub-and-upstream-file> - </target> - <target name="parser-zh-pcfg" depends="download-zh"> <mkdir dir="target/download/tmp"/> <unzip src="target/download/zh/zh-models.zip" dest="target/download/tmp"> @@ -2142,21 +2066,21 @@ <target name="download-core" depends="-check-download-core" unless="download-core.DONE"> <mkdir dir="target/download/core"/> - <get - src="http://nlp.stanford.edu/software/stanford-corenlp-full-${core.date}.zip" - dest="target/download/core/core-nlp.zip" - skipexisting="true"/> - <unzip src="target/download/core/core-nlp.zip" dest="target/download/core"> - <patternset> - <include name="**/stanford-corenlp-*-models.jar"/> - </patternset> - <chainedmapper> - <mapper type="flatten"/> - <firstmatchmapper> - <globmapper from="*" to="stanford-corenlp-models.jar"/> - </firstmatchmapper> - </chainedmapper> - </unzip> + <get + src="http://nlp.stanford.edu/software/stanford-corenlp-full-${core.date}.zip" + dest="target/download/core/core-nlp.zip" + skipexisting="true"/> + <unzip src="target/download/core/core-nlp.zip" dest="target/download/core"> + <patternset> + <include name="**/stanford-corenlp-*-models.jar"/> + </patternset> + <chainedmapper> + <mapper type="flatten"/> + <firstmatchmapper> + <globmapper from="*" to="stanford-corenlp-models.jar"/> + </firstmatchmapper> + </chainedmapper> + </unzip> <unzip src="target/download/core/stanford-corenlp-models.jar" dest="target/download/core"> <patternset> <include name="**/*.ser.gz"/> @@ -2167,14 +2091,15 @@ </chainedmapper> </unzip> <unzip src="target/download/core/stanford-corenlp-models.jar" dest="target/download/core/coref-en"> - <patternset> - <include name="**/dcoref/*"/> + <patternset> + <include name="**/dcoref/*"/> + <include name="**/models/coref/**/*"/> </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> + <chainedmapper> + <mapper type="flatten"/> + </chainedmapper> </unzip> - <delete file="target/download/core/stanford-corenlp-models.jar"/> + <delete file="target/download/core/stanford-corenlp-models.jar"/> <touch file="target/download/core/DONE"/> </target> @@ -2207,181 +2132,157 @@ <target name="download-parser" depends="-check-download-parser" unless="download-parser.DONE"> <mkdir dir="target/download/parser"/> - <get - src="http://nlp.stanford.edu/software/stanford-parser-full-${parser.date}.zip" - dest="target/download/parser/parser.zip" - skipexisting="true"/> - <unzip src="target/download/parser/parser.zip" dest="target/download/parser"> - <patternset> - <include name="**/grammar/**/*.ser.gz"/> + <get + src="http://nlp.stanford.edu/software/stanford-parser-full-${parser.date}.zip" + dest="target/download/parser/parser.zip" + skipexisting="true"/> + <unzip src="target/download/parser/parser.zip" dest="target/download/parser"> + <patternset> + <include name="**/grammar/**/*.ser.gz"/> </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> - </unzip> + <chainedmapper> + <mapper type="flatten"/> + </chainedmapper> + </unzip> - <unzip src="target/download/parser/parser.zip" dest="target/download/parser"> - <patternset> - <include name="**/stanford-parser-*-models.jar"/> + <unzip src="target/download/parser/parser.zip" dest="target/download/parser"> + <patternset> + <include name="**/stanford-parser-*-models.jar"/> </patternset> - <chainedmapper> - <mapper type="flatten"/> - <firstmatchmapper> - <globmapper from="*" to="stanford-parser-models.jar"/> - </firstmatchmapper> - </chainedmapper> - </unzip> + <chainedmapper> + <mapper type="flatten"/> + <firstmatchmapper> + <globmapper from="*" to="stanford-parser-models.jar"/> + </firstmatchmapper> + </chainedmapper> + </unzip> <unzip src="target/download/parser/stanford-parser-models.jar" dest="target/download/parser"> - <patternset> - <include name="**/*.ser.gz"/> - <include name="**/*.parser*"/> + <patternset> + <include name="**/*.ser.gz"/> + <include name="**/*.parser*"/> </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> + <chainedmapper> + <mapper type="flatten"/> + </chainedmapper> </unzip> - <delete file="stanford-parser-models.jar"/> + <delete file="stanford-parser-models.jar"/> - <get + <get src="http://nlp.stanford.edu/software/stanford-chinese-corenlp-${core.chinese.date}-models.jar" dest="target/download/parser/stanford-chinese-corenlp-models.jar" skipexisting="true"/> <unzip src="target/download/parser/stanford-chinese-corenlp-models.jar" dest="target/download/parser"> - <patternset> - <include name="**/*.ser.gz"/> - <include name="**/*.parser*"/> + <patternset> + <include name="**/*.ser.gz"/> + <include name="**/*.parser*"/> </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> + <chainedmapper> + <mapper type="flatten"/> + </chainedmapper> </unzip> - <get - src="http://nlp.stanford.edu/software/stanford-srparser-${core.srparser.date}-models.jar" - dest="target/download/parser/stanford-srparser-models.jar" - skipexisting="true"/> + <get + src="http://nlp.stanford.edu/software/stanford-srparser-${core.srparser.date}-models.jar" + dest="target/download/parser/stanford-srparser-models.jar" + skipexisting="true"/> <unzip src="target/download/parser/stanford-srparser-models.jar" dest="target/download/parser"> - <patternset> - <include name="**/*.ser.gz"/> - <include name="**/*.parser*"/> + <patternset> + <include name="**/*.ser.gz"/> + <include name="**/*.parser*"/> </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> + <chainedmapper> + <mapper type="flatten"/> + </chainedmapper> </unzip> <touch file="target/download/parser/DONE"/> </target> - <target name="-check-download-twitie"> - <available property="download-twitie.DONE" file="target/download/twitie/DONE"/> - </target> - - <target name="download-twitie" depends="-check-download-twitie" unless="download-twitie.DONE"> - <mkdir dir="target/download/twitie"/> - - <get - src="https://s3-eu-west-1.amazonaws.com/software.gate.ac.uk/twitie/twitie-tagger.zip" - dest="target/download/twitie/twitie-tagger.zip" - skipexisting="true"/> - <touch file="target/download/twitie/DONE"/> - </target> - - <target name="-check-download-tagger"> - <available property="download-tagger.DONE" file="target/download/tagger/DONE"/> + <target name="-check-download-twitie"> + <available property="download-twitie.DONE" file="target/download/twitie/DONE"/> </target> - <target name="download-tagger" depends="-check-download-tagger" unless="download-tagger.DONE"> - <mkdir dir="target/download/tagger"/> - <get - src="http://nlp.stanford.edu/software/stanford-postagger-full-${tagger.date}.zip" - dest="target/download/tagger/postagger.zip" + <target name="download-twitie" depends="-check-download-twitie" unless="download-twitie.DONE"> + <mkdir dir="target/download/twitie"/> + + <get + src="https://s3-eu-west-1.amazonaws.com/software.gate.ac.uk/twitie/twitie-tagger.zip" + dest="target/download/twitie/twitie-tagger.zip" skipexisting="true"/> - <unzip src="target/download/tagger/postagger.zip" dest="target/download/tagger"> - <patternset> - <include name="**/models//*.tagger*"/> - </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> - </unzip> - <touch file="target/download/tagger/DONE"/> + <touch file="target/download/twitie/DONE"/> </target> - - <!-- - ============================================================================================ - MODELS - ============================================================================================ --> - <target name="-check-download-ar"> - <available property="download-ar.DONE" file="target/download/ar/DONE"/> - </target> + <target name="-check-download-ar"> + <available property="download-ar.DONE" file="target/download/ar/DONE"/> + </target> - <target name="download-ar" depends="-check-download-ar" unless="download-ar.DONE"> - <mkdir dir="target/download/ar"/> - <get src="http://nlp.stanford.edu/software/stanford-arabic-corenlp-${core.arabic.date}-models.jar" - dest="target/download/ar/ar-models.zip" skipexisting="true"/> - <touch file="target/download/ar/DONE"/> - </target> - - <target name="-check-download-en"> - <available property="download-en.DONE" file="target/download/en/DONE"/> - </target> + <target name="download-ar" depends="-check-download-ar" unless="download-ar.DONE"> + <mkdir dir="target/download/ar"/> + <get src="http://nlp.stanford.edu/software/stanford-arabic-corenlp-${core.arabic.date}-models.jar" + dest="target/download/ar/ar-models.zip" skipexisting="true"/> + <touch file="target/download/ar/DONE"/> + </target> - <target name="download-en" depends="-check-download-en" unless="download-en.DONE"> - <mkdir dir="target/download/en"/> - <get src="http://nlp.stanford.edu/software/stanford-english-corenlp-${core.english.date}-models.jar" - dest="target/download/en/en-models.zip" skipexisting="true"/> - <touch file="target/download/en/DONE"/> - </target> + <target name="-check-download-en"> + <available property="download-en.DONE" file="target/download/en/DONE"/> + </target> - <target name="-check-download-de"> - <available property="download-de.DONE" file="target/download/de/DONE"/> - </target> + <target name="download-en" depends="-check-download-en" unless="download-en.DONE"> + <mkdir dir="target/download/en"/> + <get src="http://nlp.stanford.edu/software/stanford-english-corenlp-${core.english.date}-models.jar" + dest="target/download/en/en-models.zip" skipexisting="true"/> + <touch file="target/download/en/DONE"/> + </target> - <target name="download-de" depends="-check-download-de" unless="download-de.DONE"> - <mkdir dir="target/download/de"/> - <get src="http://nlp.stanford.edu/software/stanford-german-corenlp-${core.german.date}-models.jar" - dest="target/download/de/de-models.zip" skipexisting="true"/> - <touch file="target/download/de/DONE"/> - </target> + <target name="-check-download-de"> + <available property="download-de.DONE" file="target/download/de/DONE"/> + </target> - <target name="-check-download-fr"> - <available property="download-fr.DONE" file="target/download/fr/DONE"/> - </target> + <target name="download-de" depends="-check-download-de" unless="download-de.DONE"> + <mkdir dir="target/download/de"/> + <get src="http://nlp.stanford.edu/software/stanford-german-corenlp-${core.german.date}-models.jar" + dest="target/download/de/de-models.zip" skipexisting="true"/> + <touch file="target/download/de/DONE"/> + </target> - <target name="download-fr" depends="-check-download-fr" unless="download-fr.DONE"> - <mkdir dir="target/download/fr"/> - <get src="http://nlp.stanford.edu/software/stanford-french-corenlp-${core.french.date}-models.jar" - dest="target/download/fr/fr-models.zip" skipexisting="true"/> - <touch file="target/download/fr/DONE"/> - </target> - - <target name="-check-download-es"> - <available property="download-es.DONE" file="target/download/es/DONE"/> - </target> + <target name="-check-download-fr"> + <available property="download-fr.DONE" file="target/download/fr/DONE"/> + </target> - <target name="download-es" depends="-check-download-es" unless="download-es.DONE"> - <mkdir dir="target/download/es"/> - <get src="http://nlp.stanford.edu/software/stanford-spanish-corenlp-${core.spanish.date}-models.jar" - dest="target/download/es/es-models.zip" skipexisting="true"/> - <touch file="target/download/es/DONE"/> - </target> - - <target name="-check-download-zh"> - <available property="download-zh.DONE" file="target/download/zh/DONE"/> - </target> + <target name="download-fr" depends="-check-download-fr" unless="download-fr.DONE"> + <mkdir dir="target/download/fr"/> + <get src="http://nlp.stanford.edu/software/stanford-french-corenlp-${core.french.date}-models.jar" + dest="target/download/fr/fr-models.zip" skipexisting="true"/> + <touch file="target/download/fr/DONE"/> + </target> + + <target name="-check-download-es"> + <available property="download-es.DONE" file="target/download/es/DONE"/> + </target> - <target name="download-zh" depends="-check-download-zh" unless="download-zh.DONE"> - <mkdir dir="target/download/zh"/> - <get src="http://nlp.stanford.edu/software/stanford-chinese-corenlp-${core.chinese.date}-models.jar" - dest="target/download/zh/zh-models.zip" skipexisting="true"/> - <touch file="target/download/zh/DONE"/> - </target> + <target name="download-es" depends="-check-download-es" unless="download-es.DONE"> + <mkdir dir="target/download/es"/> + <get src="http://nlp.stanford.edu/software/stanford-spanish-corenlp-${core.spanish.date}-models.jar" + dest="target/download/es/es-models.zip" skipexisting="true"/> + <touch file="target/download/es/DONE"/> + </target> + <target name="-check-download-zh"> + <available property="download-zh.DONE" file="target/download/zh/DONE"/> + </target> + + <target name="download-zh" depends="-check-download-zh" unless="download-zh.DONE"> + <mkdir dir="target/download/zh"/> + <get src="http://nlp.stanford.edu/software/stanford-chinese-corenlp-${core.chinese.date}-models.jar" + dest="target/download/zh/zh-models.zip" skipexisting="true"/> + <touch file="target/download/zh/DONE"/> + </target> <target name="jar-notice"> <echo>================================</echo> diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordCoreferenceResolverTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordCoreferenceResolverTest.java deleted file mode 100644 index e7de3c62ed..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordCoreferenceResolverTest.java +++ /dev/null @@ -1,237 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertFalse; -import java.util.ArrayList; -import java.util.List; -import java.util.logging.ConsoleHandler; -import java.util.logging.Filter; -import java.util.logging.Level; -import java.util.logging.LogManager; -import java.util.logging.LogRecord; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import edu.stanford.nlp.dcoref.Constants; - -/** - */ -public class StanfordCoreferenceResolverTest -{ - @Test - public void test() - throws Exception - { - JCas jcas = runTest("en", "John bought a car. He is very happy with it."); - - String[][] ref = { - { "John", "He" }, - { "a car", "it" } }; - - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - } - - // https://github.com/dkpro/dkpro-core/issues/582 - // Jan 22, 2015 5:11:54 PM edu.stanford.nlp.dcoref.Document findSpeaker - // WARNING: Cannot find node in dependency for word rally - // Jan 22, 2015 5:11:54 PM edu.stanford.nlp.dcoref.Document findSpeaker - // WARNING: Cannot find node in dependency for word told - @Test - public void test2() - throws Exception - { - final List<LogRecord> records = new ArrayList<LogRecord>(); - ConsoleHandler handler = (ConsoleHandler) LogManager.getLogManager().getLogger("") - .getHandlers()[0]; - java.util.logging.Level oldLevel = handler.getLevel(); - handler.setLevel(Level.ALL); - handler.setFilter(new Filter() - { - @Override - public boolean isLoggable(LogRecord record) - { - records.add(record); - return false; - } - }); - - try { - JCas jcas = runTest("en", - "\" We cannot forgive this war , \" Miyako Fuji , 20 , one of the rally 's " - + "organisers told Jiji news agency ."); - - String[][] ref = { - { "Jiji" }, - { "We" }, - { "this war" }, - { "Miyako Fuji , 20 , one of the rally 's organisers" }, - { "Miyako Fuji , 20" }, - { "Miyako Fuji", "20" }, - { "one of the rally 's organisers" }, - { "Jiji news agency" } }; - - for (LogRecord r : records) { - assertFalse(r.getMessage().contains("Cannot find node in dependency for word")); - } - - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - } - finally { - if (oldLevel != null) { - handler.setLevel(oldLevel); - handler.setFilter(null); - } - } - } - - @Test - public void testDictionarySieve() - throws Exception - { - JCas jcas = runTest("en", "John joined Google in 2012. He is doing research for the company.", - Constants.SIEVEPASSES + ",CorefDictionaryMatch"); - - String[][] ref = new String[][] { - { "John", "He" }, - { "Google", "the company" }, - { "2012" } }; - - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - } - - @Test - public void testTriggerReparse() - throws Exception - { - JCas jcas = runTest("en", "'Let's go! I want to see the Don', he said."); - - String[][] ref = { - { "'s", "I" }, - { "the Don'", "he" } }; - - String[] pennTree = { - "(ROOT (S (`` ') (VP (VB Let) (S (NP (PRP 's)) (VP (VB go)))) (. !)))", - "(ROOT (S (S (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB see) (NP (DT the) " - + "(NNPS Don) (POS '))))))) (, ,) (NP (PRP he)) (VP (VBD said)) (. .)))" - }; - - AssertAnnotations.assertPennTree(pennTree, select(jcas, PennTree.class)); - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - } - - @Test - @Ignore("Disabled due to side effects on parser unit tests. See issue 175") - public void testTriggerReparse1() - throws Exception - { - JCas jcas = runTest("en", - "Other major domestic initiatives in his presidency include the Patient Protection and " + - "Affordable Care Act, often referred to as \"Obamacare\"; the Dodd–Frank Wall Street Reform and " + - "Consumer Protection Act; the Don't Ask, Don't Tell Repeal Act of 2010; the Budget Control " + - "Act of 2011; and the American Taxpayer Relief Act of 2012."); - - String[][] ref = { - { "Other major domestic initiatives in his presidency" }, - { "his presidency" }, - { "his" }, - { "the Patient Protection and Affordable Care Act, often referred to as \"Obamacare\"; the Dodd–Frank Wall Street Reform and Consumer Protection Act; the Don't Ask" }, - { "the Patient Protection and Affordable Care Act" }, - { "the Patient Protection" }, - { "Affordable Care Act" }, - { "\"Obamacare\"; the Dodd–Frank Wall Street Reform and Consumer Protection Act;" }, - { "the Dodd" }, - { "Frank Wall Street Reform and Consumer Protection Act" }, - { "Frank Wall Street Reform" }, - { "Consumer Protection Act" }, - { "Repeal Act of 2010; the Budget Control Act of 2011; and the American Taxpayer Relief Act of 2012" }, - { "2010" }, - { "the Budget Control Act of 2011" }, - { "the American Taxpayer Relief Act of 2012" }, - { "2011" }, - { "2012" } }; - - String[] pennTree = { - "(ROOT (S (NP (NP (JJ Other) (JJ major) (JJ domestic) (NNS initiatives)) (PP (IN in) " - + "(NP (PRP$ his) (NN presidency)))) (VP (VBP include) (SBAR (S (NP (NP (DT the) " - + "(NNP Patient) (NNP Protection) (CC and) (NNP Affordable) (NNP Care) (NNP Act)) " - + "(, ,) (VP (ADVP (RB often)) (VBN referred) (PP (TO to) (SBAR (IN as) (S (NP " - + "(`` \") (NP (NNP Obamacare)) ('' \") (PRN (: ;) (S (NP (DT the) (NNP Dodd)) (VP " - + "(VBP –) (NP (NP (NNP Frank) (NNP Wall) (NNP Street) (NNP Reform)) (CC and) (NP " - + "(NNP Consumer) (NNP Protection) (NNP Act))))) (: ;))) (DT the) (VP (VBP Do) " - + "(RB n't) (VP (VB Ask))))))) (, ,)) (VP (VBP Do) (RB n't) (VP (VB Tell) (NP (NP " - + "(NP (NN Repeal) (NNP Act)) (PP (IN of) (NP (CD 2010)))) (: ;) (NP (NP (DT the) " - + "(NNP Budget) (NNP Control) (NNP Act)) (PP (IN of) (NP (CD 2011)))) (: ;) " - + "(CC and) (NP (NP (DT the) (NNP American) (NNP Taxpayer) (NNP Relief) (NNP Act)) " - + "(PP (IN of) (NP (CD 2012)))))))))) (. .)))" - }; - - AssertAnnotations.assertPennTree(pennTree, select(jcas, PennTree.class)); - AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); - } - - private JCas runTest(String aLanguage, String aText) - throws Exception - { - return runTest(aLanguage, aText, Constants.SIEVEPASSES); - } - - - private JCas runTest(String aLanguage, String aText, String aSieves) - throws Exception - { - AssumeResource.assumeResource(StanfordCoreferenceResolver.class, "coref", aLanguage, - "default"); - - // Coreference resolution requires the parser and the NER to run before - AnalysisEngine engine = createEngine(createEngineDescription( - createEngineDescription(StanfordSegmenter.class), - createEngineDescription(StanfordParser.class, - StanfordParser.PARAM_WRITE_CONSTITUENT, true, - StanfordParser.PARAM_WRITE_DEPENDENCY, true, - StanfordParser.PARAM_WRITE_PENN_TREE, true, - StanfordParser.PARAM_WRITE_POS, true), - createEngineDescription(StanfordLemmatizer.class), - createEngineDescription(StanfordNamedEntityRecognizer.class), - createEngineDescription(StanfordCoreferenceResolver.class, - StanfordCoreferenceResolver.PARAM_SIEVES, aSieves))); - - // Set up a simple example - JCas jcas = engine.newJCas(); - jcas.setDocumentLanguage(aLanguage); - jcas.setDocumentText(aText); - engine.process(jcas); - - return jcas; - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordLemmatizerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordLemmatizerTest.java deleted file mode 100644 index 507a4282d4..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordLemmatizerTest.java +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class StanfordLemmatizerTest -{ - @Test - public void testUnderscore() throws Exception - { - runTest("en", "foo _ bar", - new String[] { "foo", "_", "bar" }); - } - - @Test - public void testEnglish() throws Exception - { - runTest("en", "This is a test .", - new String[] { "this", "be", "a", "test", "." }); - - runTest("en", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible .", - new String[] { "we", "need", "a", "very", "complicated", "example", - "sentence", ",", "which", "contain", "as", "many", "constituent", "and", - "dependency", "as", "possible", "." }); - } - - @Test(expected = AnalysisEngineProcessException.class) - public void testNotEnglish() - throws Exception - { - runTest("de", "Das ist ein test .", new String[] {} ); - } - - @Test - public void testUrl() throws Exception - { - runTest("en", - "Details hinzu findet man unter http://www.armytimes.com/news/2009/11/army_M4_112109w/ .", - new String[] { "detail", "hinzu", "findet", "man", "unter", - "http://www.armytimes.com/news/2009/11/army_m4_112109w/", "." }); - } - - private void runTest(String aLanguage, String testDocument, String[] lemmas) - throws Exception - { - AnalysisEngineDescription posTagger = createEngineDescription(StanfordPosTagger.class); - AnalysisEngineDescription lemmatizer = createEngineDescription(StanfordLemmatizer.class); - - JCas aJCas = TestRunner.runTest(createEngineDescription(posTagger, lemmatizer), - aLanguage, testDocument); - - AssertAnnotations.assertLemma(lemmas, select(aJCas, Lemma.class)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTest.java deleted file mode 100644 index 05832c2687..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTest.java +++ /dev/null @@ -1,365 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -/** - */ -public class StanfordNamedEntityRecognizerTest -{ - @Test - public void testDutchFremeNer() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("nl", "freme-wikiner", "10 jaar Markus werkzaam bij SAP in Duitsland ."); - - String[] ne = { - "[ 8, 14]Person(I-PER) (Markus)", - "[ 28, 31]Organization(I-ORG) (SAP)", - "[ 35, 44]Location(I-LOC) (Duitsland)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", null, "IBM where John Miller works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 21]Person(PERSON) (John Miller)", - "[ 34, 41]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testEnglishAdjacent() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", null, "Jake John called late at night ."); - - String[] ne = { - "[ 0, 9]Person(PERSON) (Jake John)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testEnglishFremeNer() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "freme-wikiner", "IBM where John Miller works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(I-ORG) (IBM)", - "[ 10, 21]Person(I-PER) (John Miller)", - "[ 34, 41]Location(I-LOC) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void test3classCaselessEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "all.3class.caseless.distsim.crf", "ibm where john works is in germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (ibm)", - "[ 10, 14]Person(PERSON) (john)", - "[ 27, 34]Location(LOCATION) (germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testNoWiki3classCaselessEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "nowiki.3class.caseless.distsim.crf", "ibm where john works is in germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (ibm)", - "[ 10, 14]Person(PERSON) (john)", - "[ 27, 34]Location(LOCATION) (germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - - @Test - public void test4classEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "conll.4class.distsim.crf", "IBM where John works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 14]Person(PERSON) (John)", - "[ 27, 34]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - - @Test - public void test4classCaselessEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "conll.4class.caseless.distsim.crf", "ibm where john works is in germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (ibm)", - "[ 10, 14]Person(PERSON) (john)", - "[ 27, 34]Location(LOCATION) (germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void test4classCaselessMixedEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "conll.4class.caseless.distsim.crf", "IBM where john works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 14]Person(PERSON) (john)", - "[ 27, 34]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void test7classEnglish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", "muc.7class.distsim.crf", "IBM where John works is in Germany ."); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 14]Person(PERSON) (John)", - "[ 27, 34]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testEnglishWithNEInLastToken() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("en", null, "IBM where John works is in Germany"); - - String[] ne = { - "[ 0, 3]Organization(ORGANIZATION) (IBM)", - "[ 10, 14]Person(PERSON) (John)", - "[ 27, 34]Location(LOCATION) (Germany)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testGerman() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("de", null, "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); - - String[] ne = { - "[ 0, 6]Person(I-PER) (Markus)", - "[ 35, 38]Organization(I-ORG) (SAP)", - "[ 42, 53]Location(I-LOC) (Deutschland)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testGermanNemgp() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("de", "nemgp", "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); - - String[] ne = { - "[ 0, 6]Person(PER) (Markus)", - "[ 35, 38]Organization(ORG) (SAP)", - "[ 42, 53]Location(LOC) (Deutschland)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testHgcGerman() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("de", "hgc_175m_600.crf", "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); - - String[] ne = { - "[ 0, 6]Person(I-PER) (Markus)", - "[ 35, 38]Organization(I-ORG) (SAP)", - "[ 42, 53]Location(I-LOC) (Deutschland)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testFrenchFremeNer() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("fr", "freme-wikiner", "Il y a 10 ans Markus travaille dans SAP en Allemagne ."); - - String[] ne = { - "[ 14, 20]Person(I-PER) (Markus)", - "[ 36, 39]Organization(I-ORG) (SAP)", - "[ 43, 52]Location(I-LOC) (Allemagne)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testItalianFremeNer() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("it", "freme-wikiner", "10 anni fa Markus lavora in SAP in Germania ."); - - String[] ne = { - "[ 11, 17]Person(I-PER) (Markus)", - "[ 28, 31]Organization(I-ORG) (SAP)", - "[ 35, 43]Location(I-LOC) (Germania)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testRussianFremeNer() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("ru", "freme-wikiner", "10 лет Маркус работал в SAP в Германии ."); - - String[] ne = { - "[ 7, 13]Person(I-PER) (Маркус)", - "[ 24, 27]Organization(I-ORG) (SAP)", - "[ 30, 38]Location(I-LOC) (Германии)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testSpanish() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("es", null, "Hace 10 años Markus trabaja en SAP en Alemania ."); - - String[] ne = { - "[ 13, 19]Person(PERS) (Markus)", - "[ 31, 34]Organization(ORG) (SAP)", - "[ 38, 46]Location(LUG) (Alemania)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test - public void testSpanishFremeNer() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("es", "freme-wikiner", "Hace 10 años Markus trabaja en SAP en Alemania ."); - - String[] ne = { - "[ 13, 19]Person(I-PER) (Markus)", - "[ 31, 34]NamedEntity(I-MISC) (SAP)", - "[ 38, 46]Location(I-LOC) (Alemania)" }; - - AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); - } - - @Test(expected = AnalysisEngineProcessException.class) - public void testMissingModel() throws Exception - { - runTest("xx", null, "Xec xena Xeo ."); - } - - private JCas runTest(String language, String variant, String testDocument) - throws Exception - { - AssumeResource.assumeResource(StanfordNamedEntityRecognizer.class, "ner", language, - variant); - - AnalysisEngine engine = createEngine(StanfordNamedEntityRecognizer.class, - StanfordNamedEntityRecognizer.PARAM_VARIANT, variant, - StanfordNamedEntityRecognizer.PARAM_PRINT_TAGSET, true); - - return TestRunner.runTest(engine, language, testDocument); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainerTest.java deleted file mode 100644 index e7388f5e25..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainerTest.java +++ /dev/null @@ -1,142 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.iteratePipeline; -import static org.junit.Assert.assertEquals; - -import java.io.File; -import java.io.IOException; -import java.util.List; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.factory.ConfigurationParameterFactory; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.eval.EvalUtil; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2002Reader; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2002Reader.ColumnSeparators; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2002Writer; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - -public class StanfordNamedEntityRecognizerTrainerTest -{ - private Dataset ds; - - @Test - public void test() - throws Exception - { - File targetFolder = testContext.getTestOutputFolder(); - - System.out.println("Target Folder: " + targetFolder.getAbsolutePath()); - Split split = ds.getDefaultSplit(); - - File model = new File(targetFolder, "ner-model.ser.gz"); - - File properties = new File("ner/train-english.props"); - - File[] trainingFiles = split.getTrainingFiles(); - for (File file : trainingFiles) { - System.out.println("Training file: " + file.getAbsolutePath()); - } - - CollectionReaderDescription trainReader = createReaderDescription(Conll2002Reader.class, - Conll2002Reader.PARAM_PATTERNS, split.getDevelopmentFiles(), - Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(), - Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), - Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, - Conll2002Reader.PARAM_HAS_HEADER, true, - Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true); - - AnalysisEngineDescription trainer = createEngineDescription( - StanfordNamedEntityRecognizerTrainer.class, - StanfordNamedEntityRecognizerTrainer.PARAM_TARGET_LOCATION, model, - StanfordNamedEntityRecognizerTrainer.PARAM_PROPERTIES_LOCATION, properties, - StanfordNamedEntityRecognizerTrainer.PARAM_LABEL_SET, "noprefix", - StanfordNamedEntityRecognizerTrainer.PARAM_RETAIN_CLASS, true); - - SimplePipeline.runPipeline(trainReader, trainer); - - // Apply model and collect labels - System.out.println("Applying model to test data"); - CollectionReaderDescription testReader = createReaderDescription(Conll2002Reader.class, - Conll2002Reader.PARAM_PATTERNS, split.getTestFiles(), - Conll2002Reader.PARAM_LANGUAGE, "de", - Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), - Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, - Conll2002Reader.PARAM_HAS_HEADER, true, - Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true, - Conll2002Reader.PARAM_READ_NAMED_ENTITY, false); - - AnalysisEngineDescription ner = createEngineDescription(StanfordNamedEntityRecognizer.class, - StanfordNamedEntityRecognizer.PARAM_PRINT_TAGSET, true, - StanfordNamedEntityRecognizer.PARAM_MODEL_LOCATION, model); - - AnalysisEngineDescription writer = createEngineDescription( - Conll2002Writer.class, - Conll2002Writer.PARAM_SINGULAR_TARGET, true, - Conll2002Writer.PARAM_TARGET_LOCATION, new File(targetFolder, "output.conll")); - - List<Span<String>> actual = EvalUtil.loadSamples(iteratePipeline(testReader, ner, writer), - NamedEntity.class, NamedEntity::getValue); - System.out.printf("Actual samples: %d%n", actual.size()); - - // Read reference data collect labels - ConfigurationParameterFactory.setParameter(testReader, - Conll2002Reader.PARAM_READ_NAMED_ENTITY, true); - List<Span<String>> expected = EvalUtil.loadSamples(testReader, NamedEntity.class, NamedEntity::getValue); - System.out.printf("Expected samples: %d%n", expected.size()); - - Result results = EvalUtil.dumpResults(targetFolder, expected, actual); - - // Using split.getTrainingFiles() with 10GB heap takes ~80 minutes to train - // F-score 0.692730 - // Precision 0.765778 - // Recall 0.632405 - - // - assertEquals(0.493260, results.getFscore(), 0.0001); - assertEquals(0.621921, results.getPrecision(), 0.0001); - assertEquals(0.408708, results.getRecall(), 0.0001); - } - - @Before - public void setup() - throws IOException - { - DatasetFactory loader = new DatasetFactory(DkproTestContext.getCacheFolder()); - ds = loader.load("germeval2014-de"); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordParserTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordParserTest.java deleted file mode 100644 index 98b8a96811..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordParserTest.java +++ /dev/null @@ -1,1253 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectSingle; -import static org.junit.Assert.assertTrue; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang3.ArrayUtils; -import org.apache.log4j.Appender; -import org.apache.log4j.Logger; -import org.apache.log4j.spi.LoggingEvent; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.TreeUtils; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; -import edu.stanford.nlp.ling.StringLabel; -import edu.stanford.nlp.trees.Tree; - -/** - */ -public class StanfordParserTest -{ - private static final String[] GERMAN_POS_TAGS = { "$,", "$.", "$[", ".$$.", "ADJA", "ADJD", - "ADV", "APPO", "APPR", "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", - "KOUI", "KOUS", "NE", "NN", "PDAT", "PDS", "PIAT", "PIDAT", "PIS", "PPER", "PPOSAT", - "PPOSS", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", - "PTKZU", "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", - "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; - - private static final String[] GERMAN_CONSTITUENT_TAGS = { "AA", "AP", "AVP", "CAC", "CAP", - "CAVP", "CCP", "CH", "CNP", "CO", "CPP", "CS", "CVP", "CVZ", "DL", "ISU", "MPN", "MTA", - "NM", "NP", "NUR", "PP", "QL", "ROOT", "S", "VP", "VZ" }; - - private static final String[] ENGLISH_POS_TAGS = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", - ".$$.", ":", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", - "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", - "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; - - private static final String[] ENGLISH_POS_UNMAPPED = { ".$$."}; - - private static final String[] ENGLISH_CONSTITUENT_TAGS = { "ADJP", "ADVP", "CONJP", "FRAG", - "INTJ", "LST", "NAC", "NP", "NX", "PP", "PRN", "PRT", "QP", "ROOT", "RRC", "S", "SBAR", - "SBARQ", "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X" }; - - private static final String[] ENGLISH_CONSTITUENT_UNMAPPED = { }; - - private static final String[] ENGLISH_DEPENDENCY_TAGS = { "acomp", "advcl", "advmod", "agent", - "amod", "appos", "arg", "aux", "auxpass", "cc", "ccomp", "comp", "conj", "cop", - "csubj", "csubjpass", "dep", "det", "discourse", "dobj", "expl", "goeswith", "gov", - "iobj", "mark", "mod", "mwe", "neg", "nn", "npadvmod", "nsubj", "nsubjpass", "num", - "number", "obj", "parataxis", "pcomp", "pobj", "poss", "possessive", "preconj", "pred", - "predet", "prep", "prt", "punct", "quantmod", "rcmod", "ref", "rel", "sdep", "subj", - "tmod", "vmod", "xcomp" }; - - private static final String[] SPANISH_POS_TAGS = { ".$$.", "359000", "NCMS000", "ac0000", - "ao0000", "ap0000", "aq0000", "aqs000", "cc", "cs", "d00000", "da0000", "dd0000", - "de0000", "di0000", "dn0000", "do0000", "dp0000", "dt0000", "f0", "faa", "fat", "fc", - "fca", "fct", "fd", "fe", "fg", "fh", "fi", "fia", "fit", "fp", "fpa", "fpt", "fra", - "frc", "fs", "fsa", "ft", "fx", "fz", "i", "nc00000", "nc0a000", "nc0c000", "nc0n000", - "nc0p000", "nc0s000", "np00000", "p0000000", "pd000000", "pe000000", "pi000000", - "pn000000", "po000000", "pp000000", "pr000000", "pt000000", "px000000", "rg", "rn", - "sc000", "se000", "sp000", "va00000", "vag0000", "vaic000", "vaif000", "vaii000", - "vaip000", "vais000", "vam0000", "van0000", "vap0000", "vasi000", "vasp000", "vass000", - "vm00000", "vm0p000", "vmg0000", "vmi0000", "vmi2000", "vmic000", "vmif000", "vmii000", - "vmim000", "vmip000", "vmis000", "vmm0000", "vmmp000", "vmms000", "vmn0000", "vmp0000", - "vms0000", "vmsf000", "vmsi000", "vmsp000", "vq00000", "vs00000", "vsg0000", "vsic000", - "vsif000", "vsii000", "vsip000", "vsis000", "vsm0000", "vsmp000", "vsn0000", "vsp0000", - "vssf000", "vssi000", "vssp000", "vsss000", "w", "word", "z0", "zd", "zm", "zp", "zu" }; - - private static final String[] FRENCH_POS_TAGS = { ".$$.", "A", "ADJ", "ADJWH", "ADV", "ADVWH", - "C", "CC", "CL", "CLO", "CLR", "CLS", "CS", "DET", "DETWH", "ET", "I", "N", "NC", - "NPP", "P", "PREF", "PRO", "PROREL", "PROWH", "PUNC", "V", "VIMP", "VINF", "VPP", - "VPR", "VS" }; - - // TODO Maybe test link to parents (not tested by syntax tree recreation) - - @Test - public void testGermanPcfg() - throws Exception - { - JCas jcas = runTest("de", "pcfg", "Wir brauchen ein sehr kompliziertes Beispiel , welches " - + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); - - String[] constituentMapped = { "ADJP 17,35", "NP 13,111", "NP 55,100", "NP 71,100", - "ROOT 0,113", "S 0,113", "S 47,111" }; - - String[] constituentOriginal = { "AP 17,35", "CNP 71,100", "NP 13,111", "NP 55,100", - "ROOT 0,113", "S 0,113", "S 47,111" }; - - String[] synFunc = {}; - - String[] posOriginal = { "PPER", "VVFIN", "ART", "ADV", "ADJA", "NN", "$,", "PRELS", "ADV", - "PIDAT", "NN", "KON", "NN", "VVFIN", "$." }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_PUNCT", "POS_PRON", "POS_ADV", - "POS_PRON", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_VERB", "POS_PUNCT" }; - - String[] dependencies = {/** No dependencies for German */ }; - - String pennTree = "(ROOT (S (PPER Wir) (VVFIN brauchen) (NP (ART ein) (AP (ADV sehr) " - + "(ADJA kompliziertes)) (NN Beispiel) ($, ,) (S (PRELS welches) (NP " - + "(ADV möglichst) (PIDAT viele) (CNP (NN Konstituenten) (KON und) " - + "(NN Dependenzen))) (VVFIN beinhaltet))) ($. .)))"; - - String[] unmappedPos = { "$[", ".$$." }; - - String[] unmappedConst = { "NUR" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertSyntacticFunction(synFunc, select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "stts", GERMAN_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "negra", GERMAN_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas); - } - - @Test - public void testGermanFactored() - throws Exception - { - JCas jcas = runTest("de", "factored", - "Wir brauchen ein sehr kompliziertes Beispiel , welches " - + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); - - String[] constituentMapped = { "ADJP 17,35", "ADJP 55,70", "NP 13,111", "NP 55,100", - "NP 71,100", "ROOT 0,113", "S 0,113", "S 47,111" }; - - String[] constituentOriginal = { "AP 17,35", "AP 55,70", "CNP 71,100", "NP 13,111", - "NP 55,100", "ROOT 0,113", "S 0,113", "S 47,111" }; - - String[] posOriginal = { "PPER", "VVFIN", "ART", "ADV", "ADJA", "NN", "$,", "PRELS", "ADV", - "PIDAT", "NN", "KON", "NN", "VVFIN", "$." }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_PUNCT", "POS_PRON", "POS_ADV", - "POS_PRON", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_VERB", "POS_PUNCT" }; - - String[] dependencies = { /** No dependencies for German */ }; - - String pennTree = "(ROOT (S (PPER Wir) (VVFIN brauchen) (NP (ART ein) (AP " - + "(ADV sehr) (ADJA kompliziertes)) (NN Beispiel) ($, ,) (S (PRELS welches) " - + "(NP (AP (ADV möglichst) (PIDAT viele)) (CNP (NN Konstituenten) (KON und) " - + "(NN Dependenzen))) (VVFIN beinhaltet))) ($. .)))"; - - String[] unmappedPos = { "$[", ".$$." }; - - String[] unmappedConst = { "NUR" }; - - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "stts", GERMAN_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "negra", GERMAN_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas, true); - } - - @Test - public void testEnglishPcfg() - throws Exception - { - JCas jcas = runTest("en", "pcfg", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible ."); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", - "NP 8,110", "NP 8,43", "PP 61,98", "PP 99,110", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", - "NP 8,110", "NP 8,43", "PP 61,98", "PP 99,110", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", - "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", - "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", - "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " - + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " - + "(S (VP (VBZ contains) (PP (IN as) (NP (JJ many) (NNS constituents) (CC and) " - + "(NNS dependencies))) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; - - String[] unmappedDep = { "gov" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); - } - - @Test - public void testEnglishPcfgCollapsed() - throws Exception - { - JCas jcas = runTest("en", "pcfg", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible .", - StanfordParser.PARAM_MODE, StanfordParser.DependenciesMode.COLLAPSED_WITH_EXTRA); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", - "NP 8,110", "NP 8,43", "PP 61,98", "PP 99,110", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", - "NP 8,110", "NP 8,43", "PP 61,98", "PP 99,110", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 35, 43]NSUBJ(nsubj,enhanced) D[35,43](sentence) G[52,60](contains)", - "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", - "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", - "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", - "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " - + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " - + "(S (VP (VBZ contains) (PP (IN as) (NP (JJ many) (NNS constituents) (CC and) " - + "(NNS dependencies))) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; - - String[] unmappedDep = { "gov" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); - } - - @Test - public void testEnglishFactored() - throws Exception - { - JCas jcas = runTest("en", "factored", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible ."); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "ADJP 61,68", "NP 0,2", - "NP 61,98", "NP 8,110", "NP 8,43", "PP 99,110", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "ADJP 61,68", "NP 0,2", - "NP 61,98", "NP 8,110", "NP 8,43", "PP 99,110", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 61, 63]ADVMOD(advmod,basic) D[61,63](as) G[64,68](many)", - "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]DOBJ(dobj,basic) D[69,81](constituents) G[52,60](contains)", - "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", - "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", - "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " - + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " - + "(S (VP (VBZ contains) (NP (ADJP (RB as) (JJ many)) (NNS constituents) (CC and) " - + "(NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; - - String[] unmappedDep = { "gov" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); - } - - @Test - public void testEnglishKeepPunctuation() - throws Exception - { - JCas jcas = runTest("en", "rnn", "This is a test .", - StanfordParser.PARAM_KEEP_PUNCTUATION, true); - - String[] dependencies = { - "[ 0, 4]NSUBJ(nsubj,basic) D[0,4](This) G[10,14](test)", - "[ 5, 7]COP(cop,basic) D[5,7](is) G[10,14](test)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[10,14](test)", - "[ 10, 14]ROOT(root,basic) D[10,14](test) G[10,14](test)", - "[ 15, 16]PUNCT(punct,basic) D[15,16](.) G[10,14](test)" }; - - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - } - - @Test - public void testEnglishRnn() - throws Exception - { - JCas jcas = runTest("en", "rnn", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible ."); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 61,98", - "NP 8,110", "NP 8,43", "PP 99,110", "QP 61,68", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 61,98", - "NP 8,110", "NP 8,43", "PP 99,110", "QP 61,68", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 61, 63]QUANTMOD(quantmod,basic) D[61,63](as) G[64,68](many)", - "[ 64, 68]NUM(num,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]DOBJ(dobj,basic) D[69,81](constituents) G[52,60](contains)", - "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", - "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", - "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " - + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " - + "(S (VP (VBZ contains) (NP (QP (RB as) (JJ many)) (NNS constituents) (CC and) " - + "(NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; - - String[] unmappedDep = { "gov" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); - } - - @Test - public void testEnglishShiftReduce() - throws Exception - { - JCas jcas = runTestWithPosTagger("en", "sr", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible ."); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", - "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", - "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", - "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", - "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", - "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", - "[102,110]PREP(prep_as,basic) D[102,110](possible) G[69,81](constituents)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", - "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " - + "(JJ complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " - + "(S (VP (VBZ contains) (PP (IN as) (NP (NP (JJ many) (NNS constituents) " - + "(CC and) (NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))))) (. .)))"; - - String[] unmappedDep = { "gov" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); - } - - @Test - public void testEnglishShiftReduceBeam() - throws Exception - { - JCas jcas = runTestWithPosTagger("en", "sr-beam", "We need a very complicated example " - + "sentence , which contains as many constituents and dependencies as possible ."); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", - "NP 8,110", "NP 8,43", "PP 61,110", "PP 61,98", "PP 99,110", "ROOT 0,112", - "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", - "NP 8,110", "NP 8,43", "PP 61,110", "PP 61,98", "PP 99,110", "ROOT 0,112", - "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", - "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", - "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", - "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " - + "(JJ complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " - + "(S (VP (VBZ contains) (PP (PP (IN as) (NP (JJ many) (NNS constituents) " - + "(CC and) (NNS dependencies))) (PP (IN as) (ADJP (JJ possible))))))))) (. .)))"; - - String[] unmappedDep = { "gov" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); - } - - @Test - public void testEnglishWsjRnn() - throws Exception - { - JCas jcas = runTest("en", "wsj-rnn", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible ."); - - String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 61,98", - "NP 8,110", "NP 8,43", "PP 99,110", "QP 61,68", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 61,98", - "NP 8,110", "NP 8,43", "PP 99,110", "QP 61,68", "ROOT 0,112", "S 0,112", - "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", - "[ 61, 63]QUANTMOD(quantmod,basic) D[61,63](as) G[64,68](many)", - "[ 64, 68]NUM(num,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 81]DOBJ(dobj,basic) D[69,81](constituents) G[52,60](contains)", - "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", - "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PUNCT", "POS_DET", - "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", - "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; - - String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " - + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " - + "(S (VP (VBZ contains) (NP (QP (RB as) (JJ many)) (NNS constituents) (CC and) " - + "(NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; - - String[] unmappedDep = { "gov" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); - AssertAnnotations.assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); - } - - /** - * This test uses simple double quotes. - * - * @throws Exception - * if there is an error. - */ - @Test - public void testEnglishFactoredDirectSpeech() - throws Exception - { - JCas jcas = runTest("en", "factored", - "\" It 's cold outside , \" he said , \" and it 's starting to rain . \""); - - String[] posOriginal = new String[] { "``", "PRP", "VBZ", "JJ", "JJ", ",", "''", "PRP", - "VBD", ",", "``", "CC", "PRP", "VBZ", "VBG", "TO", "NN", ".", "''" }; - - String pennTree = "(ROOT (S (`` \") (S (NP (PRP It)) (VP (VBZ 's) (ADJP (JJ cold)) (S " - + "(ADJP (JJ outside))))) (PRN (, ,) ('' \") (S (NP (PRP he)) (VP (VBD said))) (, " - + ",) (`` \")) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (VP (VBG starting) (PP " - + "(TO to) (NP (NN rain)))))) (. .) ('' \")))"; - - AssertAnnotations.assertPOS(null, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - } - - /** - * This test uses UTF-8 quotes as they can be found in the British National Corpus. - * - * @throws Exception - * if there is an error. - */ - @Test - public void testEnglishFactoredDirectSpeech2() - throws Exception - { - // JCas jcas = runTest("en", "factored", - // "‘Prices are used as a barrier so that the sort of " + - // "people we don't want go over the road ,’ he said ."); - JCas jcas = runTest("en", "factored", new String[] { "‘", "It", "'s", "cold", "outside", - ",", "’", "he", "said", ",", "‘", "and", "it", "'s", "starting", "to", "rain", ".", - "’" }); - - String[] posOriginal = new String[] { "``", "PRP", "VBZ", "JJ", "JJ", ",", "''", "PRP", - "VBD", ",", "``", "CC", "PRP", "VBZ", "VBG", "TO", "NN", ".", "''" }; - - String pennTree = "(ROOT (S (`` ‘) (S (NP (PRP It)) (VP (VBZ 's) (ADJP (JJ cold)) (S " - + "(ADJP (JJ outside))))) (PRN (, ,) ('' ’) (S (NP (PRP he)) (VP (VBD said))) " - + "(, ,) (`` ‘)) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (VP (VBG starting) (PP " - + "(TO to) (NP (NN rain)))))) (. .) ('' ’)))"; - - AssertAnnotations.assertPOS(null, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - } - - @Test - public void testSpanishShiftReduceBeam() - throws Exception - { - JCas jcas = runTestWithPosTagger("es", "sr-beam", "Necesitamos una oración de ejemplo " - + "muy complicado , que contiene la mayor cantidad de componentes y dependencias " - + "como sea posible ."); - - String[] constituentMapped = { "ADJP 122,129", "ADJP 68,73", "ADVP 35,38", "CONJP 113,117", - "CONJP 98,99", "NP 100,112", "NP 12,129", "NP 16,129", "NP 27,34", "NP 65,112", - "NP 68,112", "NP 86,112", "NP 86,97", "PP 24,34", "PP 83,112", "ROOT 0,131", - "S 0,131", "S 113,129", "S 35,49", "S 50,129", "VP 0,11", "VP 118,121", "VP 56,64", - "X 12,15", "X 24,26", "X 39,49", "X 52,55", "X 65,67", "X 83,85" }; - - String[] constituentOriginal = { "ROOT 0,131", "S 113,129", "S 35,49", "S 50,129", - "conj 113,117", "conj 98,99", "grup.a 122,129", "grup.a 68,73", "grup.adv 35,38", - "grup.nom 100,112", "grup.nom 16,129", "grup.nom 27,34", "grup.nom 68,112", - "grup.nom 86,112", "grup.nom 86,97", "grup.verb 0,11", "grup.verb 118,121", - "grup.verb 56,64", "participi 39,49", "prep 24,26", "prep 83,85", "relatiu 52,55", - "s.a 122,129", "s.a 68,73", "sadv 35,38", "sentence 0,131", "sn 12,129", - "sn 27,34", "sn 65,112", "sn 86,112", "sp 24,34", "sp 83,112", "spec 12,15", - "spec 65,67" }; - - String[] dependencies = { }; - - String[] posMapped = { "POS_VERB", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADV", "POS_ADJ", "POS_PUNCT", "POS_PRON", - "POS_VERB", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_CONJ", "POS_VERB", "POS_ADJ", - "POS_PUNCT" }; - - String[] posOriginal = { "vmip000", "di0000", "nc0s000", "sp000", "nc0s000", "rg", - "aq0000", "fc", "pr000000", "vmip000", "da0000", "aq0000", "nc0s000", "sp000", - "nc0p000", "cc", "nc0p000", "cs", "vssp000", "aq0000", "fp" }; - - String pennTree = "(ROOT (sentence (grup.verb (vmip000 Necesitamos)) (sn (spec " - + "(di0000 una)) (grup.nom (nc0s000 oración) (sp (prep (sp000 de)) (sn " - + "(grup.nom (nc0s000 ejemplo)))) (S (sadv (grup.adv (rg muy))) (participi " - + "(aq0000 complicado))) (S (fc ,) (relatiu (pr000000 que)) (grup.verb " - + "(vmip000 contiene)) (sn (spec (da0000 la)) (grup.nom (s.a (grup.a " - + "(aq0000 mayor))) (nc0s000 cantidad) (sp (prep (sp000 de)) (sn (grup.nom " - + "(grup.nom (nc0p000 componentes)) (conj (cc y)) (grup.nom " - + "(nc0p000 dependencias))))))) (S (conj (cs como)) (grup.verb (vssp000 sea)) " - + "(s.a (grup.a (aq0000 posible))))))) (fp .)))"; - - String[] posTags = SPANISH_POS_TAGS; - - String[] constituentTags = { "ROOT", "S", "conj", "f", "gerundi", "grup.a", "grup.adv", - "grup.cc", "grup.cs", "grup.nom", "grup.prep", "grup.pron", "grup.verb", "grup.w", - "grup.z", "inc", "infinitiu", "interjeccio", "morfema.pronominal", "morfema.verbal", - "neg", "participi", "prep", "relatiu", "s.a", "sadv", "sentence", "sn", "sp", - "spec" }; - - String[] unmappedPos = { ".$$.", "359000", "NCMS000", "word" }; - - String[] unmappedConst = { "f" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertTagset(POS.class, "ancora", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ancora", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ancora", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ancora", unmappedConst, jcas); -// AssertAnnotations.assertTagset(Dependency.class, "stanford341", depTags, jcas); -// AssertAnnotations.assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); - } - - /** - * Tests the parser reading pre-existing POS tags - * - * @throws Exception - * if there is an error. - */ - @Test - public void testExistingPos() - throws Exception - { - AnalysisEngineDescription engine = createEngineDescription( - createEngineDescription(StanfordPosTagger.class), - createEngineDescription(StanfordParser.class, - StanfordParser.PARAM_READ_POS, true, - StanfordParser.PARAM_WRITE_POS, false, - StanfordParser.PARAM_WRITE_PENN_TREE, true)); - - JCas jcas = TestRunner.runTest(engine, "en", "This is a test ."); - - String[] posOriginal = new String[] { "DT", "VBZ", "DT", "NN", "." }; - - String pennTree = "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test))) (. .)))"; - - AssertAnnotations.assertPOS(null, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - } - - @Test - public void testFrenchFactored() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("fr", "factored", "Nous avons besoin d' une phrase par exemple très " - + "compliqué , qui contient des constituants que de nombreuses dépendances et que " - + "possible ."); - - String[] constituentMapped = { "ADJP 128,136", "ADVP 32,48", "NP 11,48", "NP 21,48", - "NP 61,64", "NP 74,90", "NP 95,120", "PP 18,48", "ROOT 0,138", "S 0,138", - "SBAR 124,136", "SBAR 61,90", "SBAR 91,136", "VP 0,58", "VP 65,73", "X 121,136", - "X 32,43" }; - - String[] constituentOriginal = { "AP 128,136", "AdP 32,48", "COORD 121,136", "MWADV 32,43", - "NP 11,48", "NP 21,48", "NP 61,64", "NP 74,90", "NP 95,120", "PP 18,48", - "ROOT 0,138", "SENT 0,138", "Srel 61,90", "Ssub 124,136", "Ssub 91,136", "VN 0,58", - "VN 65,73" }; - - String[] dependencies = {/** No dependencies for French */ }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADV", - "POS_VERB", "POS_PUNCT", "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_CONJ", "POS_DET", "POS_ADJ", "POS_NOUN", - "POS_CONJ", "POS_CONJ", "POS_ADJ", "POS_PUNCT" }; - - String[] posOriginal = { "CLS", "V", "NC", "P", "DET", "NC", "P", "N", "ADV", "VPP", - "PUNC", "PROREL", "V", "DET", "NC", "CS", "DET", "ADJ", "NC", "CC", "CS", "ADJ", - "PUNC" }; - - String pennTree = "(ROOT (SENT (VN (CLS Nous) (V avons) (NP (NC besoin) (PP (P d') (NP " - + "(DET une) (NC phrase) (AdP (MWADV (P par) (N exemple)) (ADV très))))) " - + "(VPP compliqué)) (PUNC ,) (Srel (NP (PROREL qui)) (VN (V contient)) (NP " - + "(DET des) (NC constituants))) (Ssub (CS que) (NP (DET de) (ADJ nombreuses) " - + "(NC dépendances)) (COORD (CC et) (Ssub (CS que) (AP (ADJ possible))))) " - + "(PUNC .)))"; - - String[] posTags = FRENCH_POS_TAGS; - - String[] constituentTags = { "AP", "AdP", "COORD", "MWA", "MWADV", "MWC", "MWCL", "MWD", - "MWET", "MWI", "MWN", "MWP", "MWPRO", "MWV", "NP", "PP", "ROOT", "SENT", "Sint", - "Srel", "Ssub", "VN", "VPinf", "VPpart" }; - - // NO DEP TAGS String[] depTags = {}; - - String[] unmappedPos = { ".$$." }; - - String[] unmappedConst = { "MWA", "MWADV", "MWC", "MWCL", "MWD", "MWET", - "MWI", "MWN", "MWP", "MWPRO", "MWV" }; - - // NO DEP TAGS String[] unmappedDep = {}; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - - AssertAnnotations.assertTagset(POS.class, "corenlp34", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "corenlp34", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ftb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ftb", unmappedConst, jcas); - // NO DEP TAGS AssertAnnotations.assertTagset(Dependency.class, null, depTags, jcas); - // NO DEP TAGS AssertAnnotations.assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); - } - - @Test - public void testFrench2() - throws Exception - { - Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); - - JCas jcas = runTest("fr", null, "La traduction d' un texte du français vers l' anglais ."); - - String[] constituentMapped = { "ADJP 29,37", "NP 0,53", "NP 17,37", "NP 43,53", "PP 14,37", - "PP 26,37", "PP 38,53", "ROOT 0,55", "S 0,55" }; - - String[] constituentOriginal = { "AP 29,37", "NP 0,53", "NP 17,37", "NP 43,53", "PP 14,37", - "PP 26,37", "PP 38,53", "ROOT 0,55", "SENT 0,55" }; - - String[] posMapped = { "POS_DET", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_ADP", "POS_DET", - "POS_NOUN", "POS_PUNCT" }; - - String[] posOriginal = { "DET", "NC", "P", "DET", "NC", "P", "ADJ", "P", "DET", "NC", - "PUNC" }; - - String pennTree = "(ROOT (SENT (NP (DET La) (NC traduction) (PP (P d') (NP (DET un) " - + "(NC texte) (PP (P du) (AP (ADJ français))))) (PP (P vers) (NP (DET l') " - + "(NC anglais)))) (PUNC .)))"; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - } - - @Test - public void testChineseFactored() - throws Exception - { - JCas jcas = runTest("zh", "factored", - "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); - - String[] constituentMapped = { "ADJP 12,14", "ADJP 9,14", "ADVP 20,22", "ADVP 37,40", - "ADVP 9,11", "NP 0,2", "NP 17,19", "NP 23,25", "NP 29,34", "NP 32,34", "NP 6,19", - "QP 29,31", "QP 6,8", "ROOT 0,47", "VP 26,34", "VP 26,45", "VP 3,19", "VP 37,45", - "VP 43,45", "X 0,19", "X 0,47", "X 20,45", "X 37,42", "X 9,16" }; - - String[] constituentOriginal = { "ADJP 12,14", "ADJP 9,14", "ADVP 20,22", "ADVP 37,40", - "ADVP 9,11", "DNP 9,16", "DVP 37,42", "IP 0,19", "IP 0,47", "IP 20,45", "NP 0,2", - "NP 17,19", "NP 23,25", "NP 29,34", "NP 32,34", "NP 6,19", "QP 29,31", "QP 6,8", - "ROOT 0,47", "VP 26,34", "VP 26,45", "VP 3,19", "VP 37,45", "VP 43,45" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](我们) G[3,5](需要)", - "[ 3, 5]ROOT(root,basic) D[3,5](需要) G[3,5](需要)", - "[ 6, 8]Dependency(nummod,basic) D[6,8](一个) G[17,19](句子)", - "[ 9, 11]ADVMOD(advmod,basic) D[9,11](非常) G[12,14](复杂)", - "[ 12, 14]Dependency(assmod,basic) D[12,14](复杂) G[17,19](句子)", - "[ 15, 16]Dependency(assm,basic) D[15,16](的) G[12,14](复杂)", - "[ 17, 19]DOBJ(dobj,basic) D[17,19](句子) G[3,5](需要)", - "[ 20, 22]ADVMOD(advmod,basic) D[20,22](例如) G[26,28](包含)", - "[ 23, 25]NSUBJ(nsubj,basic) D[23,25](其中) G[26,28](包含)", - "[ 26, 28]CONJ(conj,basic) D[26,28](包含) G[3,5](需要)", - "[ 29, 31]Dependency(nummod,basic) D[29,31](许多) G[32,34](成分)", - "[ 32, 34]DOBJ(dobj,basic) D[32,34](成分) G[26,28](包含)", - "[ 35, 36]CC(cc,basic) D[35,36](和) G[26,28](包含)", - "[ 37, 40]Dependency(dvpmod,basic) D[37,40](尽可能) G[43,45](依赖)", - "[ 41, 42]Dependency(dvpm,basic) D[41,42](的) G[37,40](尽可能)", - "[ 43, 45]CONJ(conj,basic) D[43,45](依赖) G[26,28](包含)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NUM", "POS_ADJ", "POS_ADJ", "POS_PART", "POS_NOUN", "POS_ADJ", "POS_NOUN", - "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", "POS_ADJ", "POS_PART", "POS_VERB", "POS_PUNCT" }; - - String[] posOriginal = { "PN", "VV", "CD", "AD", "JJ", "DEG", "NN", "AD", "NN", "VV", "CD", - "NN", "CC", "AD", "DEV", "VV", "PU" }; - - String pennTree = "(ROOT (IP (IP (NP (PN 我们)) (VP (VV 需要) (NP (QP (CD 一个)) (DNP " - + "(ADJP (ADVP (AD 非常)) (ADJP (JJ 复杂))) (DEG 的)) (NP (NN 句子))))) (IP (ADVP " - + "(AD 例如)) (NP (NN 其中)) (VP (VP (VV 包含) (NP (QP (CD 许多)) (NP (NN 成分)))) " - + "(CC 和) (VP (DVP (ADVP (AD 尽可能)) (DEV 的)) (VP (VV 依赖))))) (PU 。)))"; - - String[] posTags = { ".$$.", "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", - "DEV", "DT", "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NR", "NT", - "OD", "ON", "P", "PN", "PU", "SB", "SP", "URL", "VA", "VC", "VE", "VV", "X" }; - - String[] constituentTags = { "ADJP", "ADVP", "CLP", "CP", "DFL", "DNP", "DP", "DVP", "FLR", - "FRAG", "INC", "INTJ", "IP", "LCP", "LST", "NP", "PP", "PRN", "QP", "ROOT", "UCP", - "VCD", "VCP", "VNV", "VP", "VPT", "VRD", "VSB", "WHPP" }; - - // NO DEP TAGS String[] depTags = new String[] {}; - - String[] unmappedPos = { ".$$.", "URL" }; - - String[] unmappedConst = { "DFL", "FLR", "INC", "WHPP" }; - - // NO DEP TAGS String[] unmappedDep = new String[] {}; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ctb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ctb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ctb", unmappedConst, jcas); - // NO DEP TAGS AssertAnnotations.assertTagset(Dependency.class, null, depTags, jcas); - // NO DEP TAGS AssertAnnotations.assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); - } - - @Test - public void testChineseXinhuaFactored() - throws Exception - { - JCas jcas = runTest("zh", "xinhua-factored", - "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); - - String[] constituentMapped = { "ADVP 20,22", "ADVP 37,40", "ADVP 9,11", "NP 0,2", - "NP 17,19", "NP 23,25", "NP 29,34", "NP 32,34", "NP 43,45", "NP 6,45", "NP 9,19", - "QP 29,31", "QP 6,8", "ROOT 0,47", "VP 12,14", "VP 26,34", "VP 26,40", "VP 3,45", - "VP 37,40", "VP 9,14", "X 0,47", "X 20,40", "X 9,14", "X 9,16", "X 9,40", "X 9,42" }; - - String[] constituentOriginal = { "ADVP 20,22", "ADVP 37,40", "ADVP 9,11", "CP 9,16", - "CP 9,42", "IP 0,47", "IP 20,40", "IP 9,14", "IP 9,40", "NP 0,2", "NP 17,19", - "NP 23,25", "NP 29,34", "NP 32,34", "NP 43,45", "NP 6,45", "NP 9,19", "QP 29,31", - "QP 6,8", "ROOT 0,47", "VP 12,14", "VP 26,34", "VP 26,40", "VP 3,45", "VP 37,40", - "VP 9,14" }; - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](我们) G[3,5](需要)", - "[ 3, 5]ROOT(root,basic) D[3,5](需要) G[3,5](需要)", - "[ 6, 8]Dependency(nummod,basic) D[6,8](一个) G[43,45](依赖)", - "[ 9, 11]ADVMOD(advmod,basic) D[9,11](非常) G[12,14](复杂)", - "[ 12, 14]RCMOD(rcmod,basic) D[12,14](复杂) G[17,19](句子)", - "[ 15, 16]Dependency(cpm,basic) D[15,16](的) G[12,14](复杂)", - "[ 17, 19]NSUBJ(nsubj,basic) D[17,19](句子) G[26,28](包含)", - "[ 20, 22]ADVMOD(advmod,basic) D[20,22](例如) G[26,28](包含)", - "[ 23, 25]NSUBJ(nsubj,basic) D[23,25](其中) G[26,28](包含)", - "[ 26, 28]RCMOD(rcmod,basic) D[26,28](包含) G[43,45](依赖)", - "[ 29, 31]Dependency(nummod,basic) D[29,31](许多) G[32,34](成分)", - "[ 32, 34]DOBJ(dobj,basic) D[32,34](成分) G[26,28](包含)", - "[ 35, 36]CC(cc,basic) D[35,36](和) G[26,28](包含)", - "[ 37, 40]CONJ(conj,basic) D[37,40](尽可能) G[26,28](包含)", - "[ 41, 42]Dependency(cpm,basic) D[41,42](的) G[26,28](包含)", - "[ 43, 45]DOBJ(dobj,basic) D[43,45](依赖) G[3,5](需要)" }; - - String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NUM", "POS_ADJ", "POS_VERB", "POS_PART", "POS_NOUN", "POS_ADJ", "POS_NOUN", - "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", "POS_ADJ", "POS_PART", "POS_NOUN", "POS_PUNCT" }; - - String[] posOriginal = { "PN", "VV", "CD", "AD", "VA", "DEC", "NN", "AD", "NN", "VV", "CD", - "NN", "CC", "AD", "DEC", "NN", "PU" }; - - String pennTree = "(ROOT (IP (NP (PN 我们)) (VP (VV 需要) (NP (QP (CD 一个)) (CP (IP (NP " - + "(CP (IP (VP (ADVP (AD 非常)) (VP (VA 复杂)))) (DEC 的)) (NP (NN 句子))) (IP " - + "(ADVP (AD 例如)) (NP (NN 其中)) (VP (VP (VV 包含) (NP (QP (CD 许多)) (NP " - + "(NN 成分)))) (CC 和) (VP (ADVP (AD 尽可能)))))) (DEC 的)) (NP (NN 依赖)))) " - + "(PU 。)))"; - - String[] posTags = { ".$$.", "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", - "DEV", "DT", "ETC", "FW", "JJ", "LB", "LC", "M", "MSP", "NN", "NR", "NT", "OD", - "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" }; - - String[] constituentTags = { "ADJP", "ADVP", "CLP", "CP", "DNP", "DP", "DVP", "FRAG", "IP", - "LCP", "LST", "NP", "PP", "PRN", "QP", "ROOT", "UCP", "VCD", "VCP", "VNV", "VP", - "VPT", "VRD", "VSB" }; - - // NO DEP TAGS String[] depTags = new String[] {}; - - String[] unmappedPos = { ".$$." }; - - String[] unmappedConst = { }; - - // NO DEP TAGS String[] unmappedDep = new String[] {}; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "ctb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "ctb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "ctb", unmappedConst, jcas); - // NO DEP TAGS AssertAnnotations.assertTagset(Dependency.class, null, depTags, jcas); - // NO DEP TAGS AssertAnnotations.assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); - } - - @Test - public void testArabicFactored() - throws Exception - { - JCas jcas = runTest("ar", "factored", - "نحتاج مثالا معقدا جدا ل جملة تحتوي على أكبر قدر ممكن من العناصر و الروابط ."); - - String[] constituentMapped = { "NP 24,28", "NP 24,73", "NP 39,73", "NP 44,52", "NP 44,73", - "NP 56,73", "NP 6,21", "PP 22,73", "PP 35,73", "PP 53,73", "ROOT 0,75", "S 0,75", - "S 29,73", "SBAR 29,73", "VP 0,73", "VP 29,73" }; - - String[] constituentOriginal = { "NP 24,28", "NP 24,73", "NP 39,73", "NP 44,52", - "NP 44,73", "NP 56,73", "NP 6,21", "PP 22,73", "PP 35,73", "PP 53,73", "ROOT 0,75", - "S 0,75", "S 29,73", "SBAR 29,73", "VP 0,73", "VP 29,73" }; - - String[] dependencies = {}; - - String pennTree = "(ROOT (S (VP (VBP نحتاج) (NP (NN مثالا) (JJ معقدا) (NN جدا)) (PP (IN ل) (NP " - + "(NP (NN جملة)) (SBAR (S (VP (VBP تحتوي) (PP (IN على) (NP (NN أكبر) (NP (NP (NN قدر) " - + "(JJ ممكن)) (PP (IN من) (NP (DTNN العناصر) (CC و) (DTNN الروابط)))))))))))) (PUNC .)))"; - - String[] posMapped = { "POS_VERB", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_VERB", "POS_ADP", "POS_NOUN", - "POS_NOUN", "POS_ADJ", "POS_ADP", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_PUNCT" }; - - String[] posOriginal = { "VBP", "NN", "JJ", "NN", "IN", "NN", "VBP", "IN", "NN", "NN", - "JJ", "IN", "DTNN", "CC", "DTNN", "PUNC" }; - - String[] posTags = { ".$$.", "ADJ_NUM", "CC", "CD", "DT", "DTJJ", "DTJJR", "DTNN", "DTNNP", - "DTNNPS", "DTNNS", "IN", "JJ", "JJR", "NN", "NNP", "NNPS", "NNS", "NOUN_QUANT", - "PRP", "PRP$", "PUNC", "RB", "RP", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VN", - "WP", "WRB" }; - - String[] constituentTags = { "ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", - "PP", "PRN", "PRT", "ROOT", "S", "SBAR", "SBARQ", "SQ", "UCP", "VP", "WHADVP", - "WHNP", "WHPP", "X" }; - - String[] unmappedPos = { ".$$.", "ADJ_NUM", "NOUN_QUANT", "PRP$" }; - - String[] unmappedConst = { "LST" }; - - AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - AssertAnnotations.assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); - AssertAnnotations.assertConstituents(constituentMapped, constituentOriginal, - select(jcas, Constituent.class)); - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(POS.class, "atb", posTags, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "atb", unmappedPos, jcas); - AssertAnnotations.assertTagset(Constituent.class, "atb", constituentTags, jcas); - AssertAnnotations.assertTagsetMapping(Constituent.class, "atb", unmappedConst, jcas); - } - - /** - * This tests whether a complete syntax tree can be recreated from the annotations without any - * loss. Consequently, all links to children should be correct. (This makes no assertions about - * the parent-links, because they are not used for the recreation) - * - * @throws Exception - * if there is an error. - */ - @Test - public void testEnglishSyntaxTreeReconstruction() - throws Exception - { - JCas jcas = runTest("en", "factored", "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible ."); - - String pennOriginal = ""; - String pennFromRecreatedTree = ""; - - // As we only have one input sentence, each loop only runs once! - - for (PennTree curPenn : select(jcas, PennTree.class)) { - // get original penn representation of syntax tree - pennOriginal = curPenn.getPennTree(); - } - - for (ROOT curRoot : select(jcas, ROOT.class)) { - // recreate syntax tree - Tree recreation = TreeUtils.createStanfordTree(curRoot); - - // make a tree with simple string-labels - recreation = recreation.deepCopy(recreation.treeFactory(), StringLabel.factory()); - - pennFromRecreatedTree = recreation.pennString(); - } - - assertTrue("The recreated syntax-tree did not match the input syntax-tree.", - pennOriginal.equals(pennFromRecreatedTree)); - } - - @Test - public void testModelSharing() - throws Exception - { - // Save share override value (if any was set) and enable sharing for the StanfordParser - String prop = "dkpro.core.resourceprovider.sharable." + StanfordParser.class.getName(); - String oldValue = System.getProperty(prop); - System.setProperty(prop, "true"); - - final List<LoggingEvent> records = new ArrayList<LoggingEvent>(); - - // Tell the logger to log everything - Logger rootLogger = org.apache.log4j.LogManager.getRootLogger(); - final org.apache.log4j.Level oldLevel = rootLogger.getLevel(); - rootLogger.setLevel(org.apache.log4j.Level.ALL); - Appender appender = (Appender) rootLogger.getAllAppenders().nextElement(); - // Capture output, log only what would have passed the original logging level - appender.addFilter(new org.apache.log4j.spi.Filter() - { - @Override - public int decide(LoggingEvent event) - { - records.add(event); - return event.getLevel().toInt() >= oldLevel.toInt() ? org.apache.log4j.spi.Filter.NEUTRAL - : org.apache.log4j.spi.Filter.DENY; - } - }); - - try { - AnalysisEngineDescription pipeline = createEngineDescription( - createEngineDescription(StanfordParser.class, - StanfordParser.PARAM_WRITE_CONSTITUENT, true, - StanfordParser.PARAM_WRITE_DEPENDENCY, false), - createEngineDescription(StanfordParser.class, - StanfordParser.PARAM_WRITE_CONSTITUENT, false, - StanfordParser.PARAM_WRITE_DEPENDENCY, true)); - - JCas jcas = TestRunner.runTest(pipeline, "en", "This is a test ."); - - boolean found = false; - for (LoggingEvent e : records) { - if (String.valueOf(e.getMessage()).contains("Used resource from cache")) { - found = true; - } - } - - assertTrue("No log message about using the cached resource was found!", found); - - String[] dependencies = { - "[ 0, 4]NSUBJ(nsubj,basic) D[0,4](This) G[10,14](test)", - "[ 5, 7]COP(cop,basic) D[5,7](is) G[10,14](test)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[10,14](test)", - "[ 10, 14]ROOT(root,basic) D[10,14](test) G[10,14](test)" }; - - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - } - finally { - if (oldLevel != null) { - rootLogger.setLevel(oldLevel); - appender.clearFilters(); - } - - if (oldValue != null) { - System.setProperty(prop, oldValue); - } - else { - System.clearProperty(prop); - } - } - } - - private JCas runTestWithPosTagger(String aLanguage, String aVariant, String aText, - Object... aExtraParams) - throws Exception - { - AssumeResource.assumeResource(StanfordPosTagger.class, "tagger", aLanguage, null); - AssumeResource.assumeResource(StanfordParser.class, "parser", aLanguage, aVariant); - - AggregateBuilder aggregate = new AggregateBuilder(); - - aggregate.add(createEngineDescription(StanfordPosTagger.class)); - - Object[] params = new Object[] { - StanfordParser.PARAM_VARIANT, aVariant, - StanfordParser.PARAM_PRINT_TAGSET, true, - StanfordParser.PARAM_WRITE_CONSTITUENT, true, - StanfordParser.PARAM_WRITE_DEPENDENCY, true, - StanfordParser.PARAM_WRITE_PENN_TREE, true, - StanfordParser.PARAM_READ_POS, true, - StanfordParser.PARAM_WRITE_POS, false}; - params = ArrayUtils.addAll(params, aExtraParams); - aggregate.add(createEngineDescription(StanfordParser.class, params)); - - return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); - } - - private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) - throws Exception - { - AssumeResource.assumeResource(StanfordParser.class, "parser", aLanguage, aVariant); - - AggregateBuilder aggregate = new AggregateBuilder(); - - Object[] params = new Object[] { - StanfordParser.PARAM_VARIANT, aVariant, - StanfordParser.PARAM_PRINT_TAGSET, true, - StanfordParser.PARAM_WRITE_CONSTITUENT, true, - StanfordParser.PARAM_WRITE_DEPENDENCY, true, - StanfordParser.PARAM_WRITE_PENN_TREE, true, - StanfordParser.PARAM_WRITE_POS, true}; - params = ArrayUtils.addAll(params, aExtraParams); - aggregate.add(createEngineDescription(StanfordParser.class, params)); - - return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); - } - - private JCas runTest(String aLanguage, String aVariant, String[] aTokens) - throws Exception - { - AssumeResource.assumeResource(StanfordParser.class, "parser", aLanguage, aVariant); - - // setup English - AnalysisEngineDescription parser = createEngineDescription(StanfordParser.class, - StanfordParser.PARAM_VARIANT, aVariant, - StanfordParser.PARAM_PRINT_TAGSET, true, - StanfordParser.PARAM_WRITE_CONSTITUENT, true, - StanfordParser.PARAM_WRITE_DEPENDENCY, true, - StanfordParser.PARAM_WRITE_PENN_TREE, true, - StanfordParser.PARAM_WRITE_POS, true, - StanfordParser.PARAM_WRITE_PENN_TREE, true, - StanfordParser.PARAM_QUOTE_BEGIN, new String[] { "‘" }, - StanfordParser.PARAM_QUOTE_END, new String[] { "’" }); - - AnalysisEngine engine = createEngine(parser); - JCas jcas = engine.newJCas(); - jcas.setDocumentLanguage(aLanguage); - - JCasBuilder builder = new JCasBuilder(jcas); - for (String t : aTokens) { - builder.add(t, Token.class); - builder.add(" "); - } - builder.add(0, Sentence.class); - builder.close(); - - engine.process(jcas); - - return jcas; - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenterTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenterTest.java deleted file mode 100644 index 14991c1b7c..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenterTest.java +++ /dev/null @@ -1,202 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; - -import java.io.PrintWriter; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Properties; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; -import edu.stanford.nlp.ie.crf.CRFClassifier; -import edu.stanford.nlp.ling.CoreAnnotations; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.objectbank.ObjectBank; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.StanfordCoreNLP; -import edu.stanford.nlp.sequences.SeqClassifierFlags; -import edu.stanford.nlp.util.CoreMap; - -public -class StanfordSegmenterTest -{ - @Test - public void run() throws Throwable - { - AnalysisEngineDescription aed = createEngineDescription(StanfordSegmenter.class); - - SegmenterHarness.run(aed, "de.1", "de.2", "de.3", "de.4", "en.9", "ar.1", "zh.1", "zh.2"); - } - - @Test - public void testEnglishSpeech() throws Exception - { - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage("en"); - jcas.setDocumentText("'Let's go! I want to see the Don', he said."); - - AnalysisEngine aed = createEngine(StanfordSegmenter.class); - aed.process(jcas); - - String[] tokens = { "'", "Let", "'s", "go", "!", "I", "want", "to", "see", "the", "Don", - "'", ",", "he", "said", "." }; - - AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); - } - - @Test - public void testFrench() throws Exception - { - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage("fr"); - jcas.setDocumentText("Tim a dit Jamie pour la 100e fois de quitter la salle ."); - - AnalysisEngine aed = createEngine(StanfordSegmenter.class); - aed.process(jcas); - - String[] tokens = { "Tim", "a", "dit", "Jamie", "pour", "la", "100e", "fois", "de", - "quitter", "la", "salle", "." }; - - AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); - } - - @Test - public void testSpanish() throws Exception - { - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage("es"); - jcas.setDocumentText("Tim dijo a Jamie para la 100ª vez que abandone la sala."); - - AnalysisEngine aed = createEngine(StanfordSegmenter.class); - aed.process(jcas); - - String[] tokens = { "Tim", "dijo", "a", "Jamie", "para", "la", "100ª", "vez", "que", - "abandone", "la", "sala", "." }; - - AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); - } - - @Test - public void testUnwrapped() throws Exception - { - String text = "\"Hey you!\", John said."; - - String[] expectedSentences = { "0 10 \"Hey you!\"", "10 22 , John said." }; - String[] expectedTokens = { "0 1 `` \"", "1 4 Hey Hey", "5 8 you you", "8 9 ! !", - "9 10 '' \"", "10 11 , ,", "12 16 John John", "17 21 said said", "21 22 . ." }; - - List<String> sentences = new ArrayList<String>(); - List<String> tokens = new ArrayList<String>(); - - Properties props = new Properties(); - props.setProperty("annotators", "tokenize, ssplit"); - StanfordCoreNLP pipeline = new StanfordCoreNLP(props); - Annotation annotation = pipeline.process(text); - for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { - sentences.add(String.format("%d %d %s", - sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), - sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class), - sentence.get(CoreAnnotations.TextAnnotation.class))); - for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { - tokens.add(String.format("%d %d %s %s", - token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), - token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class), - token.get(CoreAnnotations.TextAnnotation.class), - token.get(CoreAnnotations.OriginalTextAnnotation.class))); - } - } - -// System.out.println(AssertAnnotations.asCopyableString(sentences, true)); -// System.out.println(AssertAnnotations.asCopyableString(tokens, true)); - - assertEquals(asList(expectedSentences), sentences); - assertEquals(asList(expectedTokens), tokens); - } - - @Ignore("This is completely incomplete so far") - @Test - public void testChinese() throws Exception - { - Properties props = new Properties(); - props.setProperty("sighanCorporaDict", "target/download/segmenter/stanford-segmenter-2014-01-04/data"); - props.setProperty("sighanPostProcessing", "true"); - props.setProperty("loadClassifier", "target/download/segmenter/stanford-segmenter-2014-01-04/data/ctb.gz"); - props.setProperty("serDictionary", "target/download/segmenter/stanford-segmenter-2014-01-04/data/dict-chris6.ser.gz"); - - SeqClassifierFlags flags = new SeqClassifierFlags(); - flags.setProperties(props, false); - CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(flags); - crf.loadClassifierNoExceptions(flags.loadClassifier, props); - crf.loadTagIndex(); - - String sentence = "我们需要一个非常复杂的句子例如其中包含许多成分和尽可能的依赖。"; - - System.out.println(crf.segmentString(sentence)); - - ObjectBank<List<CoreLabel>> docs = crf.makeObjectBankFromString(sentence, - crf.defaultReaderAndWriter()); - - StringWriter stringWriter = new StringWriter(); - PrintWriter stringPrintWriter = new PrintWriter(stringWriter); - for (List<CoreLabel> doc : docs) { - crf.classify(doc); -// for (CoreLabel w : doc) { -// System.out.printf("%s %s %s %s%n", -// String.valueOf(w.get(CoreAnnotations.PositionAnnotation.class)), -// String.valueOf(w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)), -// String.valueOf(w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)), -// String.valueOf(w.get(CoreAnnotations.AnswerAnnotation.class))); -// } - crf.defaultReaderAndWriter().printAnswers(doc, stringPrintWriter); - stringPrintWriter.println(); - } - stringPrintWriter.close(); - String segmented = stringWriter.toString(); - - System.out.println(Arrays.asList(segmented.split("\\s"))); - } - - @Test - public void testZoning() throws Exception - { - SegmenterHarness.testZoning(StanfordSegmenter.class); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSentimentAnalyzerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSentimentAnalyzerTest.java deleted file mode 100644 index 8764dd7f7a..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSentimentAnalyzerTest.java +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Copyright 2007-2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. - */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; - -import static org.junit.Assert.assertTrue; - -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.CasDumpWriter; -import org.apache.uima.fit.factory.AnalysisEngineFactory; -import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.util.CasCreationUtils; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.sentiment.type.StanfordSentimentAnnotation; - -/** - * Test for {@link de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSentimentAnalyzer} - */ -@Ignore("https://github.com/dkpro/dkpro-core/issues/779") -public class StanfordSentimentAnalyzerTest { - - @Test - public void testSentiment() throws Exception { - CAS cas = CasCreationUtils.createCas(TypeSystemDescriptionFactory.createTypeSystemDescription(), null, null); - cas.setDocumentLanguage("en"); - cas.setDocumentText("I feel very very bad."); - Sentence s = new Sentence(cas.getJCas(), 0, cas.getDocumentText().length()); - s.addToIndexes(); - - SimplePipeline.runPipeline(cas, - AnalysisEngineFactory.createEngineDescription(StanfordSentimentAnalyzer.class), - AnalysisEngineFactory.createEngineDescription(CasDumpWriter.class) - ); - - StanfordSentimentAnnotation sentimentAnnotation = JCasUtil.select(cas.getJCas(), - StanfordSentimentAnnotation.class).iterator().next(); - - // more negative than positive - assertTrue(sentimentAnnotation.getNegative() + sentimentAnnotation.getVeryNegative() - > sentimentAnnotation.getPositive() + sentimentAnnotation.getVeryPositive()); - } -} \ No newline at end of file diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/ReuseTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/ReuseTest.java similarity index 93% rename from dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/ReuseTest.java rename to dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/ReuseTest.java index 75b4897817..dfd0eb08cb 100644 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/ReuseTest.java +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/ReuseTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -32,15 +32,14 @@ import org.apache.uima.collection.EntityProcessStatus; import org.apache.uima.collection.StatusCallbackListener; import org.apache.uima.fit.cpe.CpeBuilder; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.io.text.TextWriter; +import org.dkpro.core.testing.DkproTestContext; import org.junit.After; import org.junit.Before; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextWriter; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - public class ReuseTest { @Before diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordCoreferenceResolverTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordCoreferenceResolverTest.java new file mode 100644 index 0000000000..58640ddaa0 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordCoreferenceResolverTest.java @@ -0,0 +1,242 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertFalse; + +import java.util.ArrayList; +import java.util.List; +import java.util.logging.ConsoleHandler; +import java.util.logging.Filter; +import java.util.logging.Level; +import java.util.logging.LogManager; +import java.util.logging.LogRecord; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; +import edu.stanford.nlp.dcoref.Constants; + +/** + */ +public class StanfordCoreferenceResolverTest +{ + @Test + public void test() + throws Exception + { + JCas jcas = runTest("en", "John bought a car. He is very happy with it."); + + String[][] ref = { + { "John", "He" }, + { "a car", "it" } }; + + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + } + + // https://github.com/dkpro/dkpro-core/issues/582 + // Jan 22, 2015 5:11:54 PM edu.stanford.nlp.dcoref.Document findSpeaker + // WARNING: Cannot find node in dependency for word rally + // Jan 22, 2015 5:11:54 PM edu.stanford.nlp.dcoref.Document findSpeaker + // WARNING: Cannot find node in dependency for word told + @Test + public void test2() + throws Exception + { + final List<LogRecord> records = new ArrayList<LogRecord>(); + ConsoleHandler handler = (ConsoleHandler) LogManager.getLogManager().getLogger("") + .getHandlers()[0]; + java.util.logging.Level oldLevel = handler.getLevel(); + handler.setLevel(Level.ALL); + handler.setFilter(new Filter() + { + @Override + public boolean isLoggable(LogRecord record) + { + records.add(record); + return false; + } + }); + + try { + JCas jcas = runTest("en", + "\" We cannot forgive this war , \" Miyako Fuji , 20 , one of the rally 's " + + "organisers told Jiji news agency ."); + + String[][] ref = { + { "Jiji" }, + { "We" }, + { "this war" }, + { "Miyako Fuji , 20 , one of the rally 's organisers" }, + { "Miyako Fuji , 20" }, + { "Miyako Fuji", "20" }, + { "one of the rally 's organisers" }, + { "Jiji news agency" } }; + + for (LogRecord r : records) { + assertFalse(r.getMessage().contains("Cannot find node in dependency for word")); + } + + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + } + finally { + if (oldLevel != null) { + handler.setLevel(oldLevel); + handler.setFilter(null); + } + } + } + + @Test + public void testDictionarySieve() + throws Exception + { + JCas jcas = runTest("en", "John joined Google in 2012. He is doing research for the company.", + Constants.SIEVEPASSES + ",CorefDictionaryMatch"); + + String[][] ref = new String[][] { + { "John", "He" }, + { "Google", "the company" }, + { "2012" } }; + + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + } + + @Test + public void testTriggerReparse() + throws Exception + { + JCas jcas = runTest("en", "'Let's go! I want to see the Don', he said."); + + String[][] ref = { + { "'s", "I" }, + { "the Don'", "he" } }; + + String[] pennTree = { + "(ROOT (S (`` ') (VP (VB Let) (S (NP (PRP 's)) (VP (VB go)))) (. !)))", + "(ROOT (S (S (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB see) (NP (DT the) " + + "(NNPS Don) (POS '))))))) (, ,) (NP (PRP he)) (VP (VBD said)) (. .)))" + }; + + AssertAnnotations.assertPennTree(pennTree, select(jcas, PennTree.class)); + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + } + + @Test + @Ignore("Disabled due to side effects on parser unit tests. See issue 175") + public void testTriggerReparse1() + throws Exception + { + JCas jcas = runTest("en", + "Other major domestic initiatives in his presidency include the Patient " + + "Protection and Affordable Care Act, often referred to as \"Obamacare\"; the " + + "Dodd–Frank Wall Street Reform and Consumer Protection Act; the Don't Ask, " + + "Don't Tell Repeal Act of 2010; the Budget Control Act of 2011; and the " + + "American Taxpayer Relief Act of 2012."); + + String[][] ref = { + { "Other major domestic initiatives in his presidency" }, + { "his presidency" }, + { "his" }, + { "the Patient Protection and Affordable Care Act, often referred to as " + + "\"Obamacare\"; the Dodd–Frank Wall Street Reform and Consumer " + + "Protection Act; the Don't Ask" }, + { "the Patient Protection and Affordable Care Act" }, + { "the Patient Protection" }, + { "Affordable Care Act" }, + { "\"Obamacare\"; the Dodd–Frank Wall Street Reform and Consumer Protection Act;" }, + { "the Dodd" }, + { "Frank Wall Street Reform and Consumer Protection Act" }, + { "Frank Wall Street Reform" }, + { "Consumer Protection Act" }, + { "Repeal Act of 2010; the Budget Control Act of 2011; and the American " + + "Taxpayer Relief Act of 2012" }, + { "2010" }, + { "the Budget Control Act of 2011" }, + { "the American Taxpayer Relief Act of 2012" }, + { "2011" }, + { "2012" } }; + + String[] pennTree = { + "(ROOT (S (NP (NP (JJ Other) (JJ major) (JJ domestic) (NNS initiatives)) (PP (IN in) " + + "(NP (PRP$ his) (NN presidency)))) (VP (VBP include) (SBAR (S (NP (NP (DT the) " + + "(NNP Patient) (NNP Protection) (CC and) (NNP Affordable) (NNP Care) (NNP Act)) " + + "(, ,) (VP (ADVP (RB often)) (VBN referred) (PP (TO to) (SBAR (IN as) (S (NP " + + "(`` \") (NP (NNP Obamacare)) ('' \") (PRN (: ;) (S (NP (DT the) (NNP Dodd)) (VP " + + "(VBP –) (NP (NP (NNP Frank) (NNP Wall) (NNP Street) (NNP Reform)) (CC and) (NP " + + "(NNP Consumer) (NNP Protection) (NNP Act))))) (: ;))) (DT the) (VP (VBP Do) " + + "(RB n't) (VP (VB Ask))))))) (, ,)) (VP (VBP Do) (RB n't) (VP (VB Tell) (NP (NP " + + "(NP (NN Repeal) (NNP Act)) (PP (IN of) (NP (CD 2010)))) (: ;) (NP (NP (DT the) " + + "(NNP Budget) (NNP Control) (NNP Act)) (PP (IN of) (NP (CD 2011)))) (: ;) " + + "(CC and) (NP (NP (DT the) (NNP American) (NNP Taxpayer) (NNP Relief) (NNP Act)) " + + "(PP (IN of) (NP (CD 2012)))))))))) (. .)))" }; + + AssertAnnotations.assertPennTree(pennTree, select(jcas, PennTree.class)); + AssertAnnotations.assertCoreference(ref, select(jcas, CoreferenceChain.class)); + } + + private JCas runTest(String aLanguage, String aText) + throws Exception + { + return runTest(aLanguage, aText, Constants.SIEVEPASSES); + } + + + private JCas runTest(String aLanguage, String aText, String aSieves) + throws Exception + { + AssumeResource.assumeResource(StanfordCoreferenceResolver.class, "coref", aLanguage, + "default"); + + // Coreference resolution requires the parser and the NER to run before + AnalysisEngine engine = createEngine(createEngineDescription( + createEngineDescription(StanfordSegmenter.class), + createEngineDescription(StanfordParser.class, + StanfordParser.PARAM_WRITE_CONSTITUENT, true, + StanfordParser.PARAM_WRITE_DEPENDENCY, true, + StanfordParser.PARAM_WRITE_PENN_TREE, true, + StanfordParser.PARAM_WRITE_POS, true), + createEngineDescription(StanfordLemmatizer.class), + createEngineDescription(StanfordNamedEntityRecognizer.class), + createEngineDescription(StanfordCoreferenceResolver.class, + StanfordCoreferenceResolver.PARAM_SIEVES, aSieves))); + + // Set up a simple example + JCas jcas = engine.newJCas(); + jcas.setDocumentLanguage(aLanguage); + jcas.setDocumentText(aText); + engine.process(jcas); + + return jcas; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordDependencyConverterTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordDependencyConverterTest.java similarity index 90% rename from dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordDependencyConverterTest.java rename to dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordDependencyConverterTest.java index c3016ff462..f36aa9716b 100644 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordDependencyConverterTest.java +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordDependencyConverterTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; @@ -25,14 +25,14 @@ import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.penntree.PennTreeNode; +import org.dkpro.core.io.penntree.PennTreeToJCasConverter; +import org.dkpro.core.io.penntree.PennTreeUtils; +import org.dkpro.core.testing.AssertAnnotations; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeNode; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeToJCasConverter; -import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; public class StanfordDependencyConverterTest { diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordLemmatizerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordLemmatizerTest.java new file mode 100644 index 0000000000..0c38840d79 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordLemmatizerTest.java @@ -0,0 +1,87 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; + +public class StanfordLemmatizerTest +{ + @Test + public void testUnderscore() throws Exception + { + runTest("en", "foo _ bar", + new String[] { "foo", "_", "bar" }); + } + + @Test + public void testEnglish() throws Exception + { + runTest("en", "This is a test .", + new String[] { "this", "be", "a", "test", "." }); + + runTest("en", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible .", + new String[] { "we", "need", "a", "very", "complicated", "example", "sentence", ",", + "which", "contain", "as", "many", "constituent", "and", "dependency", "as", + "possible", "." }); + } + + @Test(expected = AnalysisEngineProcessException.class) + public void testNotEnglish() + throws Exception + { + runTest("de", "Das ist ein test .", new String[] {} ); + } + + @Test + public void testUrl() throws Exception + { + runTest("en", + "Details hinzu findet man unter http://www.armytimes.com/news/2009/11/army_M4_112109w/ .", + new String[] { "detail", "hinzu", "findet", "man", "unter", + "http://www.armytimes.com/news/2009/11/army_m4_112109w/", "." }); + } + + private void runTest(String aLanguage, String testDocument, String[] lemmas) + throws Exception + { + AnalysisEngineDescription posTagger = createEngineDescription(StanfordPosTagger.class); + AnalysisEngineDescription lemmatizer = createEngineDescription(StanfordLemmatizer.class); + + JCas aJCas = TestRunner.runTest(createEngineDescription(posTagger, lemmatizer), + aLanguage, testDocument); + + AssertAnnotations.assertLemma(lemmas, select(aJCas, Lemma.class)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTest.java new file mode 100644 index 0000000000..f4c7c9b9ac --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTest.java @@ -0,0 +1,352 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Assume; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; + +/** + */ +public class StanfordNamedEntityRecognizerTest +{ + @Test + public void testDutchFremeNer() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("nl", "freme-wikiner", "10 jaar Markus werkzaam bij SAP in Duitsland ."); + + String[] ne = { + "[ 8, 14]Person(I-PER) (Markus)", + "[ 28, 31]Organization(I-ORG) (SAP)", + "[ 35, 44]Location(I-LOC) (Duitsland)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", null, "IBM where John Miller works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 21]Person(PERSON) (John Miller)", + "[ 34, 41]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testEnglishAdjacent() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", null, "Jake John called late at night ."); + + String[] ne = { + "[ 0, 9]Person(PERSON) (Jake John)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testEnglishFremeNer() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "freme-wikiner", "IBM where John Miller works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(I-ORG) (IBM)", + "[ 10, 21]Person(I-PER) (John Miller)", + "[ 34, 41]Location(I-LOC) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void test3classCaselessEnglish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "all.3class.caseless.distsim.crf", "ibm where john works is in germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (ibm)", + "[ 10, 14]Person(PERSON) (john)", + "[ 27, 34]Location(LOCATION) (germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testNoWiki3classCaselessEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "nowiki.3class.caseless.distsim.crf", + "ibm where john works is in germany ."); + + String[] ne = { "[ 0, 3]Organization(ORGANIZATION) (ibm)", + "[ 10, 14]Person(PERSON) (john)", "[ 27, 34]Location(LOCATION) (germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void test4classEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "conll.4class.distsim.crf", "IBM where John works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 14]Person(PERSON) (John)", + "[ 27, 34]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + + @Test + public void test4classCaselessEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "conll.4class.caseless.distsim.crf", + "ibm where john works is in germany ."); + + String[] ne = { "[ 0, 3]Organization(ORGANIZATION) (ibm)", + "[ 10, 14]Person(PERSON) (john)", "[ 27, 34]Location(LOCATION) (germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void test4classCaselessMixedEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "conll.4class.caseless.distsim.crf", + "IBM where john works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 14]Person(PERSON) (john)", + "[ 27, 34]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void test7classEnglish() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", "muc.7class.distsim.crf", "IBM where John works is in Germany ."); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 14]Person(PERSON) (John)", + "[ 27, 34]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testEnglishWithNEInLastToken() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("en", null, "IBM where John works is in Germany"); + + String[] ne = { + "[ 0, 3]Organization(ORGANIZATION) (IBM)", + "[ 10, 14]Person(PERSON) (John)", + "[ 27, 34]Location(LOCATION) (Germany)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testGerman() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("de", null, "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); + + String[] ne = { + "[ 0, 6]Person(PERSON) (Markus)", + "[ 35, 38]Organization(ORGANIZATION) (SAP)", + "[ 42, 53]Location(LOCATION) (Deutschland)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testGermanNemgp() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("de", "nemgp", "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); + + String[] ne = { + "[ 0, 6]Person(PER) (Markus)", + "[ 35, 38]Organization(ORG) (SAP)", + "[ 42, 53]Location(LOC) (Deutschland)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testHgcGerman() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("de", "hgc_175m_600.crf", "Markus arbeitet seit 10 Jahren bei SAP in Deutschland ."); + + String[] ne = { + "[ 0, 6]Person(I-PER) (Markus)", + "[ 35, 38]Organization(I-ORG) (SAP)", + "[ 42, 53]Location(I-LOC) (Deutschland)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testFrenchFremeNer() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("fr", "freme-wikiner", "Il y a 10 ans Markus travaille dans SAP en Allemagne ."); + + String[] ne = { + "[ 14, 20]Person(I-PER) (Markus)", + "[ 36, 39]Organization(I-ORG) (SAP)", + "[ 43, 52]Location(I-LOC) (Allemagne)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testItalianFremeNer() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("it", "freme-wikiner", "10 anni fa Markus lavora in SAP in Germania ."); + + String[] ne = { + "[ 11, 17]Person(I-PER) (Markus)", + "[ 28, 31]Organization(I-ORG) (SAP)", + "[ 35, 43]Location(I-LOC) (Germania)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testRussianFremeNer() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("ru", "freme-wikiner", "10 лет Маркус работал в SAP в Германии ."); + + String[] ne = { + "[ 7, 13]Person(I-PER) (Маркус)", + "[ 24, 27]Organization(I-ORG) (SAP)", + "[ 30, 38]Location(I-LOC) (Германии)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testSpanish() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("es", null, "Hace 10 años Markus trabaja en SAP en Alemania ."); + + String[] ne = { + "[ 13, 19]Person(PERS) (Markus)", + "[ 31, 34]Organization(ORG) (SAP)", + "[ 38, 46]Location(LUG) (Alemania)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test + public void testSpanishFremeNer() throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("es", "freme-wikiner", "Hace 10 años Markus trabaja en SAP en Alemania ."); + + String[] ne = { + "[ 13, 19]Person(I-PER) (Markus)", + "[ 31, 34]NamedEntity(I-MISC) (SAP)", + "[ 38, 46]Location(I-LOC) (Alemania)" }; + + AssertAnnotations.assertNamedEntity(ne, select(jcas, NamedEntity.class)); + } + + @Test(expected = AnalysisEngineProcessException.class) + public void testMissingModel() throws Exception + { + runTest("xx", null, "Xec xena Xeo ."); + } + + private JCas runTest(String language, String variant, String testDocument) + throws Exception + { + AssumeResource.assumeResource(StanfordNamedEntityRecognizer.class, "ner", language, + variant); + + AnalysisEngine engine = createEngine(StanfordNamedEntityRecognizer.class, + StanfordNamedEntityRecognizer.PARAM_VARIANT, variant, + StanfordNamedEntityRecognizer.PARAM_PRINT_TAGSET, true); + + return TestRunner.runTest(engine, language, testDocument); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainerTest.java new file mode 100644 index 0000000000..f7badb4487 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordNamedEntityRecognizerTrainerTest.java @@ -0,0 +1,181 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.pipeline.SimplePipeline.iteratePipeline; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.factory.ConfigurationParameterFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.eval.EvalUtil; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; +import org.dkpro.core.io.conll.Conll2002Reader; +import org.dkpro.core.io.conll.Conll2002Reader.ColumnSeparators; +import org.dkpro.core.io.conll.Conll2002Writer; +import org.dkpro.core.testing.DkproTestContext; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; + +public class StanfordNamedEntityRecognizerTrainerTest +{ + private Dataset ds; + + @Test + public void test() + throws Exception + { + File targetFolder = testContext.getTestOutputFolder(); + + System.out.println("Target Folder: " + targetFolder.getAbsolutePath()); + Split split = ds.getDefaultSplit(); + + File model = new File(targetFolder, "ner-model.ser.gz"); + + File properties = new File("ner/train-english.props"); + + File[] trainingFiles = split.getTrainingFiles(); + for (File file : trainingFiles) { + System.out.println("Training file: " + file.getAbsolutePath()); + } + + CollectionReaderDescription trainReader = createReaderDescription(Conll2002Reader.class, + Conll2002Reader.PARAM_PATTERNS, split.getDevelopmentFiles(), + Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(), + Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), + Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, + Conll2002Reader.PARAM_HAS_HEADER, true, + Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true); + + AnalysisEngineDescription trainer = createEngineDescription( + StanfordNamedEntityRecognizerTrainer.class, + StanfordNamedEntityRecognizerTrainer.PARAM_TARGET_LOCATION, model, + StanfordNamedEntityRecognizerTrainer.PARAM_PROPERTIES_LOCATION, properties, + StanfordNamedEntityRecognizerTrainer.PARAM_LABEL_SET, "noprefix", + StanfordNamedEntityRecognizerTrainer.PARAM_RETAIN_CLASS, true); + + SimplePipeline.runPipeline(trainReader, trainer); + + // Apply model and collect labels + System.out.println("Applying model to test data"); + CollectionReaderDescription testReader = createReaderDescription(Conll2002Reader.class, + Conll2002Reader.PARAM_PATTERNS, split.getTestFiles(), + Conll2002Reader.PARAM_LANGUAGE, "de", + Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), + Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, + Conll2002Reader.PARAM_HAS_HEADER, true, + Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true, + Conll2002Reader.PARAM_READ_NAMED_ENTITY, false); + + AnalysisEngineDescription ner = createEngineDescription(StanfordNamedEntityRecognizer.class, + StanfordNamedEntityRecognizer.PARAM_PRINT_TAGSET, true, + StanfordNamedEntityRecognizer.PARAM_MODEL_LOCATION, model); + + AnalysisEngineDescription writer = createEngineDescription( + Conll2002Writer.class, + Conll2002Writer.PARAM_SINGULAR_TARGET, true, + Conll2002Writer.PARAM_TARGET_LOCATION, new File(targetFolder, "output.conll")); + + List<Span<String>> actual = EvalUtil.loadSamples(iteratePipeline(testReader, ner, writer), + NamedEntity.class, NamedEntity::getValue); + System.out.printf("Actual samples: %d%n", actual.size()); + + // Read reference data collect labels + ConfigurationParameterFactory.setParameter(testReader, + Conll2002Reader.PARAM_READ_NAMED_ENTITY, true); + List<Span<String>> expected = EvalUtil.loadSamples(testReader, NamedEntity.class, + NamedEntity::getValue); + System.out.printf("Expected samples: %d%n", expected.size()); + + Result results = EvalUtil.dumpResults(targetFolder, expected, actual); + + // Using split.getTrainingFiles() with 10GB heap takes ~80 minutes to train + // F-score 0.692730 + // Precision 0.765778 + // Recall 0.632405 + + // + assertEquals(0.493260, results.getFscore(), 0.01); + assertEquals(0.621921, results.getPrecision(), 0.01); + assertEquals(0.408708, results.getRecall(), 0.01); + } + + @Test + public void test__EmptyDataset__ShouldRaiseExceptionWithHelpfulMessage() + throws Exception + { + Path emptyDir = Files.createTempDirectory("empty_dir"); + File targetFolder = testContext.getTestOutputFolder(); + + File model = new File(targetFolder, "ner-model.ser.gz"); + + File properties = new File("ner/train-english.props"); + + CollectionReaderDescription trainReader = createReaderDescription(Conll2002Reader.class, + Conll2002Reader.PARAM_SOURCE_LOCATION, emptyDir.toAbsolutePath().toString(), + Conll2002Reader.PARAM_PATTERNS, "*.txt", + Conll2002Reader.PARAM_LANGUAGE, "en", + Conll2002Reader.PARAM_COLUMN_SEPARATOR, ColumnSeparators.TAB.getName(), + Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, + Conll2002Reader.PARAM_HAS_HEADER, true, + Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true); + + AnalysisEngineDescription trainer = createEngineDescription( + StanfordNamedEntityRecognizerTrainer.class, + StanfordNamedEntityRecognizerTrainer.PARAM_TARGET_LOCATION, model, + StanfordNamedEntityRecognizerTrainer.PARAM_PROPERTIES_LOCATION, properties, + StanfordNamedEntityRecognizerTrainer.PARAM_LABEL_SET, "noprefix", + StanfordNamedEntityRecognizerTrainer.PARAM_RETAIN_CLASS, true); + + assertThatExceptionOfType(AnalysisEngineProcessException.class) + .isThrownBy(() -> SimplePipeline.runPipeline(trainReader, trainer)) + .withCauseInstanceOf(IllegalStateException.class) + .matches(e -> e.getCause().getMessage().equals( + "Trainer did not receive any training data.")); + } + + @Before + public void setup() + throws IOException + { + DatasetFactory loader = new DatasetFactory(DkproTestContext.getCacheFolder()); + ds = loader.load("germeval2014-de"); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordParserTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordParserTest.java new file mode 100644 index 0000000000..bc9d7765bc --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordParserTest.java @@ -0,0 +1,1266 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectSingle; +import static org.dkpro.core.testing.AssertAnnotations.assertConstituents; +import static org.dkpro.core.testing.AssertAnnotations.assertDependencies; +import static org.dkpro.core.testing.AssertAnnotations.assertPOS; +import static org.dkpro.core.testing.AssertAnnotations.assertPennTree; +import static org.dkpro.core.testing.AssertAnnotations.assertSyntacticFunction; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; +import static org.dkpro.core.testing.AssertAnnotations.assertTagsetMapping; +import static org.junit.Assert.assertTrue; + +import org.apache.commons.lang3.ArrayUtils; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.factory.AggregateBuilder; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.stanfordnlp.util.TreeUtils; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Assume; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import edu.stanford.nlp.ling.StringLabel; +import edu.stanford.nlp.trees.Tree; + +/** + */ +public class StanfordParserTest +{ + private static final String[] GERMAN_POS_TAGS = { "$,", "$.", "$[", ".$$.", "ADJA", "ADJD", + "ADV", "APPO", "APPR", "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", + "KOUI", "KOUS", "NE", "NN", "PDAT", "PDS", "PIAT", "PIDAT", "PIS", "PPER", "PPOSAT", + "PPOSS", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", + "PTKZU", "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", + "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; + + private static final String[] GERMAN_CONSTITUENT_TAGS = { "AA", "AP", "AVP", "CAC", "CAP", + "CAVP", "CCP", "CH", "CNP", "CO", "CPP", "CS", "CVP", "CVZ", "DL", "ISU", "MPN", "MTA", + "NM", "NP", "NUR", "PP", "QL", "ROOT", "S", "VP", "VZ" }; + + private static final String[] ENGLISH_POS_TAGS = { "#", "$", "''", ",", "-LRB-", "-RRB-", ".", + ".$$.", ":", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", + "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", + "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "``" }; + + private static final String[] ENGLISH_POS_UNMAPPED = { ".$$."}; + + private static final String[] ENGLISH_CONSTITUENT_TAGS = { "ADJP", "ADVP", "CONJP", "FRAG", + "INTJ", "LST", "NAC", "NP", "NX", "PP", "PRN", "PRT", "QP", "ROOT", "RRC", "S", "SBAR", + "SBARQ", "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X" }; + + private static final String[] ENGLISH_CONSTITUENT_UNMAPPED = { }; + + private static final String[] ENGLISH_DEPENDENCY_TAGS = { "acomp", "advcl", "advmod", "agent", + "amod", "appos", "arg", "aux", "auxpass", "cc", "ccomp", "comp", "conj", "cop", + "csubj", "csubjpass", "dep", "det", "discourse", "dobj", "expl", "goeswith", "gov", + "iobj", "mark", "mod", "mwe", "neg", "nn", "npadvmod", "nsubj", "nsubjpass", "num", + "number", "obj", "parataxis", "pcomp", "pobj", "poss", "possessive", "preconj", "pred", + "predet", "prep", "prt", "punct", "quantmod", "rcmod", "ref", "rel", "sdep", "subj", + "tmod", "vmod", "xcomp" }; + + private static final String[] SPANISH_POS_TAGS = { ".$$.", "359000", "NCMS000", "ac0000", + "ao0000", "ap0000", "aq0000", "aqs000", "cc", "cs", "d00000", "da0000", "dd0000", + "de0000", "di0000", "dn0000", "do0000", "dp0000", "dt0000", "f0", "faa", "fat", "fc", + "fca", "fct", "fd", "fe", "fg", "fh", "fi", "fia", "fit", "fp", "fpa", "fpt", "fra", + "frc", "fs", "fsa", "ft", "fx", "fz", "i", "nc00000", "nc0a000", "nc0c000", "nc0n000", + "nc0p000", "nc0s000", "np00000", "p0000000", "pd000000", "pe000000", "pi000000", + "pn000000", "po000000", "pp000000", "pr000000", "pt000000", "px000000", "rg", "rn", + "sc000", "se000", "sp000", "va00000", "vag0000", "vaic000", "vaif000", "vaii000", + "vaip000", "vais000", "vam0000", "van0000", "vap0000", "vasi000", "vasp000", "vass000", + "vm00000", "vm0p000", "vmg0000", "vmi0000", "vmi2000", "vmic000", "vmif000", "vmii000", + "vmim000", "vmip000", "vmis000", "vmm0000", "vmmp000", "vmms000", "vmn0000", "vmp0000", + "vms0000", "vmsf000", "vmsi000", "vmsp000", "vq00000", "vs00000", "vsg0000", "vsic000", + "vsif000", "vsii000", "vsip000", "vsis000", "vsm0000", "vsmp000", "vsn0000", "vsp0000", + "vssf000", "vssi000", "vssp000", "vsss000", "w", "word", "z0", "zd", "zm", "zp", "zu" }; + + private static final String[] FRENCH_POS_TAGS = { ".$$.", "A", "ADJ", "ADJWH", "ADV", "ADVWH", + "C", "CC", "CL", "CLO", "CLR", "CLS", "CS", "DET", "DETWH", "ET", "I", "N", "NC", + "NPP", "P", "PREF", "PRO", "PROREL", "PROWH", "PUNC", "V", "VIMP", "VINF", "VPP", + "VPR", "VS" }; + + // TODO Maybe test link to parents (not tested by syntax tree recreation) + + @Test + public void testGermanPcfg() + throws Exception + { + JCas jcas = runTest("de", "pcfg", "Wir brauchen ein sehr kompliziertes Beispiel , welches " + + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); + + String[] constituentMapped = { "ADJP 17,35", "NP 13,111", "NP 55,100", "NP 71,100", + "ROOT 0,113", "S 0,113", "S 47,111" }; + + String[] constituentOriginal = { "AP 17,35", "CNP 71,100", "NP 13,111", "NP 55,100", + "ROOT 0,113", "S 0,113", "S 47,111" }; + + String[] synFunc = {}; + + String[] posOriginal = { "PPER", "VVFIN", "ART", "ADV", "ADJA", "NN", "$,", "PRELS", "ADV", + "PIDAT", "NN", "KON", "NN", "VVFIN", "$." }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_PUNCT", "POS_PRON", "POS_ADV", + "POS_PRON", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_VERB", "POS_PUNCT" }; + + String[] dependencies = {/** No dependencies for German */ }; + + String pennTree = "(ROOT (S (PPER Wir) (VVFIN brauchen) (NP (ART ein) (AP (ADV sehr) " + + "(ADJA kompliziertes)) (NN Beispiel) ($, ,) (S (PRELS welches) (NP " + + "(ADV möglichst) (PIDAT viele) (CNP (NN Konstituenten) (KON und) " + + "(NN Dependenzen))) (VVFIN beinhaltet))) ($. .)))"; + + String[] unmappedPos = { "$[", ".$$." }; + + String[] unmappedConst = { "NUR" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertSyntacticFunction(synFunc, select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "stts", GERMAN_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); + assertTagset(Constituent.class, "negra", GERMAN_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas); + } + + @Test + public void testGermanFactored() + throws Exception + { + JCas jcas = runTest("de", "factored", + "Wir brauchen ein sehr kompliziertes Beispiel , welches " + + "möglichst viele Konstituenten und Dependenzen beinhaltet ."); + + String[] constituentMapped = { "ADJP 17,35", "ADJP 55,70", "NP 13,111", "NP 55,100", + "NP 71,100", "ROOT 0,113", "S 0,113", "S 47,111" }; + + String[] constituentOriginal = { "AP 17,35", "AP 55,70", "CNP 71,100", "NP 13,111", + "NP 55,100", "ROOT 0,113", "S 0,113", "S 47,111" }; + + String[] posOriginal = { "PPER", "VVFIN", "ART", "ADV", "ADJA", "NN", "$,", "PRELS", "ADV", + "PIDAT", "NN", "KON", "NN", "VVFIN", "$." }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", "POS_PUNCT", "POS_PRON", "POS_ADV", + "POS_PRON", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_VERB", "POS_PUNCT" }; + + String[] dependencies = { /** No dependencies for German */ }; + + String pennTree = "(ROOT (S (PPER Wir) (VVFIN brauchen) (NP (ART ein) (AP " + + "(ADV sehr) (ADJA kompliziertes)) (NN Beispiel) ($, ,) (S (PRELS welches) " + + "(NP (AP (ADV möglichst) (PIDAT viele)) (CNP (NN Konstituenten) (KON und) " + + "(NN Dependenzen))) (VVFIN beinhaltet))) ($. .)))"; + + String[] unmappedPos = { "$[", ".$$." }; + + String[] unmappedConst = { "NUR" }; + + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "stts", GERMAN_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "stts", unmappedPos, jcas); + assertTagset(Constituent.class, "negra", GERMAN_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "negra", unmappedConst, jcas, + true); + } + + @Test + public void testEnglishPcfg() + throws Exception + { + JCas jcas = runTest("en", "pcfg", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", + "NP 8,110", "NP 8,43", "PP 61,98", "PP 99,110", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", + "NP 8,110", "NP 8,43", "PP 61,98", "PP 99,110", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", + "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", + "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", + "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " + + "(S (VP (VBZ contains) (PP (IN as) (NP (JJ many) (NNS constituents) (CC and) " + + "(NNS dependencies))) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; + + String[] unmappedDep = { "gov" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + } + + @Test + public void testEnglishPcfgCollapsed() + throws Exception + { + JCas jcas = runTest("en", "pcfg", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible .", + StanfordParser.PARAM_MODE, StanfordParser.DependenciesMode.COLLAPSED_WITH_EXTRA); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", + "NP 8,110", "NP 8,43", "PP 61,98", "PP 99,110", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", + "NP 8,110", "NP 8,43", "PP 61,98", "PP 99,110", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 35, 43]NSUBJ(nsubj,enhanced) D[35,43](sentence) G[52,60](contains)", + "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", + "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", + "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", + "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " + + "(S (VP (VBZ contains) (PP (IN as) (NP (JJ many) (NNS constituents) (CC and) " + + "(NNS dependencies))) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; + + String[] unmappedDep = { "gov" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + } + + @Test + public void testEnglishFactored() + throws Exception + { + JCas jcas = runTest("en", "factored", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "ADJP 61,68", "NP 0,2", + "NP 61,98", "NP 8,110", "NP 8,43", "PP 99,110", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "ADJP 61,68", "NP 0,2", + "NP 61,98", "NP 8,110", "NP 8,43", "PP 99,110", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 61, 63]ADVMOD(advmod,basic) D[61,63](as) G[64,68](many)", + "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]DOBJ(dobj,basic) D[69,81](constituents) G[52,60](contains)", + "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", + "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", + "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " + + "(S (VP (VBZ contains) (NP (ADJP (RB as) (JJ many)) (NNS constituents) (CC and) " + + "(NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; + + String[] unmappedDep = { "gov" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + } + + @Test + public void testEnglishKeepPunctuation() + throws Exception + { + JCas jcas = runTest("en", "rnn", "This is a test .", + StanfordParser.PARAM_KEEP_PUNCTUATION, true); + + String[] dependencies = { + "[ 0, 4]NSUBJ(nsubj,basic) D[0,4](This) G[10,14](test)", + "[ 5, 7]COP(cop,basic) D[5,7](is) G[10,14](test)", + "[ 8, 9]DET(det,basic) D[8,9](a) G[10,14](test)", + "[ 10, 14]ROOT(root,basic) D[10,14](test) G[10,14](test)", + "[ 15, 16]PUNCT(punct,basic) D[15,16](.) G[10,14](test)" }; + + assertDependencies(dependencies, select(jcas, Dependency.class)); + } + + @Test + public void testEnglishRnn() + throws Exception + { + JCas jcas = runTest("en", "rnn", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 61,98", + "NP 8,110", "NP 8,43", "PP 99,110", "QP 61,68", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 61,98", + "NP 8,110", "NP 8,43", "PP 99,110", "QP 61,68", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 61, 63]QUANTMOD(quantmod,basic) D[61,63](as) G[64,68](many)", + "[ 64, 68]NUM(num,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]DOBJ(dobj,basic) D[69,81](constituents) G[52,60](contains)", + "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", + "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", + "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " + + "(S (VP (VBZ contains) (NP (QP (RB as) (JJ many)) (NNS constituents) (CC and) " + + "(NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; + + String[] unmappedDep = { "gov" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + } + + @Test + public void testEnglishShiftReduce() + throws Exception + { + JCas jcas = runTestWithPosTagger("en", "sr", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", + "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", + "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,110", + "NP 64,98", "NP 8,110", "NP 8,43", "PP 61,110", "PP 99,110", "ROOT 0,112", + "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", + "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", + "[102,110]PREP(prep_as,basic) D[102,110](possible) G[69,81](constituents)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", + "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + + "(JJ complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " + + "(S (VP (VBZ contains) (PP (IN as) (NP (NP (JJ many) (NNS constituents) " + + "(CC and) (NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))))) (. .)))"; + + String[] unmappedDep = { "gov" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + } + + @Test + public void testEnglishShiftReduceBeam() + throws Exception + { + JCas jcas = runTestWithPosTagger("en", "sr-beam", "We need a very complicated example " + + "sentence , which contains as many constituents and dependencies as possible ."); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", + "NP 8,110", "NP 8,43", "PP 61,110", "PP 61,98", "PP 99,110", "ROOT 0,112", + "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 64,98", + "NP 8,110", "NP 8,43", "PP 61,110", "PP 61,98", "PP 99,110", "ROOT 0,112", + "S 0,112", "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 64, 68]AMOD(amod,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]PREP(prep_as,basic) D[69,81](constituents) G[52,60](contains)", + "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", + "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADP", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "JJ", "NN", "NN", ",", "WDT", "VBZ", + "IN", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + + "(JJ complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " + + "(S (VP (VBZ contains) (PP (PP (IN as) (NP (JJ many) (NNS constituents) " + + "(CC and) (NNS dependencies))) (PP (IN as) (ADJP (JJ possible))))))))) (. .)))"; + + String[] unmappedDep = { "gov" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + } + + @Test + public void testEnglishWsjRnn() + throws Exception + { + JCas jcas = runTest("en", "wsj-rnn", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String[] constituentMapped = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 61,98", + "NP 8,110", "NP 8,43", "PP 99,110", "QP 61,68", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] constituentOriginal = { "ADJP 10,26", "ADJP 102,110", "NP 0,2", "NP 61,98", + "NP 8,110", "NP 8,43", "PP 99,110", "QP 61,68", "ROOT 0,112", "S 0,112", + "S 52,110", "SBAR 46,110", "VP 3,110", "VP 52,110", "WHNP 46,51" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", + "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", + "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", + "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", + "[ 15, 26]AMOD(amod,basic) D[15,26](complicated) G[35,43](sentence)", + "[ 27, 34]NN(nn,basic) D[27,34](example) G[35,43](sentence)", + "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", + "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", + "[ 52, 60]RCMOD(rcmod,basic) D[52,60](contains) G[35,43](sentence)", + "[ 61, 63]QUANTMOD(quantmod,basic) D[61,63](as) G[64,68](many)", + "[ 64, 68]NUM(num,basic) D[64,68](many) G[69,81](constituents)", + "[ 69, 81]DOBJ(dobj,basic) D[69,81](constituents) G[52,60](contains)", + "[ 86, 98]CONJ(conj_and,basic) D[86,98](dependencies) G[69,81](constituents)", + "[102,110]PREP(prep_as,basic) D[102,110](possible) G[52,60](contains)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_DET", "POS_ADV", "POS_VERB", "POS_NOUN", + "POS_NOUN", "POS_PUNCT", "POS_DET", "POS_VERB", "POS_ADV", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "PRP", "VBP", "DT", "RB", "VBN", "NN", "NN", ",", "WDT", "VBZ", + "RB", "JJ", "NNS", "CC", "NNS", "IN", "JJ", "." }; + + String pennTree = "(ROOT (S (NP (PRP We)) (VP (VBP need) (NP (NP (DT a) (ADJP (RB very) " + + "(VBN complicated)) (NN example) (NN sentence)) (, ,) (SBAR (WHNP (WDT which)) " + + "(S (VP (VBZ contains) (NP (QP (RB as) (JJ many)) (NNS constituents) (CC and) " + + "(NNS dependencies)) (PP (IN as) (ADJP (JJ possible)))))))) (. .)))"; + + String[] unmappedDep = { "gov" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ptb", ENGLISH_POS_TAGS, jcas); + assertTagsetMapping(POS.class, "ptb", ENGLISH_POS_UNMAPPED, jcas); + assertTagset(Constituent.class, "ptb", ENGLISH_CONSTITUENT_TAGS, jcas); + assertTagsetMapping(Constituent.class, "ptb", ENGLISH_CONSTITUENT_UNMAPPED, jcas); + assertTagset(Dependency.class, "stanford341", ENGLISH_DEPENDENCY_TAGS, jcas); + assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + } + + /** + * This test uses simple double quotes. + * + * @throws Exception + * if there is an error. + */ + @Test + public void testEnglishFactoredDirectSpeech() + throws Exception + { + JCas jcas = runTest("en", "factored", + "\" It 's cold outside , \" he said , \" and it 's starting to rain . \""); + + String[] posOriginal = new String[] { "``", "PRP", "VBZ", "JJ", "JJ", ",", "''", "PRP", + "VBD", ",", "``", "CC", "PRP", "VBZ", "VBG", "TO", "NN", ".", "''" }; + + String pennTree = "(ROOT (S (`` \") (S (NP (PRP It)) (VP (VBZ 's) (ADJP (JJ cold)) (S " + + "(ADJP (JJ outside))))) (PRN (, ,) ('' \") (S (NP (PRP he)) (VP (VBD said))) (, " + + ",) (`` \")) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (VP (VBG starting) (PP " + + "(TO to) (NP (NN rain)))))) (. .) ('' \")))"; + + assertPOS(null, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + } + + /** + * This test uses UTF-8 quotes as they can be found in the British National Corpus. + * + * @throws Exception + * if there is an error. + */ + @Test + public void testEnglishFactoredDirectSpeech2() + throws Exception + { + // JCas jcas = runTest("en", "factored", + // "‘Prices are used as a barrier so that the sort of " + + // "people we don't want go over the road ,’ he said ."); + JCas jcas = runTest("en", "factored", new String[] { "‘", "It", "'s", "cold", "outside", + ",", "’", "he", "said", ",", "‘", "and", "it", "'s", "starting", "to", "rain", ".", + "’" }); + + String[] posOriginal = new String[] { "``", "PRP", "VBZ", "JJ", "JJ", ",", "''", "PRP", + "VBD", ",", "``", "CC", "PRP", "VBZ", "VBG", "TO", "NN", ".", "''" }; + + String pennTree = "(ROOT (S (`` ‘) (S (NP (PRP It)) (VP (VBZ 's) (ADJP (JJ cold)) (S " + + "(ADJP (JJ outside))))) (PRN (, ,) ('' ’) (S (NP (PRP he)) (VP (VBD said))) " + + "(, ,) (`` ‘)) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (VP (VBG starting) (PP " + + "(TO to) (NP (NN rain)))))) (. .) ('' ’)))"; + + assertPOS(null, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + } + + @Test + public void testSpanishShiftReduceBeam() + throws Exception + { + JCas jcas = runTestWithPosTagger("es", "sr-beam", "Necesitamos una oración de ejemplo " + + "muy complicado , que contiene la mayor cantidad de componentes y dependencias " + + "como sea posible ."); + + String[] constituentMapped = { "ADJP 122,129", "ADJP 68,73", "ADVP 35,38", "CONJP 113,117", + "CONJP 98,99", "NP 100,112", "NP 12,129", "NP 16,129", "NP 27,34", "NP 65,112", + "NP 68,112", "NP 86,112", "NP 86,97", "PP 24,34", "PP 83,112", "ROOT 0,131", + "S 0,131", "S 113,129", "S 35,49", "S 50,129", "VP 0,11", "VP 118,121", "VP 56,64", + "X 12,15", "X 24,26", "X 39,49", "X 52,55", "X 65,67", "X 83,85" }; + + String[] constituentOriginal = { "ROOT 0,131", "S 113,129", "S 35,49", "S 50,129", + "conj 113,117", "conj 98,99", "grup.a 122,129", "grup.a 68,73", "grup.adv 35,38", + "grup.nom 100,112", "grup.nom 16,129", "grup.nom 27,34", "grup.nom 68,112", + "grup.nom 86,112", "grup.nom 86,97", "grup.verb 0,11", "grup.verb 118,121", + "grup.verb 56,64", "participi 39,49", "prep 24,26", "prep 83,85", "relatiu 52,55", + "s.a 122,129", "s.a 68,73", "sadv 35,38", "sentence 0,131", "sn 12,129", + "sn 27,34", "sn 65,112", "sn 86,112", "sp 24,34", "sp 83,112", "spec 12,15", + "spec 65,67" }; + + String[] dependencies = { }; + + String[] posMapped = { "POS_VERB", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADV", "POS_ADJ", "POS_PUNCT", "POS_PRON", + "POS_VERB", "POS_DET", "POS_ADJ", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_CONJ", "POS_VERB", "POS_ADJ", + "POS_PUNCT" }; + + String[] posOriginal = { "vmip000", "di0000", "nc0s000", "sp000", "nc0s000", "rg", + "aq0000", "fc", "pr000000", "vmip000", "da0000", "aq0000", "nc0s000", "sp000", + "nc0p000", "cc", "nc0p000", "cs", "vssp000", "aq0000", "fp" }; + + String pennTree = "(ROOT (sentence (grup.verb (vmip000 Necesitamos)) (sn (spec " + + "(di0000 una)) (grup.nom (nc0s000 oración) (sp (prep (sp000 de)) (sn " + + "(grup.nom (nc0s000 ejemplo)))) (S (sadv (grup.adv (rg muy))) (participi " + + "(aq0000 complicado))) (S (fc ,) (relatiu (pr000000 que)) (grup.verb " + + "(vmip000 contiene)) (sn (spec (da0000 la)) (grup.nom (s.a (grup.a " + + "(aq0000 mayor))) (nc0s000 cantidad) (sp (prep (sp000 de)) (sn (grup.nom " + + "(grup.nom (nc0p000 componentes)) (conj (cc y)) (grup.nom " + + "(nc0p000 dependencias))))))) (S (conj (cs como)) (grup.verb (vssp000 sea)) " + + "(s.a (grup.a (aq0000 posible))))))) (fp .)))"; + + String[] posTags = SPANISH_POS_TAGS; + + String[] constituentTags = { "ROOT", "S", "conj", "f", "gerundi", "grup.a", "grup.adv", + "grup.cc", "grup.cs", "grup.nom", "grup.prep", "grup.pron", "grup.verb", "grup.w", + "grup.z", "inc", "infinitiu", "interjeccio", "morfema.pronominal", "morfema.verbal", + "neg", "participi", "prep", "relatiu", "s.a", "sadv", "sentence", "sn", "sp", + "spec" }; + + String[] unmappedPos = { ".$$.", "359000", "NCMS000", "word" }; + + String[] unmappedConst = { "f" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertTagset(POS.class, "ancora", posTags, jcas); + assertTagsetMapping(POS.class, "ancora", unmappedPos, jcas); + assertTagset(Constituent.class, "ancora", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ancora", unmappedConst, jcas); +// assertTagset(Dependency.class, "stanford341", depTags, jcas); +// assertTagsetMapping(Dependency.class, "stanford341", unmappedDep, jcas); + } + + /** + * Tests the parser reading pre-existing POS tags + * + * @throws Exception + * if there is an error. + */ + @Test + public void testExistingPos() + throws Exception + { + AnalysisEngineDescription engine = createEngineDescription( + createEngineDescription(StanfordPosTagger.class), + createEngineDescription(StanfordParser.class, + StanfordParser.PARAM_READ_POS, true, + StanfordParser.PARAM_WRITE_POS, false, + StanfordParser.PARAM_WRITE_PENN_TREE, true)); + + JCas jcas = TestRunner.runTest(engine, "en", "This is a test ."); + + String[] posOriginal = new String[] { "DT", "VBZ", "DT", "NN", "." }; + + String pennTree = "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test))) (. .)))"; + + assertPOS(null, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + } + + @Test + public void testFrenchFactored() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("fr", "factored", "Nous avons besoin d' une phrase par exemple très " + + "compliqué , qui contient des constituants que de nombreuses dépendances et que " + + "possible ."); + + String[] constituentMapped = { "ADJP 128,136", "ADVP 32,48", "NP 11,48", "NP 21,48", + "NP 61,64", "NP 74,90", "NP 95,120", "PP 18,48", "ROOT 0,138", "S 0,138", + "SBAR 124,136", "SBAR 61,90", "SBAR 91,136", "VP 0,58", "VP 65,73", "X 121,136", + "X 32,43" }; + + String[] constituentOriginal = { "AP 128,136", "AdP 32,48", "COORD 121,136", "MWADV 32,43", + "NP 11,48", "NP 21,48", "NP 61,64", "NP 74,90", "NP 95,120", "PP 18,48", + "ROOT 0,138", "SENT 0,138", "Srel 61,90", "Ssub 124,136", "Ssub 91,136", "VN 0,58", + "VN 65,73" }; + + String[] dependencies = {/** No dependencies for French */ }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADV", + "POS_VERB", "POS_PUNCT", "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_CONJ", "POS_DET", "POS_ADJ", "POS_NOUN", + "POS_CONJ", "POS_CONJ", "POS_ADJ", "POS_PUNCT" }; + + String[] posOriginal = { "CLS", "V", "NC", "P", "DET", "NC", "P", "N", "ADV", "VPP", + "PUNC", "PROREL", "V", "DET", "NC", "CS", "DET", "ADJ", "NC", "CC", "CS", "ADJ", + "PUNC" }; + + String pennTree = "(ROOT (SENT (VN (CLS Nous) (V avons) (NP (NC besoin) (PP (P d') (NP " + + "(DET une) (NC phrase) (AdP (MWADV (P par) (N exemple)) (ADV très))))) " + + "(VPP compliqué)) (PUNC ,) (Srel (NP (PROREL qui)) (VN (V contient)) (NP " + + "(DET des) (NC constituants))) (Ssub (CS que) (NP (DET de) (ADJ nombreuses) " + + "(NC dépendances)) (COORD (CC et) (Ssub (CS que) (AP (ADJ possible))))) " + + "(PUNC .)))"; + + String[] posTags = FRENCH_POS_TAGS; + + String[] constituentTags = { "AP", "AdP", "COORD", "MWA", "MWADV", "MWC", "MWCL", "MWD", + "MWET", "MWI", "MWN", "MWP", "MWPRO", "MWV", "NP", "PP", "ROOT", "SENT", "Sint", + "Srel", "Ssub", "VN", "VPinf", "VPpart" }; + + // NO DEP TAGS String[] depTags = {}; + + String[] unmappedPos = { ".$$." }; + + String[] unmappedConst = { "MWA", "MWADV", "MWC", "MWCL", "MWD", "MWET", + "MWI", "MWN", "MWP", "MWPRO", "MWV" }; + + // NO DEP TAGS String[] unmappedDep = {}; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + + assertTagset(POS.class, "corenlp34", posTags, jcas); + assertTagsetMapping(POS.class, "corenlp34", unmappedPos, jcas); + assertTagset(Constituent.class, "ftb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ftb", unmappedConst, jcas); + // NO DEP TAGS assertTagset(Dependency.class, null, depTags, jcas); + // NO DEP TAGS assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); + } + + @Test + public void testFrench2() + throws Exception + { + Assume.assumeTrue(Runtime.getRuntime().maxMemory() > 1000000000); + + JCas jcas = runTest("fr", null, "La traduction d' un texte du français vers l' anglais ."); + + String[] constituentMapped = { "ADJP 29,37", "NP 0,53", "NP 17,37", "NP 43,53", "PP 14,37", + "PP 26,37", "PP 38,53", "ROOT 0,55", "S 0,55" }; + + String[] constituentOriginal = { "AP 29,37", "NP 0,53", "NP 17,37", "NP 43,53", "PP 14,37", + "PP 26,37", "PP 38,53", "ROOT 0,55", "SENT 0,55" }; + + String[] posMapped = { "POS_DET", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", "POS_ADJ", "POS_ADP", "POS_DET", + "POS_NOUN", "POS_PUNCT" }; + + String[] posOriginal = { "DET", "NC", "P", "DET", "NC", "P", "ADJ", "P", "DET", "NC", + "PUNC" }; + + String pennTree = "(ROOT (SENT (NP (DET La) (NC traduction) (PP (P d') (NP (DET un) " + + "(NC texte) (PP (P du) (AP (ADJ français))))) (PP (P vers) (NP (DET l') " + + "(NC anglais)))) (PUNC .)))"; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + } + + @Test + public void testChineseFactored() + throws Exception + { + JCas jcas = runTest("zh", "factored", + "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); + + String[] constituentMapped = { "ADJP 12,14", "ADJP 9,14", "ADVP 20,22", "ADVP 37,40", + "ADVP 9,11", "NP 0,2", "NP 17,19", "NP 23,25", "NP 29,34", "NP 32,34", "NP 6,19", + "QP 29,31", "QP 6,8", "ROOT 0,47", "VP 26,34", "VP 26,45", "VP 3,19", "VP 37,45", + "VP 43,45", "X 0,19", "X 0,47", "X 20,45", "X 37,42", "X 9,16" }; + + String[] constituentOriginal = { "ADJP 12,14", "ADJP 9,14", "ADVP 20,22", "ADVP 37,40", + "ADVP 9,11", "DNP 9,16", "DVP 37,42", "IP 0,19", "IP 0,47", "IP 20,45", "NP 0,2", + "NP 17,19", "NP 23,25", "NP 29,34", "NP 32,34", "NP 6,19", "QP 29,31", "QP 6,8", + "ROOT 0,47", "VP 26,34", "VP 26,45", "VP 3,19", "VP 37,45", "VP 43,45" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](我们) G[3,5](需要)", + "[ 3, 5]ROOT(root,basic) D[3,5](需要) G[3,5](需要)", + "[ 6, 8]Dependency(nummod,basic) D[6,8](一个) G[17,19](句子)", + "[ 9, 11]ADVMOD(advmod,basic) D[9,11](非常) G[12,14](复杂)", + "[ 12, 14]Dependency(assmod,basic) D[12,14](复杂) G[17,19](句子)", + "[ 15, 16]Dependency(assm,basic) D[15,16](的) G[12,14](复杂)", + "[ 17, 19]DOBJ(dobj,basic) D[17,19](句子) G[3,5](需要)", + "[ 20, 22]ADVMOD(advmod,basic) D[20,22](例如) G[26,28](包含)", + "[ 23, 25]NSUBJ(nsubj,basic) D[23,25](其中) G[26,28](包含)", + "[ 26, 28]CONJ(conj,basic) D[26,28](包含) G[3,5](需要)", + "[ 29, 31]Dependency(nummod,basic) D[29,31](许多) G[32,34](成分)", + "[ 32, 34]DOBJ(dobj,basic) D[32,34](成分) G[26,28](包含)", + "[ 35, 36]CC(cc,basic) D[35,36](和) G[26,28](包含)", + "[ 37, 40]Dependency(dvpmod,basic) D[37,40](尽可能) G[43,45](依赖)", + "[ 41, 42]Dependency(dvpm,basic) D[41,42](的) G[37,40](尽可能)", + "[ 43, 45]CONJ(conj,basic) D[43,45](依赖) G[26,28](包含)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NUM", "POS_ADJ", "POS_ADJ", "POS_PART", + "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", + "POS_ADJ", "POS_PART", "POS_VERB", "POS_PUNCT" }; + + String[] posOriginal = { "PN", "VV", "CD", "AD", "JJ", "DEG", "NN", "AD", "NN", "VV", "CD", + "NN", "CC", "AD", "DEV", "VV", "PU" }; + + String pennTree = "(ROOT (IP (IP (NP (PN 我们)) (VP (VV 需要) (NP (QP (CD 一个)) (DNP " + + "(ADJP (ADVP (AD 非常)) (ADJP (JJ 复杂))) (DEG 的)) (NP (NN 句子))))) (IP (ADVP " + + "(AD 例如)) (NP (NN 其中)) (VP (VP (VV 包含) (NP (QP (CD 许多)) (NP (NN 成分)))) " + + "(CC 和) (VP (DVP (ADVP (AD 尽可能)) (DEV 的)) (VP (VV 依赖))))) (PU 。)))"; + + String[] posTags = { ".$$.", "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", + "DEV", "DT", "ETC", "FW", "IJ", "JJ", "LB", "LC", "M", "MSP", "NN", "NR", "NT", + "OD", "ON", "P", "PN", "PU", "SB", "SP", "URL", "VA", "VC", "VE", "VV", "X" }; + + String[] constituentTags = { "ADJP", "ADVP", "CLP", "CP", "DFL", "DNP", "DP", "DVP", "FLR", + "FRAG", "INC", "INTJ", "IP", "LCP", "LST", "NP", "PP", "PRN", "QP", "ROOT", "UCP", + "VCD", "VCP", "VNV", "VP", "VPT", "VRD", "VSB", "WHPP" }; + + // NO DEP TAGS String[] depTags = new String[] {}; + + String[] unmappedPos = { ".$$.", "URL" }; + + String[] unmappedConst = { "DFL", "FLR", "INC", "WHPP" }; + + // NO DEP TAGS String[] unmappedDep = new String[] {}; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ctb", posTags, jcas); + assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); + assertTagset(Constituent.class, "ctb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ctb", unmappedConst, jcas); + // NO DEP TAGS assertTagset(Dependency.class, null, depTags, jcas); + // NO DEP TAGS assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); + } + + @Test + public void testChineseXinhuaFactored() + throws Exception + { + JCas jcas = runTest("zh", "xinhua-factored", + "我们 需要 一个 非常 复杂 的 句子 例如 其中 包含 许多 成分 和 尽可能 的 依赖 。"); + + String[] constituentMapped = { "ADVP 20,22", "ADVP 37,40", "ADVP 9,11", "NP 0,2", + "NP 17,19", "NP 23,25", "NP 29,34", "NP 32,34", "NP 43,45", "NP 6,45", "NP 9,19", + "QP 29,31", "QP 6,8", "ROOT 0,47", "VP 12,14", "VP 26,34", "VP 26,40", "VP 3,45", + "VP 37,40", "VP 9,14", "X 0,47", "X 20,40", "X 9,14", "X 9,16", "X 9,40", + "X 9,42" }; + + String[] constituentOriginal = { "ADVP 20,22", "ADVP 37,40", "ADVP 9,11", "CP 9,16", + "CP 9,42", "IP 0,47", "IP 20,40", "IP 9,14", "IP 9,40", "NP 0,2", "NP 17,19", + "NP 23,25", "NP 29,34", "NP 32,34", "NP 43,45", "NP 6,45", "NP 9,19", "QP 29,31", + "QP 6,8", "ROOT 0,47", "VP 12,14", "VP 26,34", "VP 26,40", "VP 3,45", "VP 37,40", + "VP 9,14" }; + + String[] dependencies = { + "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](我们) G[3,5](需要)", + "[ 3, 5]ROOT(root,basic) D[3,5](需要) G[3,5](需要)", + "[ 6, 8]Dependency(nummod,basic) D[6,8](一个) G[43,45](依赖)", + "[ 9, 11]ADVMOD(advmod,basic) D[9,11](非常) G[12,14](复杂)", + "[ 12, 14]RCMOD(rcmod,basic) D[12,14](复杂) G[17,19](句子)", + "[ 15, 16]Dependency(cpm,basic) D[15,16](的) G[12,14](复杂)", + "[ 17, 19]NSUBJ(nsubj,basic) D[17,19](句子) G[26,28](包含)", + "[ 20, 22]ADVMOD(advmod,basic) D[20,22](例如) G[26,28](包含)", + "[ 23, 25]NSUBJ(nsubj,basic) D[23,25](其中) G[26,28](包含)", + "[ 26, 28]RCMOD(rcmod,basic) D[26,28](包含) G[43,45](依赖)", + "[ 29, 31]Dependency(nummod,basic) D[29,31](许多) G[32,34](成分)", + "[ 32, 34]DOBJ(dobj,basic) D[32,34](成分) G[26,28](包含)", + "[ 35, 36]CC(cc,basic) D[35,36](和) G[26,28](包含)", + "[ 37, 40]CONJ(conj,basic) D[37,40](尽可能) G[26,28](包含)", + "[ 41, 42]Dependency(cpm,basic) D[41,42](的) G[26,28](包含)", + "[ 43, 45]DOBJ(dobj,basic) D[43,45](依赖) G[3,5](需要)" }; + + String[] posMapped = { "POS_PRON", "POS_VERB", "POS_NUM", "POS_ADJ", "POS_VERB", "POS_PART", + "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_VERB", "POS_NUM", "POS_NOUN", "POS_CONJ", + "POS_ADJ", "POS_PART", "POS_NOUN", "POS_PUNCT" }; + + String[] posOriginal = { "PN", "VV", "CD", "AD", "VA", "DEC", "NN", "AD", "NN", "VV", "CD", + "NN", "CC", "AD", "DEC", "NN", "PU" }; + + String pennTree = "(ROOT (IP (NP (PN 我们)) (VP (VV 需要) (NP (QP (CD 一个)) (CP (IP (NP " + + "(CP (IP (VP (ADVP (AD 非常)) (VP (VA 复杂)))) (DEC 的)) (NP (NN 句子))) (IP " + + "(ADVP (AD 例如)) (NP (NN 其中)) (VP (VP (VV 包含) (NP (QP (CD 许多)) (NP " + + "(NN 成分)))) (CC 和) (VP (ADVP (AD 尽可能)))))) (DEC 的)) (NP (NN 依赖)))) " + + "(PU 。)))"; + + String[] posTags = { ".$$.", "AD", "AS", "BA", "CC", "CD", "CS", "DEC", "DEG", "DER", + "DEV", "DT", "ETC", "FW", "JJ", "LB", "LC", "M", "MSP", "NN", "NR", "NT", "OD", + "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" }; + + String[] constituentTags = { "ADJP", "ADVP", "CLP", "CP", "DNP", "DP", "DVP", "FRAG", "IP", + "LCP", "LST", "NP", "PP", "PRN", "QP", "ROOT", "UCP", "VCD", "VCP", "VNV", "VP", + "VPT", "VRD", "VSB" }; + + // NO DEP TAGS String[] depTags = new String[] {}; + + String[] unmappedPos = { ".$$." }; + + String[] unmappedConst = { }; + + // NO DEP TAGS String[] unmappedDep = new String[] {}; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "ctb", posTags, jcas); + assertTagsetMapping(POS.class, "ctb", unmappedPos, jcas); + assertTagset(Constituent.class, "ctb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "ctb", unmappedConst, jcas); + // NO DEP TAGS assertTagset(Dependency.class, null, depTags, jcas); + // NO DEP TAGS assertTagsetMapping(Dependency.class, null, unmappedDep, jcas); + } + + @Test + public void testArabicFactored() + throws Exception + { + JCas jcas = runTest("ar", "factored", + "نحتاج مثالا معقدا جدا ل جملة تحتوي على أكبر قدر ممكن من العناصر و الروابط ."); + + String[] constituentMapped = { "NP 24,28", "NP 24,73", "NP 39,73", "NP 44,52", "NP 44,73", + "NP 56,73", "NP 6,21", "PP 22,73", "PP 35,73", "PP 53,73", "ROOT 0,75", "S 0,75", + "S 29,73", "SBAR 29,73", "VP 0,73", "VP 29,73" }; + + String[] constituentOriginal = { "NP 24,28", "NP 24,73", "NP 39,73", "NP 44,52", + "NP 44,73", "NP 56,73", "NP 6,21", "PP 22,73", "PP 35,73", "PP 53,73", "ROOT 0,75", + "S 0,75", "S 29,73", "SBAR 29,73", "VP 0,73", "VP 29,73" }; + + String[] dependencies = {}; + + String pennTree = "(ROOT (S (VP (VBP نحتاج) (NP (NN مثالا) (JJ معقدا) (NN جدا)) (PP (IN ل) (NP " + + "(NP (NN جملة)) (SBAR (S (VP (VBP تحتوي) (PP (IN على) (NP (NN أكبر) (NP (NP (NN قدر) " + + "(JJ ممكن)) (PP (IN من) (NP (DTNN العناصر) (CC و) (DTNN الروابط)))))))))))) (PUNC .)))"; + + String[] posMapped = { "POS_VERB", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_VERB", "POS_ADP", "POS_NOUN", + "POS_NOUN", "POS_ADJ", "POS_ADP", "POS_NOUN", "POS_CONJ", "POS_NOUN", "POS_PUNCT" }; + + String[] posOriginal = { "VBP", "NN", "JJ", "NN", "IN", "NN", "VBP", "IN", "NN", "NN", + "JJ", "IN", "DTNN", "CC", "DTNN", "PUNC" }; + + String[] posTags = { ".$$.", "ADJ_NUM", "CC", "CD", "DT", "DTJJ", "DTJJR", "DTNN", "DTNNP", + "DTNNPS", "DTNNS", "IN", "JJ", "JJR", "NN", "NNP", "NNPS", "NNS", "NOUN_QUANT", + "PRP", "PRP$", "PUNC", "RB", "RP", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VN", + "WP", "WRB" }; + + String[] constituentTags = { "ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", + "PP", "PRN", "PRT", "ROOT", "S", "SBAR", "SBARQ", "SQ", "UCP", "VP", "WHADVP", + "WHNP", "WHPP", "X" }; + + String[] unmappedPos = { ".$$.", "ADJ_NUM", "NOUN_QUANT", "PRP$" }; + + String[] unmappedConst = { "LST" }; + + assertPOS(posMapped, posOriginal, select(jcas, POS.class)); + assertPennTree(pennTree, selectSingle(jcas, PennTree.class)); + assertConstituents(constituentMapped, constituentOriginal, + select(jcas, Constituent.class)); + assertDependencies(dependencies, select(jcas, Dependency.class)); + assertTagset(POS.class, "atb", posTags, jcas); + assertTagsetMapping(POS.class, "atb", unmappedPos, jcas); + assertTagset(Constituent.class, "atb", constituentTags, jcas); + assertTagsetMapping(Constituent.class, "atb", unmappedConst, jcas); + } + + /** + * This tests whether a complete syntax tree can be recreated from the annotations without any + * loss. Consequently, all links to children should be correct. (This makes no assertions about + * the parent-links, because they are not used for the recreation) + * + * @throws Exception + * if there is an error. + */ + @Test + public void testEnglishSyntaxTreeReconstruction() + throws Exception + { + JCas jcas = runTest("en", "factored", "We need a very complicated example sentence , which " + + "contains as many constituents and dependencies as possible ."); + + String pennOriginal = ""; + String pennFromRecreatedTree = ""; + + // As we only have one input sentence, each loop only runs once! + + for (PennTree curPenn : select(jcas, PennTree.class)) { + // get original penn representation of syntax tree + pennOriginal = curPenn.getPennTree(); + } + + for (ROOT curRoot : select(jcas, ROOT.class)) { + // recreate syntax tree + Tree recreation = TreeUtils.createStanfordTree(curRoot); + + // make a tree with simple string-labels + recreation = recreation.deepCopy(recreation.treeFactory(), StringLabel.factory()); + + pennFromRecreatedTree = recreation.pennString(); + } + + assertTrue("The recreated syntax-tree did not match the input syntax-tree.", + pennOriginal.equals(pennFromRecreatedTree)); + } + +// @Test +// public void testModelSharing() +// throws Exception +// { +// // Save share override value (if any was set) and enable sharing for the StanfordParser +// String prop = "dkpro.core.resourceprovider.sharable." + StanfordParser.class.getName(); +// String oldValue = System.getProperty(prop); +// System.setProperty(prop, "true"); +// +// final List<LoggingEvent> records = new ArrayList<LoggingEvent>(); +// +// // Tell the logger to log everything +// Logger rootLogger = org.apache.log4j.LogManager.getRootLogger(); +// final org.apache.log4j.Level oldLevel = rootLogger.getLevel(); +// rootLogger.setLevel(org.apache.log4j.Level.ALL); +// Appender appender = (Appender) rootLogger.getAllAppenders().nextElement(); +// // Capture output, log only what would have passed the original logging level +// appender.addFilter(new org.apache.log4j.spi.Filter() +// { +// @Override +// public int decide(LoggingEvent event) +// { +// records.add(event); +// return event.getLevel().toInt() >= oldLevel.toInt() +// ? org.apache.log4j.spi.Filter.NEUTRAL +// : org.apache.log4j.spi.Filter.DENY; +// } +// }); +// +// try { +// AnalysisEngineDescription pipeline = createEngineDescription( +// createEngineDescription(StanfordParser.class, +// StanfordParser.PARAM_WRITE_CONSTITUENT, true, +// StanfordParser.PARAM_WRITE_DEPENDENCY, false), +// createEngineDescription(StanfordParser.class, +// StanfordParser.PARAM_WRITE_CONSTITUENT, false, +// StanfordParser.PARAM_WRITE_DEPENDENCY, true)); +// +// JCas jcas = TestRunner.runTest(pipeline, "en", "This is a test ."); +// +// boolean found = false; +// for (LoggingEvent e : records) { +// if (String.valueOf(e.getMessage()).contains("Used resource from cache")) { +// found = true; +// } +// } +// +// assertTrue("No log message about using the cached resource was found!", found); +// +// String[] dependencies = { +// "[ 0, 4]NSUBJ(nsubj,basic) D[0,4](This) G[10,14](test)", +// "[ 5, 7]COP(cop,basic) D[5,7](is) G[10,14](test)", +// "[ 8, 9]DET(det,basic) D[8,9](a) G[10,14](test)", +// "[ 10, 14]ROOT(root,basic) D[10,14](test) G[10,14](test)" }; +// +// assertDependencies(dependencies, select(jcas, Dependency.class)); +// } +// finally { +// if (oldLevel != null) { +// rootLogger.setLevel(oldLevel); +// appender.clearFilters(); +// } +// +// if (oldValue != null) { +// System.setProperty(prop, oldValue); +// } +// else { +// System.clearProperty(prop); +// } +// } +// } + + private JCas runTestWithPosTagger(String aLanguage, String aVariant, String aText, + Object... aExtraParams) + throws Exception + { + AssumeResource.assumeResource(StanfordPosTagger.class, "tagger", aLanguage, null); + AssumeResource.assumeResource(StanfordParser.class, "parser", aLanguage, aVariant); + + AggregateBuilder aggregate = new AggregateBuilder(); + + aggregate.add(createEngineDescription(StanfordPosTagger.class)); + + Object[] params = new Object[] { + StanfordParser.PARAM_VARIANT, aVariant, + StanfordParser.PARAM_PRINT_TAGSET, true, + StanfordParser.PARAM_WRITE_CONSTITUENT, true, + StanfordParser.PARAM_WRITE_DEPENDENCY, true, + StanfordParser.PARAM_WRITE_PENN_TREE, true, + StanfordParser.PARAM_READ_POS, true, + StanfordParser.PARAM_WRITE_POS, false}; + params = ArrayUtils.addAll(params, aExtraParams); + aggregate.add(createEngineDescription(StanfordParser.class, params)); + + return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); + } + + private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) + throws Exception + { + AssumeResource.assumeResource(StanfordParser.class, "parser", aLanguage, aVariant); + + AggregateBuilder aggregate = new AggregateBuilder(); + + Object[] params = new Object[] { + StanfordParser.PARAM_VARIANT, aVariant, + StanfordParser.PARAM_PRINT_TAGSET, true, + StanfordParser.PARAM_WRITE_CONSTITUENT, true, + StanfordParser.PARAM_WRITE_DEPENDENCY, true, + StanfordParser.PARAM_WRITE_PENN_TREE, true, + StanfordParser.PARAM_WRITE_POS, true}; + params = ArrayUtils.addAll(params, aExtraParams); + aggregate.add(createEngineDescription(StanfordParser.class, params)); + + return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); + } + + private JCas runTest(String aLanguage, String aVariant, String[] aTokens) + throws Exception + { + AssumeResource.assumeResource(StanfordParser.class, "parser", aLanguage, aVariant); + + // setup English + AnalysisEngineDescription parser = createEngineDescription(StanfordParser.class, + StanfordParser.PARAM_VARIANT, aVariant, + StanfordParser.PARAM_PRINT_TAGSET, true, + StanfordParser.PARAM_WRITE_CONSTITUENT, true, + StanfordParser.PARAM_WRITE_DEPENDENCY, true, + StanfordParser.PARAM_WRITE_PENN_TREE, true, + StanfordParser.PARAM_WRITE_POS, true, + StanfordParser.PARAM_WRITE_PENN_TREE, true, + StanfordParser.PARAM_QUOTE_BEGIN, new String[] { "‘" }, + StanfordParser.PARAM_QUOTE_END, new String[] { "’" }); + + AnalysisEngine engine = createEngine(parser); + JCas jcas = engine.newJCas(); + jcas.setDocumentLanguage(aLanguage); + + JCasBuilder builder = new JCasBuilder(jcas); + for (String t : aTokens) { + builder.add(t, Token.class); + builder.add(" "); + } + builder.add(0, Sentence.class); + builder.close(); + + engine.process(jcas); + + return jcas; + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTest.java similarity index 75% rename from dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTest.java rename to dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTest.java index 4e1a102aae..e35d9439ec 100644 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTest.java +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; @@ -26,37 +26,37 @@ import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.resources.ResourceObjectProviderBase; +import org.dkpro.core.languagetool.LanguageToolSegmenter; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolSegmenter; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class StanfordPosTaggerTest { - @Test - public void testEnglish() - throws Exception - { + @Test + public void testEnglish() + throws Exception + { runTest("en", "This is a test . \n", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + new String[] { "DT", "VBZ", "DT", "NN", "." }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("en", "A neural net . \n", - new String[] { "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + new String[] { "DT", "JJ", "NN", "." }, + new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); runTest("en", "John is purchasing oranges . \n", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } + new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + } @Test public void testEnglishExtra() @@ -72,37 +72,33 @@ public void testEnglishExtra() runTest("en", "twitter-fast", "John is purchasing oranges . \n", new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); runTest("en", "caseless-left3words-distsim", "john is purchasing oranges . \n", new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); runTest("en", "wsj-0-18-caseless-left3words-distsim", "john is purchasing oranges . \n", new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); } - @Test - public void testGerman() - throws Exception + @Test + public void testGerman() + throws Exception { runTest("de", "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("de", "ud", "Das ist ein Test .", new String[] { "PRON", "VERB", "DET", "NOUN", "PUNCT" }, new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("de", "hgc", "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("de", "dewac", "Das ist ein Test .", - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); runTest("de", "fast-caseless", "das ist ein test .", new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, @@ -151,10 +147,10 @@ public void testFrench2() { JCas jcas = runTest("fr", null, "La traduction d'un texte du français vers l'anglais."); - String[] posMapped = { "POS_DET", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", "POS_NOUN", "POS_ADP", "POS_DET", - "POS_NOUN", "POS_PUNCT" }; + String[] posMapped = { "POS_DET", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_ADP", + "POS_DET", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_PUNCT" }; - String[] posOriginal = { "DET", "NC", "P", "DET", "NC", "P", "NC", "P", "DET", "NC", + String[] posOriginal = { "DET", "NC", "P", "DET", "NC", "P", "DET", "NC", "P", "DET", "NC", "PUNC" }; AssertAnnotations.assertPOS(posMapped, posOriginal, select(jcas, POS.class)); @@ -178,33 +174,35 @@ public void testPortuguese() public void testChinese() throws Exception { - // The rudder often in the wake of the wind round the back of the area. + // The rudder often in the wake of the wind round the back of the area. runTest("zh", "尾 舵 常 处于 风轮 后面 的 尾流 区里 。", - new String[] { "NN", "NN", "AD", "VV", "NN", "NN", "DEG", "NN", "NN", "PU" }, + new String[] { "NN", "NN", "AD", "VV", "NN", "NN", "DEG", "NN", "NN", "PU" }, new String[] { "POS_NOUN", "POS_NOUN", "POS_ADJ", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_PART", "POS_NOUN", "POS_NOUN", "POS_PUNCT" }); // The service sector has become an important engine of Guangdong's economic transformation // and upgrading. runTest("zh", "服务业 成为 广东 经济 转型 升级 的 重要 引擎 。", - new String[] { "NN", "VV", "NR", "NN", "VV", "VV", "DEC", "JJ", "NN", "PU" }, + new String[] { "NN", "VV", "NR", "NN", "VV", "VV", "DEC", "JJ", "NN", "PU" }, new String[] { "POS_NOUN", "POS_VERB", "POS_PROPN", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_PART", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); // How far is China from the world brand? runTest("zh", "中国 离 世界 技术 品牌 有 多远 ?", - new String[] { "NR", "P", "NN", "NN", "NN", "VE", "VV", "PU" } , - new String[] { "POS_PROPN", "POS_ADP", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_PUNCT" } ); + new String[] { "NR", "P", "NN", "NN", "NN", "VE", "VV", "PU" } , + new String[] { "POS_PROPN", "POS_ADP", "POS_NOUN", "POS_NOUN", "POS_NOUN", + "POS_VERB", "POS_VERB", "POS_PUNCT" }); } @Test public void testArabic() throws Exception { - // Covering the following sub-Saharan countries with vast areas very + // Covering the following sub-Saharan countries with vast areas very runTest("ar", "تغطي الصحراء الكبرى الدول التالية بمساحات شاسعة جدا", - new String[] { "VBP", "DTNN", "DTJJR", "DTNN", "DTJJ", "NNS", "JJ", "NN" }, - new String[] { "POS_VERB", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADJ", "POS_NOUN" }); + new String[] { "VBP", "DTNN", "DTJJR", "DTNN", "DTJJ", "NNS", "JJ", "NN" }, + new String[] { "POS_VERB", "POS_NOUN", "POS_ADJ", "POS_NOUN", "POS_ADJ", "POS_NOUN", + "POS_ADJ", "POS_NOUN" }); } @Test @@ -221,7 +219,8 @@ public void testEscaping() throws Exception { runTest("en", "This is a ( small ) test . \n", new String[] { "DT", "VBZ", "DT", "-LRB-", "JJ", "-RRB-", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_PUNCT", "POS_ADJ", "POS_PUNCT", "POS_NOUN", "POS_PUNCT" }); + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_PUNCT", "POS_ADJ", + "POS_PUNCT", "POS_NOUN", "POS_PUNCT" }); } /** @@ -262,11 +261,12 @@ private JCas runTest(String aLanguage, String aVariant, String aText, Object... return jcas; } - private void runTest(String language, String testDocument, String[] tags, String[] tagClasses) - throws Exception - { - runTest(language, null, testDocument, tags, tagClasses); - } + + private void runTest(String language, String testDocument, String[] tags, String[] tagClasses) + throws Exception + { + runTest(language, null, testDocument, tags, tagClasses); + } private void runTest(String language, String variant, String testDocument, String[] tags, String[] tagClasses) @@ -275,8 +275,8 @@ private void runTest(String language, String variant, String testDocument, Strin AssumeResource.assumeResource(StanfordPosTagger.class, "tagger", language, variant); AnalysisEngine engine = createEngine(StanfordPosTagger.class, - StanfordPosTagger.PARAM_VARIANT, variant, StanfordPosTagger.PARAM_PRINT_TAGSET, - true); + StanfordPosTagger.PARAM_VARIANT, variant, + StanfordPosTagger.PARAM_PRINT_TAGSET, true); JCas aJCas = TestRunner.runTest(engine, language, testDocument); AssertAnnotations.assertPOS(tagClasses, tags, select(aJCas, POS.class)); diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTrainerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTrainerTest.java similarity index 86% rename from dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTrainerTest.java rename to dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTrainerTest.java index 6c4a1ab272..eb70585ed0 100644 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPosTaggerTrainerTest.java +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPosTaggerTrainerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,9 +14,9 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -30,18 +30,18 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.ConfigurationParameterFactory; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.api.datasets.Dataset; +import org.dkpro.core.api.datasets.DatasetFactory; +import org.dkpro.core.api.datasets.Split; +import org.dkpro.core.eval.EvalUtil; +import org.dkpro.core.eval.model.Span; +import org.dkpro.core.eval.report.Result; +import org.dkpro.core.io.conll.Conll2006Reader; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Dataset; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory; -import de.tudarmstadt.ukp.dkpro.core.api.datasets.Split; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.eval.EvalUtil; -import de.tudarmstadt.ukp.dkpro.core.eval.model.Span; -import de.tudarmstadt.ukp.dkpro.core.eval.report.Result; -import de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2006Reader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; public class StanfordPosTaggerTrainerTest { @@ -92,9 +92,7 @@ public void test() StanfordPosTagger.PARAM_MODEL_LOCATION, new File(targetFolder, "model.bin")); List<Span<String>> actual = EvalUtil.loadSamples(iteratePipeline(testReader, ner), - POS.class, pos -> { - return pos.getPosValue(); - }); + POS.class, pos -> pos.getPosValue()); System.out.printf("Actual samples: %d%n", actual.size()); // Read reference data collect labels diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPtbTransformerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPtbTransformerTest.java similarity index 80% rename from dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPtbTransformerTest.java rename to dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPtbTransformerTest.java index 3ae1ba5092..3f734b8082 100644 --- a/dkpro-core-stanfordnlp-gpl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordPtbTransformerTest.java +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordPtbTransformerTest.java @@ -1,5 +1,5 @@ -/** - * Copyright 2007-2017 +/* + * Copyright 2007-2019 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * @@ -14,12 +14,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see http://www.gnu.org/licenses/. + * along with this program. If not, see http://www.gnu.org/licenses/. */ -package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; +package org.dkpro.core.stanfordnlp; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; + import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.junit.Test; @@ -32,7 +33,8 @@ public void test() String expected = "``Hey you!'', John said."; String input = "\"Hey you!\", John said."; - AnalysisEngineDescription normalizer = createEngineDescription(StanfordPtbTransformer.class); + AnalysisEngineDescription normalizer = createEngineDescription( + StanfordPtbTransformer.class); assertTransformedText(expected, input, "en", normalizer); } diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordSegmenterTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordSegmenterTest.java new file mode 100644 index 0000000000..1538ad0311 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordSegmenterTest.java @@ -0,0 +1,201 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import edu.stanford.nlp.ie.crf.CRFClassifier; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.objectbank.ObjectBank; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.sequences.SeqClassifierFlags; +import edu.stanford.nlp.util.CoreMap; + +public class StanfordSegmenterTest +{ + @Test + public void run() throws Throwable + { + AnalysisEngineDescription aed = createEngineDescription(StanfordSegmenter.class); + + SegmenterHarness.run(aed, "de.1", "de.2", "de.3", "de.4", "en.9", "ar.1", "zh.1", "zh.2"); + } + + @Test + public void testEnglishSpeech() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText("'Let's go! I want to see the Don', he said."); + + AnalysisEngine aed = createEngine(StanfordSegmenter.class); + aed.process(jcas); + + String[] tokens = { "'", "Let", "'s", "go", "!", "I", "want", "to", "see", "the", "Don", + "'", ",", "he", "said", "." }; + + AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); + } + + @Test + public void testFrench() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("fr"); + jcas.setDocumentText("Tim a dit Jamie pour la 100e fois de quitter la salle ."); + + AnalysisEngine aed = createEngine(StanfordSegmenter.class); + aed.process(jcas); + + String[] tokens = { "Tim", "a", "dit", "Jamie", "pour", "la", "100e", "fois", "de", + "quitter", "la", "salle", "." }; + + AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); + } + + @Test + public void testSpanish() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage("es"); + jcas.setDocumentText("Tim dijo a Jamie para la 100ª vez que abandone la sala."); + + AnalysisEngine aed = createEngine(StanfordSegmenter.class); + aed.process(jcas); + + String[] tokens = { "Tim", "dijo", "a", "Jamie", "para", "la", "100ª", "vez", "que", + "abandone", "la", "sala", "." }; + + AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); + } + + @Test + public void testUnwrapped() throws Exception + { + String text = "\"Hey you!\", John said."; + + String[] expectedSentences = { "0 10 \"Hey you!\"", "10 22 , John said." }; + String[] expectedTokens = { "0 1 `` \"", "1 4 Hey Hey", "5 8 you you", "8 9 ! !", + "9 10 '' \"", "10 11 , ,", "12 16 John John", "17 21 said said", "21 22 . ." }; + + List<String> sentences = new ArrayList<String>(); + List<String> tokens = new ArrayList<String>(); + + Properties props = new Properties(); + props.setProperty("annotators", "tokenize, ssplit"); + StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + Annotation annotation = pipeline.process(text); + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + sentences.add(String.format("%d %d %s", + sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), + sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class), + sentence.get(CoreAnnotations.TextAnnotation.class))); + for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { + tokens.add(String.format("%d %d %s %s", + token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), + token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class), + token.get(CoreAnnotations.TextAnnotation.class), + token.get(CoreAnnotations.OriginalTextAnnotation.class))); + } + } + +// System.out.println(AssertAnnotations.asCopyableString(sentences, true)); +// System.out.println(AssertAnnotations.asCopyableString(tokens, true)); + + assertEquals(asList(expectedSentences), sentences); + assertEquals(asList(expectedTokens), tokens); + } + + @Ignore("This is completely incomplete so far") + @Test + public void testChinese() throws Exception + { + Properties props = new Properties(); + props.setProperty("sighanCorporaDict", "target/download/segmenter/stanford-segmenter-2014-01-04/data"); + props.setProperty("sighanPostProcessing", "true"); + props.setProperty("loadClassifier", "target/download/segmenter/stanford-segmenter-2014-01-04/data/ctb.gz"); + props.setProperty("serDictionary", "target/download/segmenter/stanford-segmenter-2014-01-04/data/dict-chris6.ser.gz"); + + SeqClassifierFlags flags = new SeqClassifierFlags(); + flags.setProperties(props, false); + CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(flags); + crf.loadClassifierNoExceptions(flags.loadClassifier, props); + crf.loadTagIndex(); + + String sentence = "我们需要一个非常复杂的句子例如其中包含许多成分和尽可能的依赖。"; + + System.out.println(crf.segmentString(sentence)); + + ObjectBank<List<CoreLabel>> docs = crf.makeObjectBankFromString(sentence, + crf.defaultReaderAndWriter()); + + StringWriter stringWriter = new StringWriter(); + PrintWriter stringPrintWriter = new PrintWriter(stringWriter); + for (List<CoreLabel> doc : docs) { + crf.classify(doc); +// for (CoreLabel w : doc) { +// System.out.printf("%s %s %s %s%n", +// String.valueOf(w.get(CoreAnnotations.PositionAnnotation.class)), +// String.valueOf(w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)), +// String.valueOf(w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)), +// String.valueOf(w.get(CoreAnnotations.AnswerAnnotation.class))); +// } + crf.defaultReaderAndWriter().printAnswers(doc, stringPrintWriter); + stringPrintWriter.println(); + } + stringPrintWriter.close(); + String segmented = stringWriter.toString(); + + System.out.println(Arrays.asList(segmented.split("\\s"))); + } + + @Test + public void testZoning() throws Exception + { + SegmenterHarness.testZoning(StanfordSegmenter.class); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordSentimentAnalyzerTest.java b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordSentimentAnalyzerTest.java new file mode 100644 index 0000000000..5c717de05d --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/test/java/org/dkpro/core/stanfordnlp/StanfordSentimentAnalyzerTest.java @@ -0,0 +1,64 @@ +/* + * Copyright 2007-2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ +package org.dkpro.core.stanfordnlp; + +import static org.junit.Assert.assertTrue; + +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.component.CasDumpWriter; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.util.CasCreationUtils; +import org.junit.Ignore; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.sentiment.type.StanfordSentimentAnnotation; + +/** + * Test for {@link org.dkpro.core.stanfordnlp.StanfordSentimentAnalyzer} + */ +@Ignore("https://github.com/dkpro/dkpro-core/issues/779") +public class StanfordSentimentAnalyzerTest { + + @Test + public void testSentiment() throws Exception + { + CAS cas = CasCreationUtils + .createCas(TypeSystemDescriptionFactory.createTypeSystemDescription(), null, null); + cas.setDocumentLanguage("en"); + cas.setDocumentText("I feel very very bad."); + Sentence s = new Sentence(cas.getJCas(), 0, cas.getDocumentText().length()); + s.addToIndexes(); + + SimplePipeline.runPipeline(cas, + AnalysisEngineFactory.createEngineDescription(StanfordSentimentAnalyzer.class), + AnalysisEngineFactory.createEngineDescription(CasDumpWriter.class) + ); + + StanfordSentimentAnnotation sentimentAnnotation = JCasUtil.select(cas.getJCas(), + StanfordSentimentAnnotation.class).iterator().next(); + + // more negative than positive + assertTrue(sentimentAnnotation.getNegative() + sentimentAnnotation.getVeryNegative() + > sentimentAnnotation.getPositive() + sentimentAnnotation.getVeryPositive()); + } +} diff --git a/dkpro-core-stanfordnlp-gpl/src/test/resources/log4j.properties b/dkpro-core-stanfordnlp-gpl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-stanfordnlp-gpl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-stanfordnlp-gpl/src/test/resources/log4j2.xml b/dkpro-core-stanfordnlp-gpl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-stanfordnlp-gpl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-stopwordremover-asl/pom.xml b/dkpro-core-stopwordremover-asl/pom.xml deleted file mode 100644 index 132ad4fe13..0000000000 --- a/dkpro-core-stopwordremover-asl/pom.xml +++ /dev/null @@ -1,69 +0,0 @@ -<!-- - Copyright 2017 - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> - <relativePath>../dkpro-core-asl</relativePath> - </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.stopwordremover-asl</artifactId> - <packaging>jar</packaging> - <name>DKPro Core ASL - Stop Word Remover (ASL)</name> - <dependencies> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimaj-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimafit-core</artifactId> - </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> - <scope>test</scope> - </dependency> - </dependencies> -</project> diff --git a/dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordRemover.java b/dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordRemover.java deleted file mode 100644 index 8e5a6a3894..0000000000 --- a/dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordRemover.java +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.stopwordremover; - -import static de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils.resolveLocation; -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.CasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.getView; -import static org.apache.uima.util.Level.FINE; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Logger; - -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathInfo; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Remove all of the specified types from the CAS if their covered text is in the stop word - * dictionary. Also remove any other of the specified types that is covered by a matching instance. - */ -@ResourceMetaData(name="Stop Word Remover") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord" }) -public class StopWordRemover - extends JCasAnnotator_ImplBase -{ - // VIEW NAMES - private static final String TOPIC_VIEW = "topic"; - private static final String DOC_VIEW = "doc"; - - /** - * A list of URLs from which to load the stop word lists. If an URL is prefixed with a language - * code in square brackets, the stop word list is only used for documents in that language. - * Using no prefix or the prefix "[*]" causes the list to be used for every document. - * Example: "[de]classpath:/stopwords/en_articles.txt" - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) - private Set<String> swFileNames; - - /** - * The character encoding used by the model. - */ - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") - private String modelEncoding; - - /** - * Feature paths for annotations that should be matched/removed. The default is - * - * <pre> - * StopWord.class.getName() - * Token.class.getName() - * Lemma.class.getName()+"/value" - * </pre> - */ - public static final String PARAM_PATHS = "Paths"; - @ConfigurationParameter(name = PARAM_PATHS, mandatory = false) - private Set<String> paths; - - /** - * Anything annotated with this type will be removed even if it does not match any word in the - * lists. - */ - public static final String PARAM_STOP_WORD_TYPE = "StopWordType"; - @ConfigurationParameter(name = PARAM_STOP_WORD_TYPE, mandatory = false) - private String stopWordType; - - private Map<String, StopWordSet> stopWordSets; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - // Set default paths. This cannot be done in the annotation because we cannot call - // methods there. - if (paths == null || paths.size() == 0) { - paths = new HashSet<String>(); - paths.add(StopWord.class.getName()); - paths.add(Token.class.getName()); - paths.add(Lemma.class.getName()+"/value"); - } - - // Set default stop word type. This cannot be done in the annotation because we cannot call - // methods there. - if (stopWordType == null) { - stopWordType = StopWord.class.getName(); - } - - try { - stopWordSets = new HashMap<String, StopWordSet>(); - for (String swFileName : swFileNames) { - String fileLocale = "*"; - // Check if a locale is defined for the file - if (swFileName.startsWith("[")) { - fileLocale = swFileName.substring(1, swFileName.indexOf(']')); - swFileName = swFileName.substring(swFileName.indexOf(']')+1); - } - - // Fetch the set for the specified locale - StopWordSet set = stopWordSets.get(fileLocale); - if (set == null) { - set = new StopWordSet(); - stopWordSets.put(fileLocale, set); - } - - // Load the set - URL source = resolveLocation(swFileName, this, context); - InputStream is = null; - try { - is = source.openStream(); - set.load(is, modelEncoding); - } - finally { - closeQuietly(is); - } - - getLogger().info( - "Loaded stopwords for locale [" + fileLocale + "] from [" + source + "]"); - } - } - catch (IOException e1) { - throw new ResourceInitializationException(e1); - } - } - - @Override - public void process(JCas jcas) - throws AnalysisEngineProcessException - { - JCas doc = getView(jcas, DOC_VIEW, null); - JCas topic = getView(jcas, TOPIC_VIEW, null); - - try { - if (doc != null) { - check(doc); - } - - if (topic != null) { - check(topic); - } - - if (topic == null && doc == null) { - check(jcas); - } - } - catch (FeaturePathException e) { - throw new AnalysisEngineProcessException(e); - } - } - - private void check(JCas aJCas) - throws FeaturePathException - { - Logger log = getContext().getLogger(); - - Locale casLocale = new Locale(aJCas.getDocumentLanguage()); - StopWordSet anyLocaleSet = stopWordSets.get("*"); - StopWordSet casLocaleSet = stopWordSets.get(aJCas.getDocumentLanguage()); - - // Now really to the removal part - FeaturePathInfo fp = new FeaturePathInfo(); - for (String path : paths) { - // Create a sorted list of annotations that we can quickly search on - AnnotationFS[] candidates = getCandidates(aJCas); - - // Initialize list of annotations to remove - List<AnnotationFS> toRemove = new ArrayList<AnnotationFS>(); - - // Separate Typename and featurepath - String[] segments = path.split("/", 2); - - String typeName = segments[0]; - boolean isStopWordType = stopWordType.equals(typeName); - Type t = aJCas.getTypeSystem().getType(typeName); - if (t == null) { - throw new IllegalStateException("Type [" + typeName + "] not found in type system"); - } - - // initialize the FeaturePathInfo with the corresponding part - if (segments.length > 1) { - fp.initialize(segments[1]); - } - else { - fp.initialize(""); - } - - int safeStart = 0; - Iterator<Annotation> i = aJCas.getAnnotationIndex(t).iterator(); - while (i.hasNext()) { - Annotation anno = i.next(); - - // Move the start of the containment scanning range ahead if possible - while ((safeStart + 1) < candidates.length - && candidates[safeStart + 1].getEnd() < anno.getBegin()) { - safeStart++; - } - - String candidate = fp.getValue(anno).toLowerCase(casLocale); - if (isStopWordType || ((anyLocaleSet != null) && anyLocaleSet.contains(candidate)) - || ((casLocaleSet != null) && casLocaleSet.contains(candidate))) { - // Remove the annotation that matched the stop word - toRemove.add(anno); - if (log.isLoggable(FINE)) { - log.log(FINE, "Removing [" - + typeName.substring(typeName.lastIndexOf('.') + 1) - + "] annotated as stop word [" + anno.getCoveredText() + "]@" - + anno.getBegin() + ".." + anno.getEnd()); - } - - // Scan all potential annotations that may be covered the current - // annotation and remove them as well - int n = safeStart; - while (n < candidates.length && candidates[n].getBegin() < anno.getEnd()) { - if ((anno.getBegin() <= candidates[n].getBegin()) - && (candidates[n].getEnd() <= anno.getEnd())) { - if (log.isLoggable(FINE)) { - log.log(FINE, "Removing as well [" - + candidates[n].getClass().getSimpleName() - + "] annotated as stop word [" - + candidates[n].getCoveredText() + "]@" - + candidates[n].getBegin() + ".." + candidates[n].getEnd()); - } - toRemove.add(candidates[n]); - } - n++; - } - } - } - - // Remove from the CAS - for (AnnotationFS anno : toRemove) { - aJCas.removeFsFromIndexes(anno); - } - } - } - - private AnnotationFS[] getCandidates(JCas aJCas) - { - // Make a list of all the annotations that can be matched by the given paths. If any one - // of the paths match, we want to remove instances of all others being covered by the - // match as well. - List<AnnotationFS> candidateList = new ArrayList<AnnotationFS>(); - for (String path : paths) { - String[] segments = path.split("/", 2); - String typeName = segments[0]; - Type t = aJCas.getTypeSystem().getType(typeName); - if (t == null) { - throw new IllegalStateException("Type [" + typeName + "] not found in type system"); - } - - for (AnnotationFS fs : select(aJCas.getCas(), t)) { - candidateList.add(fs); - } - } - AnnotationFS[] candidates = candidateList.toArray(new AnnotationFS[candidateList.size()]); - Arrays.sort(candidates, new BeginEndComparator()); - return candidates; - - } - - static class BeginEndComparator implements Comparator<AnnotationFS> - { - @Override - public int compare(AnnotationFS aO1, AnnotationFS aO2) - { - if (aO1.getBegin() == aO2.getBegin()) { - return aO1.getEnd() - aO2.getEnd(); - } - else { - return aO1.getBegin() - aO2.getBegin(); - } - } - } -} diff --git a/dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/package-info.java b/dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/package-info.java deleted file mode 100644 index e92956b889..0000000000 --- a/dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Remove stop words to exclude them from further processing. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.stopwordremover; diff --git a/dkpro-core-testing-asl/pom.xml b/dkpro-core-testing-asl/pom.xml index 24e7b84f03..8ceb524397 100644 --- a/dkpro-core-testing-asl/pom.xml +++ b/dkpro-core-testing-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <artifactId>dkpro-core-testing-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Unit Testing Support</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -48,44 +49,44 @@ <artifactId>spring-core</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.anomaly-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-anomaly-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.coref-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-coref-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.semantics-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-semantics-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.ner-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-ner-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> <groupId>org.reflections</groupId> @@ -97,12 +98,45 @@ <artifactId>junit</artifactId> </dependency> <dependency> - <groupId>log4j</groupId> - <artifactId>log4j</artifactId> + <groupId>org.assertj</groupId> + <artifactId>assertj-core</artifactId> </dependency> <dependency> <groupId>org.slf4j</groupId> - <artifactId>slf4j-log4j12</artifactId> + <artifactId>slf4j-api</artifactId> + </dependency> + + + <!-- The logging subsystem used by DKPro Core during testing --> + <dependency> + <!-- Use Log4J http as logging backend --> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <scope>runtime</scope> + </dependency> + <dependency> + <!-- Use Log4J http as logging backend --> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j-impl</artifactId> + <scope>runtime</scope> + </dependency> + <dependency> + <!-- Route Log4J v1 over SLF4J --> + <groupId>org.slf4j</groupId> + <artifactId>log4j-over-slf4j</artifactId> + <scope>runtime</scope> + </dependency> + <dependency> + <!-- Route Java Commons Logging over SLF4J --> + <groupId>org.slf4j</groupId> + <artifactId>jcl-over-slf4j</artifactId> + <scope>runtime</scope> + </dependency> + <dependency> + <!-- Avoid Commons Logging --> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <scope>provided</scope> </dependency> </dependencies> <build> @@ -115,18 +149,29 @@ <usedDependencies> <!-- - We want to avoid the need to each tested module to explicitly - - depend on log4j, so we depend on it here. - --> - <usedDependency>log4j:log4j</usedDependency> - <!-- - - Also a default testing dependency to avoid warnings on all libraries that use - - SLF4J + - depend on log4j/slf4j, so we depend on it here. --> - <usedDependency>org.slf4j:slf4j-log4j12</usedDependency> + <usedDependency>org.apache.logging.log4j:log4j-slf4j-impl</usedDependency> + <usedDependency>org.slf4j:log4j-over-slf4j</usedDependency> + <usedDependency>org.slf4j:jcl-over-slf4j</usedDependency> + <usedDependency>org.apache.logging.log4j:log4j-core</usedDependency> + <usedDependency>commons-logging:commons-logging</usedDependency> </usedDependencies> </configuration> </plugin> </plugins> </pluginManagement> + <plugins> + <plugin> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-maven-plugin</artifactId> + <configuration> + <!-- None of the components in this module should be exported to OpenMinTeD --> + <uimaDescriptorExcludes> + <exclude>**/*.xml</exclude> + </uimaDescriptorExcludes> + </configuration> + </plugin> + </plugins> </build> </project> \ No newline at end of file diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/AssumeResource.java b/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/AssumeResource.java deleted file mode 100644 index afffffd440..0000000000 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/AssumeResource.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.testing; - -import static org.junit.Assume.assumeTrue; - -import java.io.IOException; -import java.util.Properties; - -import org.springframework.core.io.support.PropertiesLoaderUtils; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; - -public class AssumeResource -{ - public static void assumeResource(Class<?> aClass, String aTool, String aLanguage, - String aVariant) - throws IOException - { - String pack = aClass.getPackage().getName().replace('.', '/'); - assumeResource(aClass, pack, aTool, aLanguage, aVariant); - } - - public static void assumeResource(Class<?> aClass, String aPackage, String aTool, - String aLanguage, String aVariant) - throws IOException - { - String variant = aVariant; - - // Handle default variants - if (variant == null) { - String pack = aClass.getPackage().getName().replace('.', '/'); - String defModelsLoc = pack + "/lib/" + aTool + "-default-variants.map"; - Properties defaultVariants = PropertiesLoaderUtils.loadAllProperties(defModelsLoc); - variant = defaultVariants.getProperty(aLanguage); - if (variant == null) { - variant = defaultVariants.getProperty("*"); - } - } - - // Check if the model exists by checking for it's DKPro Core metadata file - boolean exists; - try { - String propLoc = "classpath:/" + aPackage + "/lib/" + aTool + "-" + aLanguage + "-" - + variant + ".properties"; - ResourceUtils.resolveLocation(propLoc); - exists = true; - } - catch (IOException e) { - exists = false; - } - - if (!exists) { - // The English default model should always be included in the default test dependencies, so - // issue a special warning here - if (aVariant == null && "en".equals(aLanguage)) { - System.out.println("[" + aClass.getSimpleName() + "] default model not available: [" - + aLanguage + "] [" + variant + "]!"); - } - else { - System.out.println("[" + aClass.getSimpleName() + "] model not available: [" - + aLanguage + "] [" + variant + "] - skipping"); - } - } - - assumeTrue("[" + aClass.getSimpleName() + "] model not available: [" + aLanguage + "] [" - + aVariant + "]", exists); - } -} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/DocumentMetaDataStripper.java b/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/DocumentMetaDataStripper.java deleted file mode 100644 index 79e8e882de..0000000000 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/DocumentMetaDataStripper.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.testing; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; - -/** - * Removes fields from the document meta data which may be different depending on the machine a - * test is run on. - */ -@ResourceMetaData(name="DocumentMetaData Stripper") -@TypeCapability( - inputs={"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}, - outputs={"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) - -public class DocumentMetaDataStripper - extends JCasAnnotator_ImplBase -{ - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - try { - DocumentMetaData meta = DocumentMetaData.get(aJCas); - meta.setDocumentBaseUri(null); - meta.setDocumentUri(null); - meta.setCollectionId(null); - } - catch (IllegalArgumentException e) { - // No metadata in the CAS. - } - } -} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TestOptions.java b/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TestOptions.java deleted file mode 100644 index 0441d89411..0000000000 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TestOptions.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.testing; - -import java.io.File; -import java.util.HashSet; -import java.util.Set; -import java.util.function.BiConsumer; - -import de.tudarmstadt.ukp.dkpro.core.testing.validation.checks.Check; - -public class TestOptions -{ - Set<Class<? extends Check>> skippedChecks = new HashSet<>(); - BiConsumer<File, File> resultAssertor; - - public TestOptions skipCheck(Class<? extends Check> aCheck) - { - skippedChecks.add(aCheck); - return this; - } - - public TestOptions resultAssertor(BiConsumer<File, File> aResultComparator) - { - resultAssertor = aResultComparator; - return this; - } -} \ No newline at end of file diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/dumper/DependencyDumper.java b/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/dumper/DependencyDumper.java deleted file mode 100644 index 4fb84f715e..0000000000 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/dumper/DependencyDumper.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.testing.dumper; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasConsumer_ImplBase; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; - -/** - * Dump dependencies to screen. - */ -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) -public class DependencyDumper - extends JCasConsumer_ImplBase -{ - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - for (Dependency dep : select(aJCas, Dependency.class)) { - System.out.format("%-10s [%s] [%s]%n", dep.getDependencyType(), dep.getGovernor() - .getCoveredText(), dep.getDependent().getCoveredText()); - } - } -} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/harness/SegmenterHarness.java b/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/harness/SegmenterHarness.java deleted file mode 100644 index 37ae09dff2..0000000000 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/harness/SegmenterHarness.java +++ /dev/null @@ -1,386 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.testing.harness; - -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang3.ArrayUtils; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; -import org.junit.Assert; -import org.junit.internal.AssumptionViolatedException; - -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; - -public final class SegmenterHarness -{ - public static final TestData[] DATA = new TestData[] { - new TestData("de.1", "de", "Herr Frank M. Meier hat einen Hund.", - new String[] { "Herr", "Frank", "M.", "Meier", "hat", "einen", - "Hund", "."}, - new String[] { "Herr Frank M. Meier hat einen Hund." }), - new TestData("de.2", "de", "Ich bin ein blöder Hund.", - new String[] { "Ich", "bin", "ein", "blöder", "Hund", "." }, - new String[] { "Ich bin ein blöder Hund." }), - new TestData("de.3", "de", "Mein Name ist Hans.", - new String[] { "Mein", "Name", "ist", "Hans", "." }, - new String[] { "Mein Name ist Hans." }), - // DKPRO-CORE-ASL-98: BreakIteratorSegmenter turns hypens to separate tokens - new TestData("de.4", "de", "ihre Negativbei- spiele immer", - new String[] { "ihre", "Negativbei-", "spiele", "immer" }, - new String[] { "ihre Negativbei- spiele immer" }), - - new TestData("en.1", "en", "Sadler, A.L. Cha-No-Yu: The Japanese Tea Ceremony.", - new String[] { "Sadler", ",", "A.L.", "Cha-No-Yu", ":", "The", - "Japanese", "Tea", "Ceremony", "."}, - new String[] { "Sadler, A.L. Cha-No-Yu: The Japanese Tea Ceremony." } ), - new TestData("en.2", "en", "I love the UIMA toolkit. 1989 is the year in which the Berlin wall fell.", - new String[] { "I", "love", "the", "UIMA", "toolkit", ".", - "1989", "is", "the", "year", "in", "which", "the", "Berlin", - "wall", "fell", "." }, - new String[] { "I love the UIMA toolkit.", - "1989 is the year in which the Berlin wall fell." }), - new TestData("en.3", "en", "I'm not a girl.", - new String[] { "I", "'m", "not", "a", "girl", "." }, - new String[] { "I'm not a girl." }), - new TestData("en.4", "en", "I am a stupid dog.", - new String[] { "I", "am", "a", "stupid", "dog", "." }, - new String[] { "I am a stupid dog." }), - new TestData("en.5", "en", "Georg \"Bullseye\" Logal is a though guy.", - new String[] { "Georg", "\"", "Bullseye", "\"", "Logal", - "is", "a", "though", "guy", "." }, - new String[] { "Georg \"Bullseye\" Logal is a though guy." }), - new TestData("en.6", "en", "This doesn't compute.", - new String[] { "This", "does", "n't", "compute", "." }, - new String[] { "This doesn't compute." }), - new TestData("en.7", "en", "based on\n 'Carnival of Souls', written by [...] and directed by [...].", - new String[] { "based", "on", "'", "Carnival", "of", "Souls", - "'", ",", "written", "by", "[", "...", "]", "and", "directed", - "by", "[", "...", "]", "." }, - new String[] { "based on\n 'Carnival of Souls', written by [...] and directed by [...]." }), - new TestData("en.8", "en", ", , ,", - new String[] { ",", ",", "," }, - new String[] { ", , ," }), - new TestData("en.9", "en", "How to tokenize smileys? This is a good example. >^,,^< :0 3:[", - new String[] { "How", "to", "tokenize", "smileys", "?", "This", "is", "a", "good", "example.", ">^,,^<", ":0", "3:[" }, - new String[] { "How to tokenize smileys?", "This is a good example.", ">^,,^< :0 3:[" }), - - // Sombody who can read arabic, please check this - // Covering the following sub-Saharan countries with vast areas very - new TestData("ar.1", "ar", "تغطي الصحراء الكبرى الدول التالية بمساحات شاسعة جدا", - new String[] { "تغطي", "الصحراء", "الكبرى", "الدول", "التالية", - "مساحات", "شاسعة", "جدا" }, - new String[] { "تغطي الصحراء الكبرى الدول التالية بمساحات شاسعة جدا" }), - - // While the stanford parser should come with a proper tokenizer - // for Chinese (because it can parse chinese text), this does not - // seem to be the right one or I am using it wrong. The associated - // test cases do not work. Maybe debugging the command below - // would help to find out how to use it. - // They use command to parse it: java -mx1g -cp "stanford-parser.jar" - // edu.stanford.nlp.parser.lexparser.LexicalizedParser -tLPP - // edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -sentences - // newline -escaper - // edu.stanford.nlp.trees.international.pennchinese.ChineseEscaper - // -outputFormat "penn,typedDependencies" -outputFormatOptions - // "removeTopBracket" xinhuaFactoredSegmenting.ser.gz sampleInput.txt. - new TestData("zh.1", "zh", "服务业成为广东经济转型升级的重要引擎。", - new String[] {"服务业", "成为", "广东", "经济", "转型", "升级", "的", - "重要", "引擎", "。"}, - new String[] {"服务业成为广东经济转型升级的重要引擎。"}), - new TestData("zh.2", "zh", "中国离世界技术品牌有多远?", - new String[] {"中国", "离", "世界", "技术", "品牌", "有", "多远", - "?" }, - new String[] { "中国离世界技术品牌有多远?" }) - }; - - private SegmenterHarness() - { - // No instances - } - - @FunctionalInterface - public static interface AssumeResourcePredicate { - void assume(String aLanguage, String aVariant) - throws AssumptionViolatedException, IOException; - } - - public static void run(AnalysisEngineDescription aAed, String... aIgnoreIds) - throws Throwable - { - run(aAed, null, aIgnoreIds); - } - - public static void run(AnalysisEngineDescription aAed, AssumeResourcePredicate aCheck, - String... aIgnoreIds) - throws Throwable - { - // No automatic downloading from repository during testing. This makes sure we fail if - // models are not properly added as test dependencies. - if (offline) { - System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); - } - offline = true; - - AnalysisEngine ae = createEngine(aAed); - JCas jCas = ae.newJCas(); - - List<String> results = new ArrayList<String>(); - - try { - for (TestData td : DATA) { - System.out.printf("== %s ==%n", td.id); - jCas.reset(); - - if (aCheck != null) { - try { - aCheck.assume(td.language, null); - } - catch (AssumptionViolatedException e) { - results.add(String.format("%s skipped", td.id)); - continue; - } - } - - jCas.setDocumentLanguage(td.language); - jCas.setDocumentText(td.text); - - boolean failed = false; - - try { - ae.process(jCas); - - AssertAnnotations.assertSentence(td.sentences, select(jCas, Sentence.class)); - AssertAnnotations.assertToken(td.tokens, select(jCas, Token.class)); - - results.add(String.format("%s OK", td.id)); - } - catch (Throwable e) { - failed = true; - if (!ArrayUtils.contains(aIgnoreIds, td.id)) { - results.add(String.format("%s FAIL", td.id)); - throw e; - } - else { - results.add(String.format("%s FAIL - Known, ignored", td.id)); - } - } - - if (!failed && ArrayUtils.contains(aIgnoreIds, td.id)) { - results.add(String.format("%s FAIL", td.id)); - Assert.fail(td.id + " passed but was expected to fail"); - } - } - } - finally { - System.out.println("=== RESULTS ==="); - for (String r : results) { - System.out.println(r); - } - } - } - - public static void testZoning(Class<? extends SegmenterBase> aSegmenter) - throws Exception - { - testZoning(aSegmenter, "en"); - } - - public static void testZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) - throws Exception - { - testLaxZoning(aSegmenter, aLanguage); - testStrictZoning(aSegmenter, aLanguage); - testOufOfBoundsZones(aSegmenter, aLanguage); - } - - public static void testLaxZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) - throws Exception - { - // No automatic downloading from repository during testing. This makes sure we fail if - // models are not properly added as test dependencies. - if (offline) { - System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); - } - offline = true; - - String[] sentences = { "A a a a .", "A a a a -", "B b b b .", "B b b b -", "C c c c .", - "C c c c -" }; - - String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "B", "b", "b", "b", - ".", "B", "b", "b", "b", "-", "C", "c", "c", "c", ".", "C", "c", "c", "c", "-" }; - - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage(aLanguage); - // 1 1 2 2 3 3 4 4 5 5 6 - // 0 5 0 5 0 5 0 5 0 5 0 5 0 - // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- - jcas.setDocumentText("A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"); - // |------------------| |------------------| - new Paragraph(jcas, 0, 19).addToIndexes(); - new Paragraph(jcas, 40, 59).addToIndexes(); - - AnalysisEngine ae = createEngine(aSegmenter, - SegmenterBase.PARAM_STRICT_ZONING, false, - SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); - ae.process(jcas); - - assertToken(tokens, select(jcas, Token.class)); - assertSentence(sentences, select(jcas, Sentence.class)); - } - - public static void testOufOfBoundsZones(Class<? extends SegmenterBase> aSegmenter, - String aLanguage) - throws Exception - { - // No automatic downloading from repository during testing. This makes sure we fail if - // models are not properly added as test dependencies. - if (offline) { - System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); - } - offline = true; - - // 1 1 2 2 3 3 4 4 5 5 6 - // 0 5 0 5 0 5 0 5 0 5 0 5 0 - // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- - String text = "A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"; - // |------------------| |------------------| - - // non-strict zoning - { - String[] sentences = { "A a a a .", "A a a a -", "B b b b .", "B b b b -", "C c c c .", - "C c c c -" }; - - String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "B", "b", "b", - "b", ".", "B", "b", "b", "b", "-", "C", "c", "c", "c", ".", "C", "c", "c", "c", - "-" }; - - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage(aLanguage); - jcas.setDocumentText(text); - new Paragraph(jcas, 0, 19).addToIndexes(); - new Paragraph(jcas, 40, 65).addToIndexes(); - - AnalysisEngine ae = createEngine(aSegmenter, - SegmenterBase.PARAM_STRICT_ZONING, false, - SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); - ae.process(jcas); - - assertToken(tokens, select(jcas, Token.class)); - assertSentence(sentences, select(jcas, Sentence.class)); - } - - // strict zoning - { - String[] sentences = { "A a a a .", "A a a a -", "C c c c .", "C c c c -" }; - - String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "C", "c", "c", - "c", ".", "C", "c", "c", "c", "-" }; - - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage(aLanguage); - jcas.setDocumentText(text); - new Paragraph(jcas, 0, 19).addToIndexes(); - new Paragraph(jcas, 40, 65).addToIndexes(); - - AnalysisEngine ae = createEngine(aSegmenter, - SegmenterBase.PARAM_STRICT_ZONING, true, - SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); - ae.process(jcas); - - assertToken(tokens, select(jcas, Token.class)); - assertSentence(sentences, select(jcas, Sentence.class)); - } - } - - public static void testStrictZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) - throws Exception - { - // No automatic downloading from repository during testing. This makes sure we fail if - // models are not properly added as test dependencies. - if (offline) { - System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); - } - offline = true; - - String[] sentences = { "A a a a .", "A a a a -", "C c c c .", "C c c c -" }; - - String[] tokens = { - "A", "a", "a", "a", ".", - "A", "a", "a", "a", "-", - "C", "c", "c", "c", ".", - "C", "c", "c", "c", "-" }; - - JCas jcas = JCasFactory.createJCas(); - jcas.setDocumentLanguage(aLanguage); - // 1 1 2 2 3 3 4 4 5 5 6 - // 0 5 0 5 0 5 0 5 0 5 0 5 0 - // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- - jcas.setDocumentText("A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"); - // |------------------| |------------------| - new Paragraph(jcas, 0, 19).addToIndexes(); - new Paragraph(jcas, 40, 59).addToIndexes(); - - AnalysisEngine ae = createEngine(aSegmenter, - SegmenterBase.PARAM_STRICT_ZONING, true, - SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); - ae.process(jcas); - - assertToken(tokens, select(jcas, Token.class)); - assertSentence(sentences, select(jcas, Sentence.class)); - } - - static class TestData - { - final String id; - final String language; - final String text; - final String[] sentences; - final String[] tokens; - - public TestData(String aId, String aLanguage, String aText, String[] aTokens, String[] aSentences) - { - id = aId; - language = aLanguage; - text = aText; - sentences = aSentences; - tokens = aTokens; - } - } - - private static boolean offline = true; - - public static void autoloadModelsOnNextTestRun() - { - offline = false; - } -} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/AssertAnnotations.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/AssertAnnotations.java similarity index 94% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/AssertAnnotations.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/AssertAnnotations.java index e19fab9bab..6a3aae691c 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/AssertAnnotations.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/AssertAnnotations.java @@ -15,21 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing; +package org.dkpro.core.testing; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; import static java.util.Arrays.asList; import static org.apache.commons.lang3.StringUtils.normalizeSpace; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.assertj.core.api.Assertions.assertThat; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; @@ -38,8 +40,6 @@ import java.util.Map; import java.util.stream.Collectors; -import junit.framework.Assert; - import org.apache.commons.lang3.ObjectUtils; import org.apache.commons.lang3.StringUtils; import org.apache.uima.UIMAException; @@ -58,11 +58,17 @@ import org.apache.uima.util.CasCopier; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; +import org.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser; +import org.dkpro.core.api.lexmorph.morph.internal.AnalysisMapping; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingUtils; +import org.dkpro.core.testing.validation.CasValidator; +import org.dkpro.core.testing.validation.Message; +import org.dkpro.core.testing.validation.checks.Check; import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.morph.internal.AnalysisMapping; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; @@ -70,9 +76,6 @@ import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; @@ -86,9 +89,7 @@ import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.CasValidator; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.checks.Check; +import junit.framework.Assert; public class AssertAnnotations { @@ -470,7 +471,8 @@ public static void assertPennTree(String aExpected, PennTree aActual) assertEquals(expected, actual); } - public static void assertPennTree(String aExpected[], Collection<PennTree> aActual) { + public static void assertPennTree(String[] aExpected, Collection<PennTree> aActual) + { List<PennTree> actual = new ArrayList<PennTree>(aActual); assertEquals(aExpected.length, aActual.size()); for (int i = 0; i < aExpected.length; i++) { @@ -541,8 +543,8 @@ public static void assertSemPred(String[] aExpected, Collection<SemPred> aActual args.sort(byRole); for (SemArgLink a : args) { - sb.append('(').append(a.getRole()).append(':').append(a.getTarget().getCoveredText()) - .append(')'); + sb.append('(').append(a.getRole()).append(':') + .append(a.getTarget().getCoveredText()).append(')'); } sb.append(']'); actual.add(sb.toString()); @@ -593,15 +595,17 @@ public static void assertCoreference(String[][] aExpected, Collection<Coreferenc asCopyableString(toText(i.links()))); } - if (aExpected.length == aActual.size()) { - for (int i = 0; i < actual.size(); i++) { - assertEquals(asCopyableString(asList(aExpected[i]), true), - asCopyableString(toText(actual.get(i).links()), true)); - } - } - else { - fail("Expected [" + aExpected.length + "] chains but found " + aActual.size() + "]"); - } + String[] expectedStrings = Arrays.stream(aExpected) + .map(it -> asCopyableString(asList(it), true)) + .toArray(String[]::new); + + String[] actualStrings = aActual.stream() + .map(it -> asCopyableString(toText(it.links()), true)) + .toArray(String[]::new); + + assertThat(actualStrings) + .describedAs("Chains match in any order") + .containsExactlyInAnyOrder(expectedStrings); } public static void assertTagset(Class<?> aLayer, String aName, String[] aExpected, JCas aJCas) @@ -642,8 +646,10 @@ public static void assertTagset(Class<?> aComponent, Class<?> aLayer, String aNa System.out.printf("%-20s : %s%n", "Layer", tsd.getLayer()); System.out.printf("%-20s : %s%n", "Tagset", tsd.getName()); System.out.printf("%-20s : %s%n", "Component", tsd.getComponentName()); - System.out.printf("%-20s : %s%n", "Model location", tsd.getModelLocation()); - System.out.printf("%-20s : %s%n", "Model language", tsd.getModelLanguage()); + System.out.printf( + "%-20s : %s%n", "Model location", tsd.getModelLocation()); + System.out.printf( + "%-20s : %s%n", "Model language", tsd.getModelLanguage()); System.out.printf("%-20s : %s%n", "Model variant", tsd.getModelVariant()); System.out.printf("%-20s : %s%n", "Model version", tsd.getModelVersion()); System.out.printf("%-20s : %b%n", "Input", tsd.getInput()); @@ -688,20 +694,16 @@ public static void assertTagsetMapping(Class<?> aComponent, Class<?> aLayer, Str { String pattern; if (aLayer == POS.class) { - pattern = "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/api/lexmorph/tagset/${language}-${tagset}-pos.map"; + pattern = "classpath:/org/dkpro/core/api/lexmorph/tagset/${language}-${tagset}-pos.map"; } else if (aLayer == Dependency.class) { - pattern = "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/api/syntax/tagset/${language}-${tagset}-dependency.map"; + pattern = "classpath:/org/dkpro/core/api/syntax/tagset/${language}-${tagset}-dependency.map"; } else if (aLayer == Constituent.class) { - pattern = "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/api/syntax/tagset/${language}-${tagset}-constituency.map"; + pattern = "classpath:/org/dkpro/core/api/syntax/tagset/${language}-${tagset}-constituency.map"; } else if (aLayer == Chunk.class) { - pattern = "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/api/syntax/tagset/${language}-${tagset}-chunk.map"; + pattern = "classpath:/org/dkpro/core/api/syntax/tagset/${language}-${tagset}-chunk.map"; } else { throw new IllegalArgumentException("Unsupported layer: " + aLayer.getName()); @@ -758,8 +760,10 @@ else if (aLayer == Chunk.class) { System.out.printf("%-20s : %s%n", "Layer", tsd.getLayer()); System.out.printf("%-20s : %s%n", "Tagset", tsd.getName()); System.out.printf("%-20s : %s%n", "Component", tsd.getComponentName()); - System.out.printf("%-20s : %s%n", "Model location", tsd.getModelLocation()); - System.out.printf("%-20s : %s%n", "Model language", tsd.getModelLanguage()); + System.out.printf("%-20s : %s%n", + "Model location", tsd.getModelLocation()); + System.out.printf("%-20s : %s%n", + "Model language", tsd.getModelLanguage()); System.out.printf("%-20s : %s%n", "Model variant", tsd.getModelVariant()); System.out.printf("%-20s : %s%n", "Model version", tsd.getModelVersion()); System.out.printf("%-20s : %b%n", "Input", tsd.getInput()); @@ -796,8 +800,7 @@ public static void assertTagsetParser(Class<?> aLayer, String aName, String[] aD String pattern; if (aLayer == MorphologicalFeatures.class) { - pattern = "classpath:/de/tudarmstadt/ukp/dkpro/" - + "core/api/lexmorph/tagset/${language}-${tagset}-morph.map"; + pattern = "classpath:/org/dkpro/core/api/lexmorph/tagset/${language}-${tagset}-morph.map"; } else { throw new IllegalArgumentException("Unsupported layer: " + aLayer.getName()); diff --git a/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/AssumeResource.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/AssumeResource.java new file mode 100644 index 0000000000..4c28c1cb51 --- /dev/null +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/AssumeResource.java @@ -0,0 +1,101 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.testing; + +import static org.junit.Assume.assumeTrue; + +import java.io.IOException; +import java.util.Properties; + +import org.dkpro.core.api.resources.ResourceUtils; +import org.springframework.core.io.support.PropertiesLoaderUtils; + +public class AssumeResource +{ + public static void assumeResource(Class<?> aClass, String aTool, String aLanguage, + String aVariant) + throws IOException + { + String pack = aClass.getPackage().getName().replace('.', '/'); + assumeResource(aClass, pack, aTool, aLanguage, aVariant); + } + + public static void assumeResource(Class<?> aClass, String aPackage, String aTool, + String aLanguage, String aVariant) + throws IOException + { + boolean exists = resourceAvailable(aClass, aPackage, aTool, aLanguage, aVariant); + + if (!exists && aPackage.startsWith("org/dkpro/core")) { + // Try the legacy packages + String pack = aPackage.replace("org/dkpro/core", "de/tudarmstadt/ukp/dkpro/core"); + exists = resourceAvailable(aClass, pack, aTool, aLanguage, aVariant); + } + + if (!exists) { + // The English default model should always be included in the default test dependencies, + // so issue a special warning here + if (aVariant == null && "en".equals(aLanguage)) { + System.out.println("[" + aClass.getSimpleName() + "] default model not available: [" + + aLanguage + "] [" + aVariant + "]!"); + } + else { + System.out.println("[" + aClass.getSimpleName() + "] model not available: [" + + aLanguage + "] [" + aVariant + "] - skipping"); + } + } + + assumeTrue("[" + aClass.getSimpleName() + "] model not available: [" + aLanguage + "] [" + + aVariant + "]", exists); + } + + private static boolean resourceAvailable(Class<?> aClass, String aPackage, String aTool, + String aLanguage, String aVariant) + throws IOException + { + String variant = aVariant; + + // Handle default variants - variants map files are always expected to be found relative + // to the class which needs them + if (variant == null) { + String pack = aClass.getPackage().getName().replace('.', '/'); + String defModelsLoc = pack + "/lib/" + aTool + "-default-variants.map"; + Properties defaultVariants = PropertiesLoaderUtils.loadAllProperties(defModelsLoc); + variant = defaultVariants.getProperty(aLanguage); + if (variant == null) { + variant = defaultVariants.getProperty("*"); + } + } + + // Check if the model exists by checking for it's DKPro Core metadata file + // Due do changes in the DKPro Core package and groupId names, the models may be in a + // different package than the component which uses them + boolean exists; + try { + String propLoc = "classpath:/" + aPackage + "/lib/" + aTool + "-" + aLanguage + "-" + + variant + ".properties"; + ResourceUtils.resolveLocation(propLoc); + exists = true; + } + catch (IOException e) { + exists = false; + } + + return exists; + } +} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/DkproTestContext.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/DkproTestContext.java similarity index 83% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/DkproTestContext.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/DkproTestContext.java index 346c0dd505..4800c8abfd 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/DkproTestContext.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/DkproTestContext.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing; +package org.dkpro.core.testing; import static org.apache.commons.lang3.StringUtils.isNotEmpty; import static org.apache.commons.lang3.StringUtils.substringAfterLast; @@ -23,12 +23,15 @@ import java.io.File; import org.apache.commons.io.FileUtils; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.FeatureStructureImplC; import org.junit.rules.TestWatcher; import org.junit.runner.Description; public class DkproTestContext extends TestWatcher { - private static final ThreadLocal<DkproTestContext> context = new ThreadLocal<DkproTestContext>() { + private static final ThreadLocal<DkproTestContext> context = new ThreadLocal<DkproTestContext>() + { @Override protected DkproTestContext initialValue() { @@ -48,12 +51,16 @@ protected void starting(Description aDescription) methodName = aDescription.getMethodName(); System.out.println("\n=== " + methodName + " ====================="); - // Route logging through log4j - System.setProperty("org.apache.uima.logger.class", "org.apache.uima.util.impl.Log4jLogger_impl"); + // V2 FS toString needed for CasDumpWriter. Also see comment in the root-level pom.xml + // file where this property is globally set for all surefire runs + System.setProperty(FeatureStructureImplC.V2_PRETTY_PRINT, "true"); + + // Route logging through SLF4J + System.setProperty("org.apache.uima.logger.class", "org.apache.uima.util.impl.Slf4jLogger_impl"); // Enable extra check for illegal updates to indexed features (effective with UIMA 2.7.0 // and higher) - System.setProperty("uima.exception_when_fs_update_corrupts_index", "true"); + System.setProperty(CASImpl.THROW_EXCEPTION_FS_UPDATES_CORRUPTS, "true"); context.set(this); } diff --git a/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/DocumentMetaDataStripper.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/DocumentMetaDataStripper.java new file mode 100644 index 0000000000..c682fe825b --- /dev/null +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/DocumentMetaDataStripper.java @@ -0,0 +1,53 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.testing; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +/** + * Removes fields from the document meta data which may be different depending on the machine a + * test is run on. + */ +@ResourceMetaData(name = "DocumentMetaData Stripper") +@TypeCapability( + inputs = {"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}, + outputs = {"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) + +public class DocumentMetaDataStripper + extends JCasAnnotator_ImplBase +{ + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + try { + DocumentMetaData meta = DocumentMetaData.get(aJCas); + meta.setDocumentBaseUri(null); + meta.setDocumentUri(null); + meta.setCollectionId(null); + } + catch (IllegalArgumentException e) { + // No metadata in the CAS. + } + } +} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/EOLUtils.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/EOLUtils.java similarity index 95% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/EOLUtils.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/EOLUtils.java index 567cf188e6..ccd5a7f7ad 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/EOLUtils.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/EOLUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing; +package org.dkpro.core.testing; public class EOLUtils { diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/IOTestRunner.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/IOTestRunner.java similarity index 81% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/IOTestRunner.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/IOTestRunner.java index e2cb4cfa7c..2cf8b3ef0e 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/IOTestRunner.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/IOTestRunner.java @@ -15,14 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing; +package org.dkpro.core.testing; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import static org.apache.uima.fit.factory.ConfigurationParameterFactory.canParameterBeSet; import static org.apache.uima.fit.factory.ConfigurationParameterFactory.getParameterSettings; import static org.apache.uima.fit.factory.ConfigurationParameterFactory.setParameter; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.contentOf; import static org.junit.Assert.assertEquals; import java.io.File; @@ -41,22 +44,30 @@ import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.testing.dumper.CasDumpWriter; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.CasValidator; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.testing.dumper.CasDumpWriter; +import org.dkpro.core.testing.validation.CasValidator; +import org.dkpro.core.testing.validation.Message; public class IOTestRunner { - private static final String RESOURCE_COLLECTION_READER_BASE = "de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase"; - private static final String JCAS_FILE_WRITER_IMPL_BASE = "de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase"; + private static final String RESOURCE_COLLECTION_READER_BASE = "org.dkpro.core.api.io.ResourceCollectionReaderBase"; + private static final String JCAS_FILE_WRITER_IMPL_BASE = "org.dkpro.core.api.io.JCasFileWriter_ImplBase"; + /** + * @deprecated Use {@link ReaderAssert} instead. + */ + @Deprecated public static void testRoundTrip(Class<? extends CollectionReader> aReader, Class<? extends AnalysisComponent> aWriter, String aFile) throws Exception { - testOneWay(createReaderDescription(aReader), createEngineDescription(aWriter), aFile, aFile); + ReaderAssert.assertThat(aReader) + .readingFrom("src/test/resources/" + aFile) + .usingWriter(aWriter) + .outputAsString(FilenameUtils.getName(aFile)) + .satisfies(output -> assertThat(output.trim()).isEqualToNormalizingNewlines( + contentOf(new File("src/test/resources/" + aFile), UTF_8).trim())); } public static void testRoundTrip(Class<? extends CollectionReader> aReader, @@ -78,6 +89,12 @@ public static void testRoundTrip(CollectionReaderDescription aReader, AnalysisEngineDescription aWriter, String aFile) throws Exception { +// ReaderAssert.assertThat(aReader) +// .readingFrom("src/test/resources/" + aFile) +// .usingWriter(aWriter) +// .asString() +// .isEqualToNormalizingNewlines( +// contentOf(new File("src/test/resources/" + aFile), UTF_8)); testOneWay(aReader, aWriter, aFile, aFile); } @@ -171,7 +188,8 @@ public static void testOneWay(CollectionReaderDescription aReader, } public static void testOneWay(CollectionReaderDescription aReader, - AnalysisEngineDescription aWriter, String aExpectedFile, String aFile, TestOptions aOptions) + AnalysisEngineDescription aWriter, String aExpectedFile, String aFile, + TestOptions aOptions) throws Exception { Class<?> dkproReaderBase = Class.forName(RESOURCE_COLLECTION_READER_BASE); @@ -181,7 +199,8 @@ public static void testOneWay(CollectionReaderDescription aReader, } Class<?> dkproWriterBase = Class.forName(JCAS_FILE_WRITER_IMPL_BASE); - if (!dkproWriterBase.isAssignableFrom(Class.forName(aWriter.getAnnotatorImplementationName()))) { + if (!dkproWriterBase + .isAssignableFrom(Class.forName(aWriter.getAnnotatorImplementationName()))) { throw new IllegalArgumentException("writer must be a subclass of [" + JCAS_FILE_WRITER_IMPL_BASE + "]"); } @@ -234,15 +253,24 @@ public static void testOneWay2(CollectionReaderDescription aReader, setParameter(aWriter, ComponentParameters.PARAM_TARGET_LOCATION, output); } - AnalysisEngineDescription metadataStripper = createEngineDescription( - DocumentMetaDataStripper.class); + List<AnalysisEngineDescription> processors = new ArrayList<>(); + + // By default, we strip the document metadata if no options are specified + if (aOptions == null || !aOptions.keepDocumentMetadata) { + processors.add(createEngineDescription(DocumentMetaDataStripper.class)); + } - AnalysisEngineDescription validator = createEngineDescription( - Validator.class); + processors.add(createEngineDescription(Validator.class)); + + if (aOptions != null && aOptions.processor != null) { + processors.add(aOptions.processor); + } + + processors.add(aWriter); Validator.options = aOptions != null ? aOptions : new TestOptions(); - runPipeline(aReader, validator, metadataStripper, aWriter); + runPipeline(aReader, processors.toArray(new AnalysisEngineDescription[] {})); AssertAnnotations.assertValid(Validator.messages); @@ -282,5 +310,13 @@ public void process(JCas aJCas) options.skippedChecks.forEach(check -> validator.removeCheck(check)); messages = validator.analyze(aJCas); } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException + { + super.collectionProcessComplete(); + + AssertAnnotations.assertValid(Validator.messages); + } } } diff --git a/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/ReaderAssert.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/ReaderAssert.java new file mode 100644 index 0000000000..ea42ee3dbe --- /dev/null +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/ReaderAssert.java @@ -0,0 +1,299 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.testing; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.factory.ConfigurationParameterFactory.canParameterBeSet; +import static org.apache.uima.fit.factory.ConfigurationParameterFactory.getParameterSettings; +import static org.apache.uima.fit.factory.ConfigurationParameterFactory.setParameter; +import static org.dkpro.core.api.parameter.ComponentParameters.PARAM_SOURCE_LOCATION; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.uima.analysis_component.AnalysisComponent; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.SerialFormat; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.apache.uima.util.CasCreationUtils; +import org.apache.uima.util.CasIOUtils; +import org.assertj.core.api.AbstractAssert; +import org.assertj.core.api.ListAssert; +import org.assertj.core.internal.Failures; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.testing.IOTestRunner.Validator; +import org.dkpro.core.testing.validation.checks.Check; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class ReaderAssert + extends AbstractAssert<ReaderAssert, CollectionReaderDescription> +{ + private Logger LOG = LoggerFactory.getLogger(getClass()); + + private Object requestedSourceLocation; + + private AnalysisEngineDescription[] engines; + + private boolean stripDocumentMetadata = true; + private boolean validate = true; + private TestOptions validationOptions = new TestOptions(); + + public ReaderAssert(CollectionReaderDescription aReader) + { + super(aReader, ReaderAssert.class); + + isNotNull(); + } + + public static ReaderAssert assertThat(Class<? extends CollectionReader> aReaderClass, + Object... aConfigurationData) + throws ResourceInitializationException + { + return assertThat(createReaderDescription(aReaderClass, aConfigurationData)); + } + + public static ReaderAssert assertThat(CollectionReaderDescription aReader) + { + return new ReaderAssert(aReader); + } + + /** + * Configure the reader to read from the given file. + * + * @param aLocation + * a file location. + * @return the assert for chaining. + * @see #readingFrom(String) + */ + public ReaderAssert readingFrom(File aLocation) + { + return _readingFrom(aLocation); + } + + /** + * Configure the reader to read from the given location. The source location can either be + * configured using this method or by setting {@link ComponentParameters#PARAM_SOURCE_LOCATION} + * in the reader description. + * + * @param aLocation + * a location. + * @return the assert for chaining. + */ + public ReaderAssert readingFrom(String aLocation) + { + return _readingFrom(aLocation); + } + + protected ReaderAssert _readingFrom(Object aLocation) + { + isNotNull(); + + if (requestedSourceLocation != null) { + failWithMessage("Source location has already been set to [%s]", + requestedSourceLocation); + } + + requestedSourceLocation = aLocation; + + if (!canParameterBeSet(actual, PARAM_SOURCE_LOCATION)) { + failWithMessage("Parameter [%s] cannot be set on reader [%s]", + PARAM_SOURCE_LOCATION, actual.getImplementationName()); + } + + // Is the source location defined in the reader parameters? + Map<String, Object> readerParameters = getParameterSettings(actual); + if (readerParameters.containsKey(PARAM_SOURCE_LOCATION)) { + throw Failures.instance().failure(String.format( + "Source location [%s] already defined in the reader parameters.", + readerParameters.get(PARAM_SOURCE_LOCATION))); + } + + setParameter(actual, PARAM_SOURCE_LOCATION, requestedSourceLocation); + + return this; + } + + public ReaderAssert usingEngines(AnalysisEngineDescription... aEngines) + { + isNotNull(); + + engines = aEngines; + + return this; + } + + public WriterAssert usingWriter(Class<? extends AnalysisComponent> aComponentClass, + Object... aConfigurationData) + throws ResourceInitializationException + { + return usingWriter(createEngineDescription(aComponentClass, aConfigurationData)); + } + + public WriterAssert usingWriter(AnalysisEngineDescription aWriter) + { + isNotNull(); + + try { + return WriterAssert.assertThat(aWriter).consuming(toJCasIterable()); + } + catch (ResourceInitializationException e) { + AssertionError error = Failures.instance() + .failure(String.format("Error constucting reading pipeline.")); + error.initCause(e); + throw error; + } + } + + /** + * Normally fields such as {@link DocumentMetaData#getDocumentUri()} which include the full + * document path and which are not consistent between different test environments are cleared. + * If this is not desired, invoke this method. + */ + public void keepDocumentMetadata() + { + stripDocumentMetadata = false; + } + + /** + * Normally, the output of the reader is sanity-checked. If this is not desired, invoke this + * method. + */ + public void skipValidation() + { + validate = false; + } + + /** + * Skip the given checks during reader output validation. + * + * @param aCheck + * the checks to skip. + */ + public void skipChecks(Class<? extends Check> aCheck) + { + validationOptions.skipCheck(aCheck); + } + + /** + * Infers the actual source location. + * + * @return the source location. + */ + protected Object sourceLocation() + { + Map<String, Object> readerParameters = getParameterSettings(actual); + + // Was the source location set explicitly? + if (requestedSourceLocation == null) { + // Is the target location known from the reader parameters? + if (readerParameters.containsKey(PARAM_SOURCE_LOCATION)) { + return readerParameters.get(PARAM_SOURCE_LOCATION); + } + + // Can we get one from the DKPro Core test context? + if (DkproTestContext.get() == null) { + String contextOutputFolderName = "target/test-output/" + + DkproTestContext.get().getTestOutputFolderName(); + readingFrom(contextOutputFolderName); + return contextOutputFolderName; + } + + // No success? + throw Failures.instance() + .failure(String.format("Unable to determine source location. Use a @Rule " + + DkproTestContext.class.getSimpleName() + + " or set the location using `readingWith()")); + } + else { + return requestedSourceLocation; + } + } + + protected List<AnalysisEngineDescription> processors() throws ResourceInitializationException + { + List<AnalysisEngineDescription> processors = new ArrayList<>(); + + // By default we sanity-check the output of the reader. + if (validate) { + processors.add(createEngineDescription(Validator.class)); + Validator.options = validationOptions; + } + + // By default, we strip the document metadata if no options are specified + if (stripDocumentMetadata) { + processors.add(createEngineDescription(DocumentMetaDataStripper.class)); + } + + return processors; + } + + public ListAssert<JCas> asJCasList() + { + List<JCas> casses = new ArrayList<>(); + + try { + for (JCas jcas : toJCasIterable()) { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + CasIOUtils.save(jcas.getCas(), bos, SerialFormat.SERIALIZED_TSI); + CAS copy = CasCreationUtils.createCas((TypeSystemDescription) null, null, null); + CasIOUtils.load(new ByteArrayInputStream(bos.toByteArray()), copy); + casses.add(copy.getJCas()); + } + } + catch (Exception e) { + AssertionError error = Failures.instance() + .failure(String.format("Pipeline execution failed: %s", e.getMessage())); + error.initCause(e); + throw error; + } + + return new ListAssert<>(casses); + } + + public JCasIterable toJCasIterable() throws ResourceInitializationException + { + // Obtains the actual source location, also ensuring that it was actually defined. + Object actualSourceLocation = sourceLocation(); + + LOG.debug("Reading from source location: {}", actualSourceLocation); + + List<AnalysisEngineDescription> allProcessors = new ArrayList<>(); + allProcessors.addAll(processors()); + if (engines != null) { + allProcessors.addAll(Arrays.asList(engines)); + } + + return new JCasIterable(actual, + allProcessors.stream().toArray(AnalysisEngineDescription[]::new)); + } +} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TagsetDescriptionStripper.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TagsetDescriptionStripper.java similarity index 94% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TagsetDescriptionStripper.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TagsetDescriptionStripper.java index 1c2d902d92..5d05c1f9bf 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TagsetDescriptionStripper.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TagsetDescriptionStripper.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing; +package org.dkpro.core.testing; import static org.apache.uima.fit.util.JCasUtil.select; @@ -28,6 +28,9 @@ import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription; +/** + * Remove tagset description metadata. + */ public class TagsetDescriptionStripper extends JCasAnnotator_ImplBase { diff --git a/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TestOptions.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TestOptions.java new file mode 100644 index 0000000000..cc49d2e841 --- /dev/null +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TestOptions.java @@ -0,0 +1,58 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.testing; + +import java.io.File; +import java.util.HashSet; +import java.util.Set; +import java.util.function.BiConsumer; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.testing.validation.checks.Check; + +public class TestOptions +{ + Set<Class<? extends Check>> skippedChecks = new HashSet<>(); + BiConsumer<File, File> resultAssertor; + boolean keepDocumentMetadata = false; + AnalysisEngineDescription processor; + + public TestOptions skipCheck(Class<? extends Check> aCheck) + { + skippedChecks.add(aCheck); + return this; + } + + public TestOptions resultAssertor(BiConsumer<File, File> aResultComparator) + { + resultAssertor = aResultComparator; + return this; + } + + public TestOptions keepDocumentMetadata() + { + keepDocumentMetadata = true; + return this; + } + + public TestOptions processor(AnalysisEngineDescription aProcessor) + { + processor = aProcessor; + return this; + } +} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TestRunner.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TestRunner.java similarity index 79% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TestRunner.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TestRunner.java index 762ed677f1..fcdc65bc91 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/TestRunner.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/TestRunner.java @@ -15,18 +15,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing; +package org.dkpro.core.testing; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStream; + +import org.apache.commons.io.FileUtils; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.impl.XmiCasSerializer; import org.apache.uima.fit.testing.factory.TokenBuilder; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.resources.ResourceObjectProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; @@ -48,7 +55,8 @@ public class TestRunner * if an exception occurs. * @see TokenBuilder */ - public static JCas runTest(AnalysisEngineDescription aEngine, String aLanguage, String aDocument) + public static JCas runTest(AnalysisEngineDescription aEngine, String aLanguage, + String aDocument) throws UIMAException { return runTest(createEngine(aEngine), aLanguage, aDocument); @@ -80,8 +88,9 @@ public static JCas runTest(String aDocumentId, AnalysisEngineDescription aEngine } /** - * Run an analysis engine using a document. The document is automatically split into tokens based on - * spaces and into sentences based on newline. Make sure the punctuation is surrounded by spaces! + * Run an analysis engine using a document. The document is automatically split into tokens + * based on spaces and into sentences based on newline. Make sure the punctuation is surrounded + * by spaces! * * @param aEngine * an analysis engine description. @@ -101,8 +110,9 @@ public static JCas runTest(AnalysisEngine aEngine, String aLanguage, String aDoc } /** - * Run an analysis engine using a document. The document is automatically split into tokens based on - * spaces and into sentences based on newline. Make sure the punctuation is surrounded by spaces! + * Run an analysis engine using a document. The document is automatically split into tokens + * based on spaces and into sentences based on newline. Make sure the punctuation is surrounded + * by spaces! * * @param aDocumentId * a document ID. @@ -143,6 +153,22 @@ public static JCas runTest(String aDocumentId, AnalysisEngine aEngine, String aL aEngine.process(jcas); + DkproTestContext context = DkproTestContext.get(); + if (context != null) { + File folder = new File("target/test-output/" + context.getTestOutputFolderName()); + if (!folder.exists()) { + FileUtils.deleteQuietly(folder); + } + folder.mkdirs(); + + try (OutputStream docOS = new FileOutputStream(new File(folder, "output.xmi"))) { + XmiCasSerializer.serialize(jcas.getCas(), null, docOS, true, null); + } + catch (Exception e) { + throw new AnalysisEngineProcessException(e); + } + } + AssertAnnotations.assertValid(jcas); return jcas; diff --git a/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/WriterAssert.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/WriterAssert.java new file mode 100644 index 0000000000..cfd6eb80cd --- /dev/null +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/WriterAssert.java @@ -0,0 +1,456 @@ +/* + * Copyright 2019 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.testing; + +import static org.apache.commons.lang3.StringUtils.replaceOnce; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.ConfigurationParameterFactory.canParameterBeSet; +import static org.apache.uima.fit.factory.ConfigurationParameterFactory.getParameterSettings; +import static org.apache.uima.fit.factory.ConfigurationParameterFactory.setParameter; +import static org.dkpro.core.api.parameter.ComponentParameters.PARAM_TARGET_LOCATION; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.analysis_component.AnalysisComponent; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.fit.util.LifeCycleUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.assertj.core.api.AbstractAssert; +import org.assertj.core.api.FileAssert; +import org.assertj.core.api.ListAssert; +import org.assertj.core.api.StringAssert; +import org.assertj.core.internal.Failures; +import org.assertj.core.util.Files; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class WriterAssert + extends AbstractAssert<WriterAssert, AnalysisEngineDescription> +{ + private Logger LOG = LoggerFactory.getLogger(getClass()); + + public static final String VAR_TARGET = "${TARGET}"; + + // See JCasFileWriter_ImplBase + private static final String PARAM_SINGULAR_TARGET = "singularTarget"; + private static final String PARAM_STRIP_EXTENSION = "stripExtension"; + + private JCasIterable jcasIterable; + + private Object requestedTargetLocation; + private boolean singularTargetAnnounced = false; + private boolean stripExtension = true; + + public WriterAssert(AnalysisEngineDescription aWriter) + { + super(aWriter, WriterAssert.class); + + isNotNull(); + + if (!actual.isPrimitive()) { + failWithMessage("Writer cannot be an aggregate. Use `usingEngine` if you need to add " + + "additional analysis engines or secondary writers."); + } + } + + public static WriterAssert assertThat(Class<? extends AnalysisComponent> aWriterClass, + Object... aConfigurationData) + throws ResourceInitializationException + { + return assertThat(createEngineDescription(aWriterClass, aConfigurationData)); + } + + public static WriterAssert assertThat(AnalysisEngineDescription aWriter) + { + return new WriterAssert(aWriter); + } + + public WriterAssert consuming(JCasIterable aJCasIterable) + { + jcasIterable = aJCasIterable; + + return this; + } + + /** + * By default, the original extension is stripped from the original file name and the writer's + * extension is then added. By calling this method, the original extension is retained and + * in addition the writer's extension is added. + * + * @return the assert for chaining. + */ + public WriterAssert keepOriginalExtension() + { + stripExtension = false; + + return this; + } + + /** + * Configure the writer to write to the given file. + * + * @param aLocation + * a location. + * @return the assert for chaining. + * @see #writingTo(String) + */ + public WriterAssert writingTo(File aLocation) + { + return _writingTo(aLocation); + } + + /** + * Configure the writer to write to the given location. The target location can either be + * configured using this method or by setting {@link ComponentParameters#PARAM_TARGET_LOCATION} + * in the reader description. + * + * @param aLocation + * a location. + * @return the assert for chaining. + */ + public WriterAssert writingTo(String aLocation) + { + return _writingTo(aLocation); + } + + /** + * Configure the writer to write all output into a single file at the given location. The + * location is the final file name, not a folder name. The singular target flag can either be + * configured using this method or by setting {@code PARAM_SINGULAR_TARGET} to {@code true} + * in the writer description. This method can also be used to indicate that a component + * implicitly writes a singular target, even if it does not support + * {@code PARAM_SINGULAR_TARGET}. This affects e.g. how {@link #asFiles()} interprets the + * target location. + * + * @param aLocation + * a location. + * @return the assert for chaining. + * @see #writingTo(String) + */ + public WriterAssert writingToSingular(String aLocation) + { + singularTargetAnnounced = true; + + // If the parameter can be set on the component, set it - otherwise assume that the + // component implicitly creates a singular target from the target location + if (canParameterBeSet(actual, PARAM_SINGULAR_TARGET)) { + Map<String, Object> writerParameters = getParameterSettings(actual); + if (Boolean.TRUE.equals(writerParameters.get(PARAM_SINGULAR_TARGET))) { + failWithMessage("PARAM_SINGULAR_TARGET already set in the writer parameters."); + + } + setParameter(actual, PARAM_SINGULAR_TARGET, true); + } + + return _writingTo(aLocation); + } + + public WriterAssert _writingTo(Object aLocation) + { + isNotNull(); + + if (requestedTargetLocation != null) { + failWithMessage("Target location has already been set to [%s]", + requestedTargetLocation); + } + + requestedTargetLocation = aLocation; + + if (!canParameterBeSet(actual, PARAM_TARGET_LOCATION)) { + failWithMessage("Parameter [%s] cannot be set on writer [%s]", + PARAM_TARGET_LOCATION, actual.getImplementationName()); + } + + // Is the target location defined in the writer parameters? + Map<String, Object> writerParameters = getParameterSettings(actual); + if (writerParameters.containsKey(PARAM_TARGET_LOCATION)) { + throw Failures.instance().failure(String.format( + "Target location [%s] already defined in the writer parameters.", + writerParameters.get(PARAM_TARGET_LOCATION))); + } + + requestedTargetLocation = resolvePlaceholders(requestedTargetLocation); + + setParameter(actual, PARAM_TARGET_LOCATION, requestedTargetLocation); + + return this; + } + + protected static <T> T resolvePlaceholders(T aLocation) + { + if (aLocation instanceof String) { + String location = (String) aLocation; + + if (location.startsWith(VAR_TARGET)) { + if (DkproTestContext.get() == null) { + throw Failures.instance() + .failure(String.format("Cannot substitute `%s` - no %s found.", + VAR_TARGET, DkproTestContext.class.getSimpleName())); + } + + File contextOutputFolder = new File("target/test-output/" + + DkproTestContext.get().getTestOutputFolderName()); + if (contextOutputFolder.exists()) { + FileUtils.deleteQuietly(contextOutputFolder); + } + + return (T) replaceOnce(location, VAR_TARGET, contextOutputFolder.getPath()); + } + } + + return aLocation; + } + + /** + * Infers the actual target location. + * + * @return the target location. + */ + protected Object targetLocation() + { + Map<String, Object> writerParameters = getParameterSettings(actual); + + // Was the target location set explicitly? + if (requestedTargetLocation == null) { + // Is the target location known from the writer parameters? + if (writerParameters.containsKey(PARAM_TARGET_LOCATION)) { + return writerParameters.get(PARAM_TARGET_LOCATION); + } + + // Can we get one from the DKPro Core test context? + if (DkproTestContext.get() != null) { + writingTo(VAR_TARGET); + return getParameterSettings(actual).get(PARAM_TARGET_LOCATION); + } + + // No success? + throw Failures.instance() + .failure(String.format("Unable to determine target location. Use a @Rule " + + DkproTestContext.class.getSimpleName() + + " or set the location using `writingTo()")); + } + else { + return requestedTargetLocation; + } + } + + protected List<File> listTargetLocationFiles() + { + Object location = targetLocation(); + + if (location instanceof String) { + location = new File((String) location); + } + + if (location instanceof File) { + File fileLocation = (File) location; + + if (!fileLocation.exists()) { + throw Failures.instance().failure( + String.format("Target location [%s] does not exist.", fileLocation)); + } + + if (isSingularTarget()) { + return Arrays.asList(fileLocation); + } + + return Arrays.asList(fileLocation.listFiles()); + } + + throw Failures.instance().failure(String + .format("Target location [%s] cannot be interpreted as a directory.", location)); + } + + protected boolean isSingularTarget() + { + Map<String, Object> writerParameters = getParameterSettings(actual); + + if (Boolean.TRUE.equals(writerParameters.get(PARAM_SINGULAR_TARGET))) { + return true; + } + + return singularTargetAnnounced; + } + + protected void configureWriter() + { + // By default, we strip the original extension when writing to avoid extension accumulation + if (stripExtension && canParameterBeSet(actual, PARAM_STRIP_EXTENSION)) { + setParameter(actual, PARAM_STRIP_EXTENSION, true); + } + + // If the target location is specified in the writer descriptor only, replace any variable + // in it if possible + if (canParameterBeSet(actual, PARAM_TARGET_LOCATION)) { + Map<String, Object> writerParameters = getParameterSettings(actual); + if (writerParameters.containsKey(PARAM_TARGET_LOCATION)) { + Object location = writerParameters.get(PARAM_TARGET_LOCATION); + setParameter(actual, PARAM_TARGET_LOCATION, resolvePlaceholders(location)); + } + } + } + + /** + * Gets the output written to the target location as a string. This method fails if more than + * one output file was created or if no output was created. + * <p> + * This method triggers the execution of the text pipeline. + * + * @return the output written to the target location as a string. + */ + public StringAssert outputAsString() + { + return outputAsString(null); + } + + /** + * Gets the output written to the target location as a string. + * <p> + * This method triggers the execution of the text pipeline. + * + * @param aPathSuffix + * a path/filename suffix which uniquely identifies the requested output file. + * + * @return the output written to the target location as a string. + */ + public StringAssert outputAsString(String aPathSuffix) + { + run(); + + List<File> files = listTargetLocationFiles(); + + if (files.isEmpty()) { + failWithMessage("Not output found at target location [%s].", requestedTargetLocation); + } + + if (aPathSuffix != null) { + files = files.stream() + .filter(file -> file.getPath().endsWith(aPathSuffix)) + .collect(Collectors.toList()); + } + + if (files.isEmpty()) { + if (aPathSuffix != null) { + failWithMessage("Not output file ending in [%s] found at target location [%s].", + aPathSuffix, requestedTargetLocation); + } + else { + failWithMessage("Not output file found at target location [%s].", + requestedTargetLocation); + } + } + + if (files.size() > 1) { + if (aPathSuffix != null) { + failWithMessage( + "Expected single output file ending in [%s] at target location [%s] but " + + "found multiple: %s.", + aPathSuffix, requestedTargetLocation, files); + } + else { + failWithMessage( + "Expected single output file at target location [%s] but found multiple: %s.", + requestedTargetLocation, files); + } + } + + return new StringAssert(Files.contentOf(files.get(0), StandardCharsets.UTF_8)); + } + + /** + * Gets the output written to the target location as a file. This method fails if more than + * one output file was created or if no output was created. + * <p> + * This method triggers the execution of the text pipeline. + * + * @return the output written to the target location as a file. + */ + public FileAssert asFile() + { + run(); + + List<File> files = listTargetLocationFiles(); + + if (files.isEmpty()) { + failWithMessage("Not output found at target location [%s].", requestedTargetLocation); + } + + if (files.size() > 1) { + failWithMessage( + "Expected single output file at target location [%s] but found multiple: %s.", + requestedTargetLocation, files); + } + + return new FileAssert(files.get(0)); + } + /** + * Gets the files written to the target location. + * <p> + * This method triggers the execution of the text pipeline. + * + * @return the files written to the target location. + */ + public ListAssert<File> asFiles() + { + run(); + + return new ListAssert<>(listTargetLocationFiles()); + } + + protected void run() + { + configureWriter(); + + // Obtains the actual target location, also ensuring that it was actually defined. + Object actualTargetLocation = targetLocation(); + + LOG.debug("Writing to target location : {}", actualTargetLocation); + LOG.debug("- is singular target : {}", isSingularTarget()); + + AnalysisEngine writer = null; + try { + writer = createEngine(actual); + + for (JCas jcas : jcasIterable) { + writer.process(jcas); + } + + LifeCycleUtil.collectionProcessComplete(writer); + } + catch (Exception e) { + AssertionError error = Failures.instance().failure(String.format( + "Pipeline execution failed: %s", e.getMessage())); + error.initCause(e); + throw error; + } + finally { + LifeCycleUtil.destroy(writer); + } + } +} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/dumper/CasDumpWriter.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/dumper/CasDumpWriter.java similarity index 96% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/dumper/CasDumpWriter.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/dumper/CasDumpWriter.java index d4deb72ae2..baf33c62c7 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/dumper/CasDumpWriter.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/dumper/CasDumpWriter.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.dumper; +package org.dkpro.core.testing.dumper; import java.io.File; import java.io.FileOutputStream; @@ -48,15 +48,13 @@ import org.apache.uima.fit.component.CasConsumer_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; import org.springframework.util.DigestUtils; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; - /** * Dumps CAS content to a text file. This is useful when setting up test cases which contain a * reference output to which an actually produced CAS is compared. The format produced by this * component is more easily comparable than a XCAS or XMI format. - * */ public class CasDumpWriter extends CasConsumer_ImplBase @@ -108,9 +106,9 @@ public class CasDumpWriter */ public static final String PARAM_FEATURE_PATTERNS = "featurePatterns"; @ConfigurationParameter(name = PARAM_FEATURE_PATTERNS, mandatory = true, defaultValue = { - INCLUDE_PREFIX+PATTERN_ANY, EXCLUDE_PREFIX+PATTERN_DOCUMENT_URI, - EXCLUDE_PREFIX+PATTERN_COLLECTION_ID, EXCLUDE_PREFIX+PATTERN_DOCUMENT_BASE_URI, - EXCLUDE_PREFIX+PATTERN_NULL_VALUE }) + INCLUDE_PREFIX + PATTERN_ANY, EXCLUDE_PREFIX + PATTERN_DOCUMENT_URI, + EXCLUDE_PREFIX + PATTERN_COLLECTION_ID, EXCLUDE_PREFIX + PATTERN_DOCUMENT_BASE_URI, + EXCLUDE_PREFIX + PATTERN_NULL_VALUE }) private String[] featurePatterns; private InExPattern[] cookedFeaturePatterns; @@ -247,7 +245,8 @@ public int compare(AnnotationFS aO1, AnnotationFS aO2) } // Last resort: try the address. - if (aO1 instanceof FeatureStructureImpl && aO2 instanceof FeatureStructureImpl) { + if (aO1 instanceof FeatureStructureImpl + && aO2 instanceof FeatureStructureImpl) { return ((FeatureStructureImpl) aO1).getAddress() - ((FeatureStructureImpl) aO2).getAddress(); } @@ -348,7 +347,8 @@ private static InExPattern[] compilePatterns(String[] aPatterns) InExPattern[] patterns = new InExPattern[aPatterns.length]; for (int i = 0; i < aPatterns.length; i++) { if (aPatterns[i].startsWith(INCLUDE_PREFIX)) { - patterns[i] = new InExPattern(aPatterns[i].substring(INCLUDE_PREFIX.length()), true); + patterns[i] = new InExPattern(aPatterns[i].substring(INCLUDE_PREFIX.length()), + true); } else if (aPatterns[i].startsWith(EXCLUDE_PREFIX)) { patterns[i] = new InExPattern(aPatterns[i].substring(EXCLUDE_PREFIX.length()), diff --git a/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/dumper/DependencyDumper.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/dumper/DependencyDumper.java new file mode 100644 index 0000000000..4124594eae --- /dev/null +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/dumper/DependencyDumper.java @@ -0,0 +1,46 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.testing.dumper; + +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasConsumer_ImplBase; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; + +/** + * Dump dependencies to screen. + */ +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) +public class DependencyDumper + extends JCasConsumer_ImplBase +{ + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + for (Dependency dep : select(aJCas, Dependency.class)) { + System.out.format("%-10s [%s] [%s]%n", dep.getDependencyType(), + dep.getGovernor().getCoveredText(), dep.getDependent().getCoveredText()); + } + } +} diff --git a/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/harness/SegmenterHarness.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/harness/SegmenterHarness.java new file mode 100644 index 0000000000..97da6617f9 --- /dev/null +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/harness/SegmenterHarness.java @@ -0,0 +1,391 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.testing.harness; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.ArrayUtils; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.resources.ResourceObjectProviderBase; +import org.dkpro.core.api.segmentation.SegmenterBase; +import org.dkpro.core.testing.AssertAnnotations; +import org.junit.Assert; +import org.junit.internal.AssumptionViolatedException; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public final class SegmenterHarness +{ + public static final TestData[] DATA = new TestData[] { + new TestData("de.1", "de", "Herr Frank M. Meier hat einen Hund.", + new String[] { "Herr", "Frank", "M.", "Meier", "hat", "einen", + "Hund", "."}, + new String[] { "Herr Frank M. Meier hat einen Hund." }), + new TestData("de.2", "de", "Ich bin ein blöder Hund.", + new String[] { "Ich", "bin", "ein", "blöder", "Hund", "." }, + new String[] { "Ich bin ein blöder Hund." }), + new TestData("de.3", "de", "Mein Name ist Hans.", + new String[] { "Mein", "Name", "ist", "Hans", "." }, + new String[] { "Mein Name ist Hans." }), + // DKPRO-CORE-ASL-98: BreakIteratorSegmenter turns hypens to separate tokens + new TestData("de.4", "de", "ihre Negativbei- spiele immer", + new String[] { "ihre", "Negativbei-", "spiele", "immer" }, + new String[] { "ihre Negativbei- spiele immer" }), + + new TestData("en.1", "en", "Sadler, A.L. Cha-No-Yu: The Japanese Tea Ceremony.", + new String[] { "Sadler", ",", "A.L.", "Cha-No-Yu", ":", "The", + "Japanese", "Tea", "Ceremony", "."}, + new String[] { "Sadler, A.L. Cha-No-Yu: The Japanese Tea Ceremony." } ), + new TestData("en.2", "en", "I love the UIMA toolkit. 1989 is the year in which the Berlin wall fell.", + new String[] { "I", "love", "the", "UIMA", "toolkit", ".", + "1989", "is", "the", "year", "in", "which", "the", "Berlin", + "wall", "fell", "." }, + new String[] { "I love the UIMA toolkit.", + "1989 is the year in which the Berlin wall fell." }), + new TestData("en.3", "en", "I'm not a girl.", + new String[] { "I", "'m", "not", "a", "girl", "." }, + new String[] { "I'm not a girl." }), + new TestData("en.4", "en", "I am a stupid dog.", + new String[] { "I", "am", "a", "stupid", "dog", "." }, + new String[] { "I am a stupid dog." }), + new TestData("en.5", "en", "Georg \"Bullseye\" Logal is a though guy.", + new String[] { "Georg", "\"", "Bullseye", "\"", "Logal", + "is", "a", "though", "guy", "." }, + new String[] { "Georg \"Bullseye\" Logal is a though guy." }), + new TestData("en.6", "en", "This doesn't compute.", + new String[] { "This", "does", "n't", "compute", "." }, + new String[] { "This doesn't compute." }), + new TestData("en.7", "en", "based on\n 'Carnival of Souls', written by [...] and directed by [...].", + new String[] { "based", "on", "'", "Carnival", "of", "Souls", + "'", ",", "written", "by", "[", "...", "]", "and", "directed", + "by", "[", "...", "]", "." }, + new String[] { + "based on\n 'Carnival of Souls', written by [...] and directed by [...]." }), + new TestData("en.8", "en", ", , ,", + new String[] { ",", ",", "," }, + new String[] { ", , ," }), + new TestData("en.9", "en", "How to tokenize smileys? This is a good example. >^,,^< :0 3:[", + new String[] { + "How", "to", "tokenize", "smileys", "?", "This", "is", "a", "good", + "example.", ">^,,^<", ":0", "3:[" }, + new String[] { + "How to tokenize smileys?", "This is a good example.", ">^,,^< :0 3:[" }), + + // Sombody who can read arabic, please check this + // Covering the following sub-Saharan countries with vast areas very + new TestData("ar.1", "ar", "تغطي الصحراء الكبرى الدول التالية بمساحات شاسعة جدا", + new String[] { "تغطي", "الصحراء", "الكبرى", "الدول", "التالية", + "مساحات", "شاسعة", "جدا" }, + new String[] { "تغطي الصحراء الكبرى الدول التالية بمساحات شاسعة جدا" }), + + // While the stanford parser should come with a proper tokenizer + // for Chinese (because it can parse chinese text), this does not + // seem to be the right one or I am using it wrong. The associated + // test cases do not work. Maybe debugging the command below + // would help to find out how to use it. + // They use command to parse it: java -mx1g -cp "stanford-parser.jar" + // edu.stanford.nlp.parser.lexparser.LexicalizedParser -tLPP + // edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -sentences + // newline -escaper + // edu.stanford.nlp.trees.international.pennchinese.ChineseEscaper + // -outputFormat "penn,typedDependencies" -outputFormatOptions + // "removeTopBracket" xinhuaFactoredSegmenting.ser.gz sampleInput.txt. + new TestData("zh.1", "zh", "服务业成为广东经济转型升级的重要引擎。", + new String[] {"服务业", "成为", "广东", "经济", "转型", "升级", "的", + "重要", "引擎", "。"}, + new String[] {"服务业成为广东经济转型升级的重要引擎。"}), + new TestData("zh.2", "zh", "中国离世界技术品牌有多远?", + new String[] {"中国", "离", "世界", "技术", "品牌", "有", "多远", + "?" }, + new String[] { "中国离世界技术品牌有多远?" }) + }; + + private SegmenterHarness() + { + // No instances + } + + @FunctionalInterface + public static interface AssumeResourcePredicate { + void assume(String aLanguage, String aVariant) + throws AssumptionViolatedException, IOException; + } + + public static void run(AnalysisEngineDescription aAed, String... aIgnoreIds) + throws Throwable + { + run(aAed, null, aIgnoreIds); + } + + public static void run(AnalysisEngineDescription aAed, AssumeResourcePredicate aCheck, + String... aIgnoreIds) + throws Throwable + { + // No automatic downloading from repository during testing. This makes sure we fail if + // models are not properly added as test dependencies. + if (offline) { + System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); + } + offline = true; + + AnalysisEngine ae = createEngine(aAed); + JCas jCas = ae.newJCas(); + + List<String> results = new ArrayList<String>(); + + try { + for (TestData td : DATA) { + System.out.printf("== %s ==%n", td.id); + jCas.reset(); + + if (aCheck != null) { + try { + aCheck.assume(td.language, null); + } + catch (AssumptionViolatedException e) { + results.add(String.format("%s skipped", td.id)); + continue; + } + } + + jCas.setDocumentLanguage(td.language); + jCas.setDocumentText(td.text); + + boolean failed = false; + + try { + ae.process(jCas); + + AssertAnnotations.assertSentence(td.sentences, select(jCas, Sentence.class)); + AssertAnnotations.assertToken(td.tokens, select(jCas, Token.class)); + + results.add(String.format("%s OK", td.id)); + } + catch (Throwable e) { + failed = true; + if (!ArrayUtils.contains(aIgnoreIds, td.id)) { + results.add(String.format("%s FAIL", td.id)); + throw e; + } + else { + results.add(String.format("%s FAIL - Known, ignored", td.id)); + } + } + + if (!failed && ArrayUtils.contains(aIgnoreIds, td.id)) { + results.add(String.format("%s FAIL", td.id)); + Assert.fail(td.id + " passed but was expected to fail"); + } + } + } + finally { + System.out.println("=== RESULTS ==="); + for (String r : results) { + System.out.println(r); + } + } + } + + public static void testZoning(Class<? extends SegmenterBase> aSegmenter) + throws Exception + { + testZoning(aSegmenter, "en"); + } + + public static void testZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) + throws Exception + { + testLaxZoning(aSegmenter, aLanguage); + testStrictZoning(aSegmenter, aLanguage); + testOufOfBoundsZones(aSegmenter, aLanguage); + } + + public static void testLaxZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) + throws Exception + { + // No automatic downloading from repository during testing. This makes sure we fail if + // models are not properly added as test dependencies. + if (offline) { + System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); + } + offline = true; + + String[] sentences = { "A a a a .", "A a a a -", "B b b b .", "B b b b -", "C c c c .", + "C c c c -" }; + + String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "B", "b", "b", "b", + ".", "B", "b", "b", "b", "-", "C", "c", "c", "c", ".", "C", "c", "c", "c", "-" }; + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage(aLanguage); + // 1 1 2 2 3 3 4 4 5 5 6 + // 0 5 0 5 0 5 0 5 0 5 0 5 0 + // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- + jcas.setDocumentText("A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"); + // |------------------| |------------------| + new Paragraph(jcas, 0, 19).addToIndexes(); + new Paragraph(jcas, 40, 59).addToIndexes(); + + AnalysisEngine ae = createEngine(aSegmenter, + SegmenterBase.PARAM_STRICT_ZONING, false, + SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); + ae.process(jcas); + + assertToken(tokens, select(jcas, Token.class)); + assertSentence(sentences, select(jcas, Sentence.class)); + } + + public static void testOufOfBoundsZones(Class<? extends SegmenterBase> aSegmenter, + String aLanguage) + throws Exception + { + // No automatic downloading from repository during testing. This makes sure we fail if + // models are not properly added as test dependencies. + if (offline) { + System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); + } + offline = true; + + // 1 1 2 2 3 3 4 4 5 5 6 + // 0 5 0 5 0 5 0 5 0 5 0 5 0 + // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- + String text = "A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"; + // |------------------| |------------------| + + // non-strict zoning + { + String[] sentences = { "A a a a .", "A a a a -", "B b b b .", "B b b b -", "C c c c .", + "C c c c -" }; + + String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "B", "b", "b", + "b", ".", "B", "b", "b", "b", "-", "C", "c", "c", "c", ".", "C", "c", "c", "c", + "-" }; + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage(aLanguage); + jcas.setDocumentText(text); + new Paragraph(jcas, 0, 19).addToIndexes(); + new Paragraph(jcas, 40, 65).addToIndexes(); + + AnalysisEngine ae = createEngine(aSegmenter, + SegmenterBase.PARAM_STRICT_ZONING, false, + SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); + ae.process(jcas); + + assertToken(tokens, select(jcas, Token.class)); + assertSentence(sentences, select(jcas, Sentence.class)); + } + + // strict zoning + { + String[] sentences = { "A a a a .", "A a a a -", "C c c c .", "C c c c -" }; + + String[] tokens = { "A", "a", "a", "a", ".", "A", "a", "a", "a", "-", "C", "c", "c", + "c", ".", "C", "c", "c", "c", "-" }; + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage(aLanguage); + jcas.setDocumentText(text); + new Paragraph(jcas, 0, 19).addToIndexes(); + new Paragraph(jcas, 40, 65).addToIndexes(); + + AnalysisEngine ae = createEngine(aSegmenter, + SegmenterBase.PARAM_STRICT_ZONING, true, + SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); + ae.process(jcas); + + assertToken(tokens, select(jcas, Token.class)); + assertSentence(sentences, select(jcas, Sentence.class)); + } + } + + public static void testStrictZoning(Class<? extends SegmenterBase> aSegmenter, String aLanguage) + throws Exception + { + // No automatic downloading from repository during testing. This makes sure we fail if + // models are not properly added as test dependencies. + if (offline) { + System.setProperty(ResourceObjectProviderBase.PROP_REPO_OFFLINE, "true"); + } + offline = true; + + String[] sentences = { "A a a a .", "A a a a -", "C c c c .", "C c c c -" }; + + String[] tokens = { + "A", "a", "a", "a", ".", + "A", "a", "a", "a", "-", + "C", "c", "c", "c", ".", + "C", "c", "c", "c", "-" }; + + JCas jcas = JCasFactory.createJCas(); + jcas.setDocumentLanguage(aLanguage); + // 1 1 2 2 3 3 4 4 5 5 6 + // 0 5 0 5 0 5 0 5 0 5 0 5 0 + // ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- + jcas.setDocumentText("A a a a . A a a a - B b b b . B b b b - C c c c . C c c c -"); + // |------------------| |------------------| + new Paragraph(jcas, 0, 19).addToIndexes(); + new Paragraph(jcas, 40, 59).addToIndexes(); + + AnalysisEngine ae = createEngine(aSegmenter, + SegmenterBase.PARAM_STRICT_ZONING, true, + SegmenterBase.PARAM_ZONE_TYPES, Paragraph.class); + ae.process(jcas); + + assertToken(tokens, select(jcas, Token.class)); + assertSentence(sentences, select(jcas, Sentence.class)); + } + + static class TestData + { + final String id; + final String language; + final String text; + final String[] sentences; + final String[] tokens; + + public TestData(String aId, String aLanguage, String aText, String[] aTokens, + String[] aSentences) + { + id = aId; + language = aLanguage; + text = aText; + sentences = aSentences; + tokens = aTokens; + } + } + + private static boolean offline = true; + + public static void autoloadModelsOnNextTestRun() + { + offline = false; + } +} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasAnalysisUtils.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasAnalysisUtils.java similarity index 89% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasAnalysisUtils.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasAnalysisUtils.java index f33fed57e6..631ce3b2c4 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasAnalysisUtils.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasAnalysisUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation; +package org.dkpro.core.testing.validation; import java.util.Map; import java.util.Set; @@ -65,7 +65,8 @@ public static void collect(Set<FeatureStructure> aFSes, FeatureStructure aFS) aFSes.add(aFS); for (Feature f : aFS.getType().getFeatures()) { - if (!f.getRange().isPrimitive() && !CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())) { + if (!f.getRange().isPrimitive() + && !CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())) { collect(aFSes, aFS.getFeatureValue(f)); } } @@ -101,9 +102,10 @@ public static void collect(Map<FeatureStructure, FeatureStructure> aFSes, } for (Feature f : aFS.getType().getFeatures()) { - if (!f.getRange().isPrimitive() && !CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())) { - collect(aFSes, aIndexed, aFS.getFeatureValue(f), aIndexed.contains(aFS) ? aFS - : aLastIndexed); + if (!f.getRange().isPrimitive() + && !CAS.FEATURE_BASE_NAME_SOFA.equals(f.getShortName())) { + collect(aFSes, aIndexed, aFS.getFeatureValue(f), + aIndexed.contains(aFS) ? aFS : aLastIndexed); } } } @@ -133,8 +135,8 @@ public static Map<FeatureStructure, FeatureStructure> getNonIndexedFSesWithOwner LowLevelCAS llcas = aCas.getLowLevelCAS(); Set<FeatureStructure> allIndexedFS = collectIndexed(aCas); - Map<FeatureStructure, FeatureStructure> allReachableFS = new TreeMap<>( - (fs1, fs2) -> llcas.ll_getFSRef(fs1) - llcas.ll_getFSRef(fs2)); + Map<FeatureStructure, FeatureStructure> allReachableFS = new TreeMap<>((fs1, fs2) -> + llcas.ll_getFSRef(fs1) - llcas.ll_getFSRef(fs2)); FSIterator<FeatureStructure> i = aCas.getIndexRepository().getAllIndexedFS( aCas.getTypeSystem().getTopType()); @@ -142,8 +144,8 @@ public static Map<FeatureStructure, FeatureStructure> getNonIndexedFSesWithOwner i.forEachRemaining(fs -> collect(allReachableFS, allIndexedFS, fs, fs)); // Remove all that are not annotations - allReachableFS.entrySet().removeIf( - e -> !ts.subsumes(aCas.getAnnotationType(), e.getKey().getType())); + allReachableFS.entrySet() + .removeIf(e -> !ts.subsumes(aCas.getAnnotationType(), e.getKey().getType())); // Remove all that are indexed allReachableFS.entrySet().removeIf(e -> e.getKey() == e.getValue()); diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasValidator.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasValidator.java similarity index 96% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasValidator.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasValidator.java index f4199ae0dc..4cbc3143ef 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasValidator.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasValidator.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation; +package org.dkpro.core.testing.validation; import static java.util.Arrays.asList; @@ -29,10 +29,9 @@ import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.checks.Check; import org.reflections.Reflections; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.checks.Check; - public class CasValidator { private Set<Class<? extends Check>> checks = new LinkedHashSet<>(); diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasValidatorComponent.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasValidatorComponent.java similarity index 96% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasValidatorComponent.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasValidatorComponent.java index 5a4a9a8b2b..215464447f 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/CasValidatorComponent.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/CasValidatorComponent.java @@ -15,15 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation; +package org.dkpro.core.testing.validation; + +import java.util.List; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasConsumer_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; -import java.util.List; - +/** + * Validate the CAS according to DKPro Core conventions. + */ public class CasValidatorComponent extends JCasConsumer_ImplBase { diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/Message.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/Message.java similarity index 95% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/Message.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/Message.java index ff657de626..cc82eb8e33 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/Message.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/Message.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation; +package org.dkpro.core.testing.validation; public class Message { @@ -46,4 +46,4 @@ public String toString() return String.format("[%s] %s", source != null ? source.getSimpleName() : "<unknown>", message); } -} \ No newline at end of file +} diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/AllAnnotationsIndexedCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/AllAnnotationsIndexedCheck.java similarity index 82% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/AllAnnotationsIndexedCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/AllAnnotationsIndexedCheck.java index 6d1ed830df..c8c88b4a59 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/AllAnnotationsIndexedCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/AllAnnotationsIndexedCheck.java @@ -15,10 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.CasAnalysisUtils.getNonIndexedFSesWithOwner; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; +import static org.dkpro.core.testing.validation.CasAnalysisUtils.getNonIndexedFSesWithOwner; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.List; import java.util.Map; @@ -26,8 +26,7 @@ import org.apache.uima.cas.FeatureStructure; import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; +import org.dkpro.core.testing.validation.Message; public class AllAnnotationsIndexedCheck implements Check diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/BasicDependenciesFormATreeCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/BasicDependenciesFormATreeCheck.java similarity index 93% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/BasicDependenciesFormATreeCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/BasicDependenciesFormATreeCheck.java index ebcb20a1a9..49d1a15079 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/BasicDependenciesFormATreeCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/BasicDependenciesFormATreeCheck.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.Collection; import java.util.HashMap; @@ -29,13 +29,13 @@ import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class BasicDependenciesFormATreeCheck implements Check @@ -112,8 +112,8 @@ else if (!attachedDependencies.isEmpty()) { private List<Dependency> selectCoveredBasic(AnnotationFS aAnnotation) { - return selectCovered(Dependency.class, aAnnotation).stream().filter( - dep -> DependencyFlavor.BASIC.equals(dep.getFlavor()) || dep.getFlavor() == null) + return selectCovered(Dependency.class, aAnnotation).stream().filter(dep -> + DependencyFlavor.BASIC.equals(dep.getFlavor()) || dep.getFlavor() == null) .collect(Collectors.toList()); } } diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/Check.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/Check.java similarity index 85% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/Check.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/Check.java index 8ff97ae400..e0dd2e81c0 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/Check.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/Check.java @@ -15,13 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; import java.util.List; import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; +import org.dkpro.core.testing.validation.Message; public interface Check { diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/DependencyRootSelfLoopCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/DependencyRootSelfLoopCheck.java similarity index 90% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/DependencyRootSelfLoopCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/DependencyRootSelfLoopCheck.java index 9040d5f2fb..04e6fdb4eb 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/DependencyRootSelfLoopCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/DependencyRootSelfLoopCheck.java @@ -15,21 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class DependencyRootSelfLoopCheck implements Check diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/DependencyRootTypeCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/DependencyRootTypeCheck.java similarity index 91% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/DependencyRootTypeCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/DependencyRootTypeCheck.java index 9005ec4b7d..ee6357ee9e 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/DependencyRootTypeCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/DependencyRootTypeCheck.java @@ -15,22 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class DependencyRootTypeCheck implements Check diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/LemmaAttachedToTokenCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/LemmaAttachedToTokenCheck.java similarity index 83% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/LemmaAttachedToTokenCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/LemmaAttachedToTokenCheck.java index ad9a7af999..34a089715a 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/LemmaAttachedToTokenCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/LemmaAttachedToTokenCheck.java @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.List; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class LemmaAttachedToTokenCheck extends TokenAttributeAttachedToTokenCheck_ImplBase diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/MorphologicalFeaturesAttachedToTokenCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/MorphologicalFeaturesAttachedToTokenCheck.java similarity index 84% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/MorphologicalFeaturesAttachedToTokenCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/MorphologicalFeaturesAttachedToTokenCheck.java index 5dd56ce22c..592144fe04 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/MorphologicalFeaturesAttachedToTokenCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/MorphologicalFeaturesAttachedToTokenCheck.java @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.List; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class MorphologicalFeaturesAttachedToTokenCheck extends TokenAttributeAttachedToTokenCheck_ImplBase diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/NoZeroSizeTokensAndSentencesCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/NoZeroSizeTokensAndSentencesCheck.java similarity index 82% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/NoZeroSizeTokensAndSentencesCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/NoZeroSizeTokensAndSentencesCheck.java index c0dc482647..6d7ff18574 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/NoZeroSizeTokensAndSentencesCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/NoZeroSizeTokensAndSentencesCheck.java @@ -15,20 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.List; -import org.apache.log4j.Logger; -import org.apache.uima.cas.CASException; import org.apache.uima.jcas.JCas; -import static org.apache.uima.fit.util.JCasUtil.*; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class NoZeroSizeTokensAndSentencesCheck implements Check diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/ParentSetCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/ParentSetCheck.java similarity index 90% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/ParentSetCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/ParentSetCheck.java index 41193626f5..e55a99bdbb 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/ParentSetCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/ParentSetCheck.java @@ -15,10 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.Collection; import java.util.List; @@ -26,9 +26,9 @@ import org.apache.uima.fit.util.FSUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class ParentSetCheck implements Check { diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/PosAttachedToTokenCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/PosAttachedToTokenCheck.java similarity index 83% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/PosAttachedToTokenCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/PosAttachedToTokenCheck.java index eb7bfc76d9..8e60a5bfdf 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/PosAttachedToTokenCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/PosAttachedToTokenCheck.java @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.List; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class PosAttachedToTokenCheck extends TokenAttributeAttachedToTokenCheck_ImplBase diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/StemAttachedToTokenCheck.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/StemAttachedToTokenCheck.java similarity index 83% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/StemAttachedToTokenCheck.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/StemAttachedToTokenCheck.java index fc006447cd..0fe634a83d 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/StemAttachedToTokenCheck.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/StemAttachedToTokenCheck.java @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.List; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class StemAttachedToTokenCheck extends TokenAttributeAttachedToTokenCheck_ImplBase diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/TokenAttributeAttachedToTokenCheck_ImplBase.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/TokenAttributeAttachedToTokenCheck_ImplBase.java similarity index 90% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/TokenAttributeAttachedToTokenCheck_ImplBase.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/TokenAttributeAttachedToTokenCheck_ImplBase.java index 98aba3ec85..b5b4af05ff 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/TokenAttributeAttachedToTokenCheck_ImplBase.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/checks/TokenAttributeAttachedToTokenCheck_ImplBase.java @@ -15,12 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.INFO; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectAt; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; +import static org.dkpro.core.testing.validation.Message.Level.INFO; import java.util.ArrayList; import java.util.List; @@ -31,9 +31,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.testing.validation.Message; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public abstract class TokenAttributeAttachedToTokenCheck_ImplBase implements Check diff --git a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/extras/AllTokensHavePos.java b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/extras/AllTokensHavePos.java similarity index 87% rename from dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/extras/AllTokensHavePos.java rename to dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/extras/AllTokensHavePos.java index c54f19ccbe..03ea788085 100644 --- a/dkpro-core-testing-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/extras/AllTokensHavePos.java +++ b/dkpro-core-testing-asl/src/main/java/org/dkpro/core/testing/validation/extras/AllTokensHavePos.java @@ -15,19 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.extras; +package org.dkpro.core.testing.validation.extras; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import java.util.List; import java.util.stream.Collectors; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.Message; +import org.dkpro.core.testing.validation.checks.Check; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.checks.Check; public class AllTokensHavePos implements Check { diff --git a/dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/AssertAnnotationsTest.java b/dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/AssertAnnotationsTest.java similarity index 88% rename from dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/AssertAnnotationsTest.java rename to dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/AssertAnnotationsTest.java index a62b81af74..f7600ca5bd 100644 --- a/dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/AssertAnnotationsTest.java +++ b/dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/AssertAnnotationsTest.java @@ -15,9 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing; +package org.dkpro.core.testing; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.*; +import static org.dkpro.core.testing.AssertAnnotations.asCopyableString; import static org.junit.Assert.assertFalse; import java.util.Arrays; @@ -33,7 +33,7 @@ public void testAsCopyableStringLineBreak() List<String> expected = Arrays.asList(new String[] { "" }); List<String> actual = Arrays.asList(new String[] { null }); assertFalse(expected.equals(actual)); - assertFalse(asCopyableString(expected, true).equals(asCopyableString(actual, true))); + assertFalse(asCopyableString(expected, true).equals(asCopyableString(actual, true))); } @Test @@ -42,6 +42,6 @@ public void testAsCopyableStringNoLineBreak() List<String> expected = Arrays.asList(new String[] { "" }); List<String> actual = Arrays.asList(new String[] { null }); assertFalse(expected.equals(actual)); - assertFalse(asCopyableString(expected, false).equals(asCopyableString(actual, false))); + assertFalse(asCopyableString(expected, false).equals(asCopyableString(actual, false))); } } diff --git a/dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/NoZeroSizeTokenAndSentenceCheckTest.java b/dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/validation/checks/NoZeroSizeTokenAndSentenceCheckTest.java similarity index 89% rename from dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/NoZeroSizeTokenAndSentenceCheckTest.java rename to dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/validation/checks/NoZeroSizeTokenAndSentenceCheckTest.java index 2a3838be7b..801c7bb570 100644 --- a/dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/NoZeroSizeTokenAndSentenceCheckTest.java +++ b/dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/validation/checks/NoZeroSizeTokenAndSentenceCheckTest.java @@ -15,22 +15,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; -import static org.junit.Assert.assertTrue; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import java.util.List; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.CasValidator; +import org.dkpro.core.testing.validation.Message; +import org.dkpro.core.testing.validation.checks.NoZeroSizeTokensAndSentencesCheck; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.CasValidator; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class NoZeroSizeTokenAndSentenceCheckTest { diff --git a/dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/PosAttachedToTokenCheckTest.java b/dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/validation/checks/PosAttachedToTokenCheckTest.java similarity index 82% rename from dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/PosAttachedToTokenCheckTest.java rename to dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/validation/checks/PosAttachedToTokenCheckTest.java index b621fb02b4..621058923e 100644 --- a/dkpro-core-testing-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/testing/validation/checks/PosAttachedToTokenCheckTest.java +++ b/dkpro-core-testing-asl/src/test/java/org/dkpro/core/testing/validation/checks/PosAttachedToTokenCheckTest.java @@ -15,20 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.testing.validation.checks; +package org.dkpro.core.testing.validation.checks; -import static de.tudarmstadt.ukp.dkpro.core.testing.validation.Message.Level.ERROR; +import static org.dkpro.core.testing.validation.Message.Level.ERROR; import static org.junit.Assert.assertTrue; import java.util.List; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.validation.CasValidator; +import org.dkpro.core.testing.validation.Message; +import org.dkpro.core.testing.validation.checks.PosAttachedToTokenCheck; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.CasValidator; -import de.tudarmstadt.ukp.dkpro.core.testing.validation.Message; public class PosAttachedToTokenCheckTest { diff --git a/dkpro-core-textcat-asl/pom.xml b/dkpro-core-textcat-asl/pom.xml index 2634dfef51..d5087957d7 100644 --- a/dkpro-core-textcat-asl/pom.xml +++ b/dkpro-core-textcat-asl/pom.xml @@ -1,53 +1,58 @@ <!-- - Copyright 2010 - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt + Copyright 2010 + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> - <relativePath>../dkpro-core-asl</relativePath> - </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.textcat-asl</artifactId> - <packaging>jar</packaging> - <name>DKPro Core ASL - TextCat (v ${textcat.version}) (LGPL)</name> - <description>http://textcat.sourceforge.net/</description> - <dependencies> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimaj-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimafit-core</artifactId> - </dependency> - <dependency> - <groupId>org.knallgrau.utils</groupId> - <artifactId>textcat</artifactId> - <version>${textcat.version}</version> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - </dependencies> - <properties> - <textcat.version>1.0.1</textcat.version> - </properties> + <modelVersion>4.0.0</modelVersion> + <parent> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> + <relativePath>../dkpro-core-asl</relativePath> + </parent> + <artifactId>dkpro-core-textcat-asl</artifactId> + <packaging>jar</packaging> + <name>DKPro Core ASL - TextCat (v ${textcat.version}) (LGPL)</name> + <url>https://dkpro.github.io/dkpro-core/</url> + <description>http://textcat.sourceforge.net/</description> + <properties> + <textcat.version>1.0.1</textcat.version> + </properties> + <dependencies> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimaj-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-core</artifactId> + </dependency> + <dependency> + <groupId>org.knallgrau.utils</groupId> + <artifactId>textcat</artifactId> + <version>${textcat.version}</version> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + </dependencies> </project> diff --git a/dkpro-core-textcat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textcat/LanguageIdentifier.java b/dkpro-core-textcat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textcat/LanguageIdentifier.java deleted file mode 100644 index a4b398ff55..0000000000 --- a/dkpro-core-textcat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textcat/LanguageIdentifier.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textcat; - -import java.util.HashMap; -import java.util.Map; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.knallgrau.utils.textcat.TextCategorizer; - -/** - * <p>Detection based on character n-grams. Uses the <a href="http://textcat.sourceforge.net">Java - * Text Categorizing Library</a> based on a technique by Cavnar and Trenkle.</p> - * - * <p><b>References</b></p> - * <ul> - * <li>Cavnar, W. B. and J. M. Trenkle (1994). N-Gram-Based Text Categorization. - * In Proceedings of Third Annual Symposium on Document Analysis and Information Retrieval, - * Las Vegas, NV, UNLV Publications/Reprographics, pp. 161-175, 11-13 April 1994.</li></ul> - */ -@ResourceMetaData(name="TextCat Language Identifier (Character N-Gram-based)") -public class LanguageIdentifier - extends JCasAnnotator_ImplBase -{ - private static final Map<String, String> langName2ISO = new HashMap<String, String>(); - static { - langName2ISO.put("german", "de"); - langName2ISO.put("english", "en"); - langName2ISO.put("french", "fr"); - langName2ISO.put("spanish", "es"); - langName2ISO.put("italian", "it"); - langName2ISO.put("swedish", "sv"); - langName2ISO.put("polish", "pl"); - langName2ISO.put("dutch", "nl"); - langName2ISO.put("norwegian", "no"); - langName2ISO.put("finnish", "fi"); - langName2ISO.put("albanian", "sq"); - langName2ISO.put("slovakian", "sk"); - langName2ISO.put("slovenian", "sl"); - langName2ISO.put("danish", "da"); - langName2ISO.put("hungarian", "hu"); - } - - private final TextCategorizer categorizer = new TextCategorizer(); - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - String docText = aJCas.getDocumentText(); - if (docText != null) { - String result = categorizer.categorize(docText); - aJCas.setDocumentLanguage(langName2ISO.get(result)); - } - } -} diff --git a/dkpro-core-textcat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textcat/package-info.java b/dkpro-core-textcat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textcat/package-info.java deleted file mode 100644 index 55a13d5a64..0000000000 --- a/dkpro-core-textcat-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textcat/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Language guessed basedon the <a href="http://textcat.sourceforge.net/">Java Text Categorizing - * Library </a> (JTCL). - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.textcat; diff --git a/dkpro-core-textcat-asl/src/main/java/org/dkpro/core/textcat/LanguageIdentifier.java b/dkpro-core-textcat-asl/src/main/java/org/dkpro/core/textcat/LanguageIdentifier.java new file mode 100644 index 0000000000..7d29061a17 --- /dev/null +++ b/dkpro-core-textcat-asl/src/main/java/org/dkpro/core/textcat/LanguageIdentifier.java @@ -0,0 +1,80 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textcat; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.knallgrau.utils.textcat.TextCategorizer; + +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * <p>Detection based on character n-grams. Uses the <a href="http://textcat.sourceforge.net">Java + * Text Categorizing Library</a> based on a technique by Cavnar and Trenkle.</p> + * + * <p><b>References</b></p> + * <ul> + * <li>Cavnar, W. B. and J. M. Trenkle (1994). N-Gram-Based Text Categorization. + * In Proceedings of Third Annual Symposium on Document Analysis and Information Retrieval, + * Las Vegas, NV, UNLV Publications/Reprographics, pp. 161-175, 11-13 April 1994.</li></ul> + */ +@Component(OperationType.LANGUAGE_IDENTIFIER) +@ResourceMetaData(name = "TextCat Language Identifier (Character N-Gram-based)") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class LanguageIdentifier + extends JCasAnnotator_ImplBase +{ + private static final Map<String, String> langName2ISO = new HashMap<String, String>(); + static { + langName2ISO.put("german", "de"); + langName2ISO.put("english", "en"); + langName2ISO.put("french", "fr"); + langName2ISO.put("spanish", "es"); + langName2ISO.put("italian", "it"); + langName2ISO.put("swedish", "sv"); + langName2ISO.put("polish", "pl"); + langName2ISO.put("dutch", "nl"); + langName2ISO.put("norwegian", "no"); + langName2ISO.put("finnish", "fi"); + langName2ISO.put("albanian", "sq"); + langName2ISO.put("slovakian", "sk"); + langName2ISO.put("slovenian", "sl"); + langName2ISO.put("danish", "da"); + langName2ISO.put("hungarian", "hu"); + } + + private final TextCategorizer categorizer = new TextCategorizer(); + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + String docText = aJCas.getDocumentText(); + if (docText != null) { + String result = categorizer.categorize(docText); + aJCas.setDocumentLanguage(langName2ISO.get(result)); + } + } +} diff --git a/dkpro-core-textcat-asl/src/main/java/org/dkpro/core/textcat/package-info.java b/dkpro-core-textcat-asl/src/main/java/org/dkpro/core/textcat/package-info.java new file mode 100644 index 0000000000..69ab01a6fd --- /dev/null +++ b/dkpro-core-textcat-asl/src/main/java/org/dkpro/core/textcat/package-info.java @@ -0,0 +1,25 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Language guessed basedon the <a href="http://textcat.sourceforge.net/">Java Text Categorizing + * Library </a> (JTCL). + * + * @since 1.1.0 + */ +package org.dkpro.core.textcat; diff --git a/dkpro-core-textcat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textcat/LanguageIdentifierTest.java b/dkpro-core-textcat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textcat/LanguageIdentifierTest.java deleted file mode 100644 index 325b7608a6..0000000000 --- a/dkpro-core-textcat-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textcat/LanguageIdentifierTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textcat; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; -import static org.junit.Assert.assertEquals; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -public -class LanguageIdentifierTest -{ - @Test - public - void testEnglish() - throws Exception - { - AnalysisEngine ae = createEngine(LanguageIdentifier.class, createTypeSystemDescription()); - JCas aJCas = ae.newJCas(); - aJCas.setDocumentText("This is an english file."); - ae.process(aJCas); - assertEquals("en", aJCas.getDocumentLanguage()); - } - - @Test - public - void testGerman() - throws Exception - { - AnalysisEngine ae = createEngine(LanguageIdentifier.class, createTypeSystemDescription()); - JCas aJCas = ae.newJCas(); - aJCas.setDocumentText("Das ist ein deutsches Dokument."); - ae.process(aJCas); - assertEquals("de", aJCas.getDocumentLanguage()); - } -} diff --git a/dkpro-core-textcat-asl/src/test/java/org/dkpro/core/textcat/LanguageIdentifierTest.java b/dkpro-core-textcat-asl/src/test/java/org/dkpro/core/textcat/LanguageIdentifierTest.java new file mode 100644 index 0000000000..e4ec0daf4a --- /dev/null +++ b/dkpro-core-textcat-asl/src/test/java/org/dkpro/core/textcat/LanguageIdentifierTest.java @@ -0,0 +1,49 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textcat; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription; +import static org.junit.Assert.assertEquals; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +public class LanguageIdentifierTest +{ + @Test + public void testEnglish() throws Exception + { + AnalysisEngine ae = createEngine(LanguageIdentifier.class, createTypeSystemDescription()); + JCas aJCas = ae.newJCas(); + aJCas.setDocumentText("This is an english file."); + ae.process(aJCas); + assertEquals("en", aJCas.getDocumentLanguage()); + } + + @Test + public void testGerman() throws Exception + { + AnalysisEngine ae = createEngine(LanguageIdentifier.class, createTypeSystemDescription()); + JCas aJCas = ae.newJCas(); + aJCas.setDocumentText("Das ist ein deutsches Dokument."); + ae.process(aJCas); + assertEquals("de", aJCas.getDocumentLanguage()); + } +} diff --git a/dkpro-core-textnormalizer-asl/pom.xml b/dkpro-core-textnormalizer-asl/pom.xml index a5dd568f6e..cf5db188f6 100644 --- a/dkpro-core-textnormalizer-asl/pom.xml +++ b/dkpro-core-textnormalizer-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.textnormalizer-asl</artifactId> + <artifactId>dkpro-core-textnormalizer-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Text normalizer</name> + <url>https://dkpro.github.io/dkpro-core/</url> <description>Provides normalizer for text. E.g. takes a text and checks for umlauts written as "ae", "oe", or "ue" and normalizes them if they really are umlauts depending on a frequency model.</description> <dependencies> <dependency> @@ -45,56 +46,60 @@ <artifactId>commons-lang3</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.castransformation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-castransformation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.frequency-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-frequency-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.anomaly-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-anomaly-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-featurepath-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.transform-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-transform-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.frequency-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-frequency-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.web1t-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-web1t-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.tokit-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-tokit-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.jazzy-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-jazzy-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -102,18 +107,18 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.text-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-text-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> <scope>test</scope> </dependency> <dependency> @@ -140,9 +145,9 @@ <dependencyManagement> <dependencies> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-opennlp-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <type>pom</type> <scope>import</scope> </dependency> @@ -166,5 +171,31 @@ </plugin> </plugins> </pluginManagement> + <plugins> + <plugin> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-maven-plugin</artifactId> + <configuration> + <uimaDescriptorExcludes> + <!-- + The following components require model files which cannot be provided on the + OpenMinTeD platform. + --> + <exclude>**/ReplacementFileNormalizer.xml</exclude> + <exclude>**/DictionaryBasedTokenTransformer.xml</exclude> + <exclude>**/FileBasedTokenTransformer.xml</exclude> + <exclude>**/HyphenationRemover.xml</exclude> + <!-- + The following components must be configured via external resources which is not + possible on the OpenMinTeD platform. + --> + <exclude>**/CapitalizationNormalizer.xml</exclude> + <exclude>**/ExpressiveLengtheningNormalizer.xml</exclude> + <exclude>**/SharpSNormalizer.xml</exclude> + <exclude>**/UmlautNormalizer.xml</exclude> + </uimaDescriptorExcludes> + </configuration> + </plugin> + </plugins> </build> </project> \ No newline at end of file diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/factory/NormalizerFactory.java b/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/factory/NormalizerFactory.java deleted file mode 100644 index 7d426b5103..0000000000 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/factory/NormalizerFactory.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.factory; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.resource.ExternalResourceDescription; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator; -import de.tudarmstadt.ukp.dkpro.core.jazzy.JazzyChecker; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.ReplacementFileNormalizer; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.ReplacementFileNormalizer.SrcSurroundings; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.ReplacementFileNormalizer.TargetSurroundings; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.SpellingNormalizer; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency.CapitalizationNormalizer; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency.ExpressiveLengtheningNormalizer; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency.SharpSNormalizer; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -@Deprecated -public class NormalizerFactory -{ - private int view_counter = 0; - - public AnalysisEngineDescription getSpellcorrection(String aModelLocation) - throws ResourceInitializationException - { - AggregateBuilder ab = new AggregateBuilder(); - ab.add(createEngineDescription(BreakIteratorSegmenter.class), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(JazzyChecker.class, - JazzyChecker.PARAM_MODEL_LOCATION, aModelLocation), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(SpellingNormalizer.class), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(ApplyChangesAnnotator.class), - ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), - ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); - AnalysisEngineDescription aed = ab.createAggregateDescription(); - aed.setAnnotatorImplementationName("Spell"); - - return aed; - } - - public AnalysisEngineDescription getUmlautSharpSNormalization( - ExternalResourceDescription aFrequencyProvider, int aMinFrequency) - throws ResourceInitializationException - { - AggregateBuilder ab = new AggregateBuilder(); - ab.add(createEngineDescription(BreakIteratorSegmenter.class), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription( SharpSNormalizer.class, - SharpSNormalizer.FREQUENCY_PROVIDER, aFrequencyProvider, - SharpSNormalizer.PARAM_MIN_FREQUENCY_THRESHOLD, aMinFrequency), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(ApplyChangesAnnotator.class), - ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), - ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); - AnalysisEngineDescription aed = ab.createAggregateDescription(); - aed.setAnnotatorImplementationName("Umlaute"); - - return aed; - } - - public AnalysisEngineDescription getReplacementNormalization(String aModelLocation, - SrcSurroundings aSrc, TargetSurroundings aTarget) - throws ResourceInitializationException - { - AggregateBuilder ab = new AggregateBuilder(); - ab.add(createEngineDescription( - ReplacementFileNormalizer.class, - ReplacementFileNormalizer.PARAM_MODEL_LOCATION, aModelLocation, - ReplacementFileNormalizer.PARAM_SRC_SURROUNDINGS, aSrc, - ReplacementFileNormalizer.PARAM_TARGET_SURROUNDINGS, aTarget), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(ApplyChangesAnnotator.class), - ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), - ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); - AnalysisEngineDescription aed = ab.createAggregateDescription(); - // aed.setAnnotatorImplementationName(new File(filepath).getName().split("\\")[0]); - - return aed; - } - - public AnalysisEngineDescription getExpressiveLengtheningNormalization( - ExternalResourceDescription aFrequencyProvider) - throws ResourceInitializationException - { - AggregateBuilder ab = new AggregateBuilder(); - ab.add(createEngineDescription(BreakIteratorSegmenter.class), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(ExpressiveLengtheningNormalizer.class, - ExpressiveLengtheningNormalizer.FREQUENCY_PROVIDER, aFrequencyProvider), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(ApplyChangesAnnotator.class), - ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), - ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); - AnalysisEngineDescription aed = ab.createAggregateDescription(); - aed.setAnnotatorImplementationName("Lengthening"); - - return aed; - } - - public AnalysisEngineDescription getCapitalizationNormalization( - ExternalResourceDescription aFrequencyProvider) - throws ResourceInitializationException - { - AggregateBuilder ab = new AggregateBuilder(); - ab.add(createEngineDescription(BreakIteratorSegmenter.class), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(CapitalizationNormalizer.class, - CapitalizationNormalizer.FREQUENCY_PROVIDER, aFrequencyProvider), - CAS.NAME_DEFAULT_SOFA, getSourceView()); - ab.add(createEngineDescription(ApplyChangesAnnotator.class), - ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), - ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); - AnalysisEngineDescription aed = ab.createAggregateDescription(); - aed.setAnnotatorImplementationName("Capitalization"); - - return aed; - } - - protected String getSourceView() - { - return (view_counter > 0) ? "view" + view_counter : CAS.NAME_DEFAULT_SOFA; - } - - protected String getTargetView() - { - return "view" + ++view_counter; - } - - public String getOutputView() - { - return "view" + view_counter; - } -} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/Normalizer_ImplBase.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/Normalizer_ImplBase.java similarity index 91% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/Normalizer_ImplBase.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/Normalizer_ImplBase.java index 6a4c7af440..13f9b10cc9 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/Normalizer_ImplBase.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/Normalizer_ImplBase.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer; +package org.dkpro.core.textnormalizer; import java.util.ArrayList; import java.util.Collections; @@ -26,20 +26,20 @@ import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.alignment.AlignedString; +import org.dkpro.core.textnormalizer.internal.AnnotationComparator; +import org.dkpro.core.textnormalizer.util.NormalizationUtils; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.internal.AnnotationComparator; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.util.NormalizationUtils; /** * Base class for normalizers * */ @TypeCapability( - inputs={ + inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={ + outputs = { "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"}) @Deprecated public abstract class Normalizer_ImplBase diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/ReplacementFileNormalizer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/ReplacementFileNormalizer.java similarity index 90% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/ReplacementFileNormalizer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/ReplacementFileNormalizer.java index 2be15bcfdd..2feed5027d 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/ReplacementFileNormalizer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/ReplacementFileNormalizer.java @@ -15,9 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer; +package org.dkpro.core.textnormalizer; -import static de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator.OP_REPLACE; +import static org.dkpro.core.castransformation.ApplyChangesAnnotator.OP_REPLACE; import java.io.File; import java.io.IOException; @@ -37,16 +37,21 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.transform.alignment.AlignedString; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Takes a text and replaces desired expressions. This class should not work on tokens as some * expressions might span several tokens. */ -@ResourceMetaData(name="Replacement File Normalizer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Replacement File Normalizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation" }) @@ -68,10 +73,16 @@ public class ReplacementFileNormalizer @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") protected String modelEncoding; + /** + * Pattern describing valid left/right context of the source expression. + */ public static final String PARAM_SRC_SURROUNDINGS = "srcExpressionSurroundings"; @ConfigurationParameter(name = PARAM_SRC_SURROUNDINGS, mandatory = true, defaultValue = "IRRELEVANT") private SrcSurroundings srcExpressionSurroundings; + /** + * Left/right context of the replacement. + */ public static final String PARAM_TARGET_SURROUNDINGS = "targetExpressionSurroundings"; @ConfigurationParameter(name = PARAM_TARGET_SURROUNDINGS, mandatory = true, defaultValue = "NOTHING") private TargetSurroundings targetExpressionSurroundings; diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/SpellingNormalizer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/SpellingNormalizer.java similarity index 84% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/SpellingNormalizer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/SpellingNormalizer.java index 76a0b38666..57a1f9696c 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/SpellingNormalizer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/SpellingNormalizer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer; +package org.dkpro.core.textnormalizer; import static org.apache.uima.fit.util.JCasUtil.select; @@ -23,14 +23,19 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Converts annotations of the type SpellingAnomaly into a SofaChangeAnnoatation. */ -@ResourceMetaData(name="Spelling Normalizer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Spelling Normalizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly" }) diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/AnnotationByTextFilter.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/AnnotationByTextFilter.java similarity index 79% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/AnnotationByTextFilter.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/AnnotationByTextFilter.java index 15f7592cb3..11855d3284 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/AnnotationByTextFilter.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/AnnotationByTextFilter.java @@ -1,120 +1,132 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.annotations; - -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Map.Entry; -import java.util.Set; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; - -/** - * Reads a list of words from a text file (one token per line) and retains only tokens or other - * annotations that match any of these words. - */ -@ResourceMetaData(name="Annotation-By-Text Filter") -public class AnnotationByTextFilter - extends JCasAnnotator_ImplBase -{ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) - private File modelLocation; - private Set<String> words; - - /** - * If true, annotation texts are filtered case-independently. Default: true, i.e. words that - * occur in the list with different casing are not filtered out. - */ - public static final String PARAM_IGNORE_CASE = "ignoreCase"; - @ConfigurationParameter(name = PARAM_IGNORE_CASE, mandatory = true, defaultValue = "true") - private boolean ignoreCase; - - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String modelEncoding; - - /** - * Annotation type to filter. Default: - * {@link de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}. - */ - public static final String PARAM_TYPE_NAME = "typeName"; - @ConfigurationParameter(name = PARAM_TYPE_NAME, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") - private String typeName; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - - try { - readWords(); - } - catch (IOException e) { - throw new ResourceInitializationException(e); - } - }; - - private void readWords() - throws IOException - { - words = new HashSet<>(); - for (String line : FileUtils.readLines(modelLocation, modelEncoding)) { - words.add(ignoreCase ? line.trim().toLowerCase() : line.trim()); - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - Set<AnnotationFS> toRemove = new HashSet<>(); - try { - for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(aJCas.getCas(), - typeName)) { - String text = ignoreCase ? entry.getValue().toLowerCase() : entry.getValue(); - if (!words.contains(text)) { - toRemove.add(entry.getKey()); - } - } - } - catch (FeaturePathException e) { - throw new AnalysisEngineProcessException(e); - } - - for (AnnotationFS annotation : toRemove) { - aJCas.removeFsFromIndexes(annotation); - } - } - -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.textnormalizer.annotations; + +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.featurepath.FeaturePathFactory; +import org.dkpro.core.api.parameter.ComponentParameters; + +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Reads a list of words from a text file (one token per line) and retains only tokens or other + * annotations that match any of these words. + */ +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Annotation-By-Text Filter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class AnnotationByTextFilter + extends JCasAnnotator_ImplBase +{ + /** + * Location from which the model is read. This is either a local path or a classpath location. + * In the latter case, the model artifact (if any) is searched as well. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) + private File modelLocation; + private Set<String> words; + + /** + * If true, annotation texts are filtered case-independently (i.e. words that + * occur in the list with different casing are not filtered out). + */ + public static final String PARAM_IGNORE_CASE = "ignoreCase"; + @ConfigurationParameter(name = PARAM_IGNORE_CASE, mandatory = true, defaultValue = "true") + private boolean ignoreCase; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, + defaultValue = ComponentParameters.DEFAULT_ENCODING) + private String modelEncoding; + + /** + * Annotation type to filter. + */ + public static final String PARAM_TYPE_NAME = "typeName"; + @ConfigurationParameter(name = PARAM_TYPE_NAME, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") + private String typeName; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + try { + readWords(); + } + catch (IOException e) { + throw new ResourceInitializationException(e); + } + }; + + private void readWords() + throws IOException + { + words = new HashSet<>(); + for (String line : FileUtils.readLines(modelLocation, modelEncoding)) { + words.add(ignoreCase ? line.trim().toLowerCase() : line.trim()); + } + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + Set<AnnotationFS> toRemove = new HashSet<>(); + try { + for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(aJCas.getCas(), + typeName)) { + String text = ignoreCase ? entry.getValue().toLowerCase() : entry.getValue(); + if (!words.contains(text)) { + toRemove.add(entry.getKey()); + } + } + } + catch (FeaturePathException e) { + throw new AnalysisEngineProcessException(e); + } + + for (AnnotationFS annotation : toRemove) { + aJCas.removeFsFromIndexes(annotation); + } + } + +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/RegexTokenFilter.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/RegexTokenFilter.java similarity index 83% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/RegexTokenFilter.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/RegexTokenFilter.java index de9cb59285..283f798b80 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/RegexTokenFilter.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/RegexTokenFilter.java @@ -1,84 +1,93 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.annotations; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.util.LinkedList; -import java.util.List; -import java.util.regex.Pattern; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Remove every token that does or does not match a given regular expression. - */ -@ResourceMetaData(name="Regex Token Filter") -public class RegexTokenFilter - extends JCasAnnotator_ImplBase -{ - /** - * Every token that does or does not match this regular expression will be removed. - */ - public static final String PARAM_REGEX = "regex"; - @ConfigurationParameter(name = PARAM_REGEX, mandatory = true) - private String regex; - - /** - * If this parameter is set to true (default), retain only tokens that match the regex given in - * {@link #PARAM_REGEX}. If set to false, all tokens that match the given regex are removed. - */ - public static final String PARAM_MUST_MATCH = "mustMatch"; - @ConfigurationParameter(name = PARAM_MUST_MATCH, mandatory = true, defaultValue = "true") - private boolean mustMatch; - - private Pattern filterRegex; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - filterRegex = Pattern.compile(regex); - }; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - List<Token> toRemove = new LinkedList<>(); - for (Token token : select(aJCas, Token.class)) { - if (mustMatch && !filterRegex.matcher(token.getCoveredText()).matches() - || !mustMatch && filterRegex.matcher(token.getCoveredText()).matches()) { - toRemove.add(token); - } - } - for (Token token : toRemove) { - token.removeFromIndexes(); - } - } -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.textnormalizer.annotations; + +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.LinkedList; +import java.util.List; +import java.util.regex.Pattern; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Remove every token that does or does not match a given regular expression. + */ +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Regex Token Filter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) +public class RegexTokenFilter + extends JCasAnnotator_ImplBase +{ + /** + * Every token that does or does not match this regular expression will be removed. + */ + public static final String PARAM_REGEX = "regex"; + @ConfigurationParameter(name = PARAM_REGEX, mandatory = true) + private String regex; + + /** + * If this parameter is set to true (default), retain only tokens that match the regex given in + * {@link #PARAM_REGEX}. If set to false, all tokens that match the given regex are removed. + */ + public static final String PARAM_MUST_MATCH = "mustMatch"; + @ConfigurationParameter(name = PARAM_MUST_MATCH, mandatory = true, defaultValue = "true") + private boolean mustMatch; + + private Pattern filterRegex; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + filterRegex = Pattern.compile(regex); + }; + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + List<Token> toRemove = new LinkedList<>(); + for (Token token : select(aJCas, Token.class)) { + if (mustMatch && !filterRegex.matcher(token.getCoveredText()).matches() + || !mustMatch && filterRegex.matcher(token.getCoveredText()).matches()) { + toRemove.add(token); + } + } + for (Token token : toRemove) { + token.removeFromIndexes(); + } + } +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/TrailingCharacterRemover.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/TrailingCharacterRemover.java similarity index 83% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/TrailingCharacterRemover.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/TrailingCharacterRemover.java index f10060215d..20291f4e37 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/TrailingCharacterRemover.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/annotations/TrailingCharacterRemover.java @@ -1,100 +1,108 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.annotations; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Removing trailing character (sequences) from tokens, e.g. punctuation. - */ -@ResourceMetaData(name="Trailing Character Remover") -public class TrailingCharacterRemover - extends JCasAnnotator_ImplBase -{ - /** - * A regex to be trimmed from the end of tokens. - * <p> - * Default: {@code "[\\Q,-“^»*’()&/\"'©§'—«·=\\E0-9A-Z]+"} (remove punctuations, special - * characters and capital letters). - */ - public static final String PARAM_PATTERN = "pattern"; - @ConfigurationParameter(name = PARAM_PATTERN, mandatory = true, defaultValue = "[\\Q,-“^»*’()&/\"'©§'—«·=\\E0-9A-Z]+") - private String pattern; - private Pattern suffixPattern; - - /** - * All tokens that are shorter than the minimum token length after removing trailing chars are - * completely removed. By default (1), empty tokens are removed. Set to 0 or a negative value if - * no tokens should be removed. - * <p> - * Shorter tokens that do not have trailing chars removed are always retained, regardless of - * their length. - */ - public static final String PARAM_MIN_TOKEN_LENGTH = "minTokenLength"; - @ConfigurationParameter(name = PARAM_MIN_TOKEN_LENGTH, mandatory = true, defaultValue = "1") - private int minTokenLength; - - @Override - public void initialize(UimaContext context) - throws ResourceInitializationException - { - super.initialize(context); - suffixPattern = Pattern.compile(String.format(".*?(%s)$", pattern)); - }; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - List<Token> toRemove = new ArrayList<>(); - List<Token> tokens = new ArrayList<>(select(aJCas, Token.class)); - for (Token token : tokens) { - Matcher suffixMatcher = suffixPattern.matcher(token.getCoveredText()); - if (suffixMatcher.matches()) { - token.removeFromIndexes(); - token.setEnd(token.getEnd() - (suffixMatcher.end(1) - suffixMatcher.start(1))); - token.addToIndexes(); - - /* remove tokens that have become too short */ - if (minTokenLength > 0 && token.getEnd() - token.getBegin() < minTokenLength) { - toRemove.add(token); - } - } - } - - for (Token token : toRemove) { - token.removeFromIndexes(aJCas); - } - } -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.textnormalizer.annotations; + +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Removing trailing character (sequences) from tokens, e.g. punctuation. + */ +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Trailing Character Remover") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" } ) +public class TrailingCharacterRemover + extends JCasAnnotator_ImplBase +{ + /** + * A regex to be trimmed from the end of tokens. + */ + public static final String PARAM_PATTERN = "pattern"; + @ConfigurationParameter(name = PARAM_PATTERN, mandatory = true, defaultValue = "[\\Q,-“^»*’()&/\"'©§'—«·=\\E0-9A-Z]+") + private String pattern; + private Pattern suffixPattern; + + /** + * All tokens that are shorter than the minimum token length after removing trailing chars are + * completely removed. By default (1), empty tokens are removed. Set to 0 or a negative value if + * no tokens should be removed. + * <p> + * Shorter tokens that do not have trailing chars removed are always retained, regardless of + * their length. + */ + public static final String PARAM_MIN_TOKEN_LENGTH = "minTokenLength"; + @ConfigurationParameter(name = PARAM_MIN_TOKEN_LENGTH, mandatory = true, defaultValue = "1") + private int minTokenLength; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + suffixPattern = Pattern.compile(String.format(".*?(%s)$", pattern)); + }; + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + List<Token> toRemove = new ArrayList<>(); + List<Token> tokens = new ArrayList<>(select(aJCas, Token.class)); + for (Token token : tokens) { + Matcher suffixMatcher = suffixPattern.matcher(token.getCoveredText()); + if (suffixMatcher.matches()) { + token.removeFromIndexes(); + token.setEnd(token.getEnd() - (suffixMatcher.end(1) - suffixMatcher.start(1))); + token.addToIndexes(); + + /* remove tokens that have become too short */ + if (minTokenLength > 0 && token.getEnd() - token.getBegin() < minTokenLength) { + toRemove.add(token); + } + } + } + + for (Token token : toRemove) { + token.removeFromIndexes(aJCas); + } + } +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBase.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBase.java similarity index 91% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBase.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBase.java index 961864a19a..186b220f9a 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBase.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBase.java @@ -1,133 +1,132 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.casfilter; - -import java.util.List; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.AbstractCas; -import org.apache.uima.fit.component.JCasMultiplier_ImplBase; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.fit.factory.FlowControllerFactory; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.flow.FlowController; -import org.apache.uima.flow.impl.FixedFlowController; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -/** - * This class calls the {@code pass()} methods to determine whether a JCas should be filtered out or - * passed on in a pipeline. Therefore, the {@code pass()} method has to be implemented, returning - * true if a document should be passed on in the pipeline and false if it should be filtered out. - * <p> - * The filter (sub-)classes should be applied within a an {@link AggregateBuilder} applying a - * {@link FlowController} as in the - * {@link CasFilter_ImplBase#createAggregateBuilderDescription(AnalysisEngineDescription...)} - * method. - * <p> - * Note that methods such as - * {@link SimplePipeline#runPipeline(org.apache.uima.cas.CAS, org.apache.uima.analysis_engine.AnalysisEngine...)} - * and - * {@link SimplePipeline#iteratePipeline(org.apache.uima.collection.CollectionReaderDescription, AnalysisEngineDescription...)} - * do not allow direct access to the JCas' produced by a JCasMultiplier. - * - * - */ -public abstract class CasFilter_ImplBase - extends JCasMultiplier_ImplBase -{ - private JCas current = null; - - @Override - public boolean hasNext() - throws AnalysisEngineProcessException - { - return current != null; - } - - @Override - public AbstractCas next() - throws AnalysisEngineProcessException - { - JCas result = current; - current = null; - return result; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - current = pass(aJCas) ? aJCas : null; - } - - /** - * This method determines whether a document / JCas is removed or retained. If this method - * returns true, the document is retained, if it returns false, it is removed. - * - * @param aJCas - * the currently processed JCas - * @return true if the document is to be retained, false if it is to be removed - */ - protected abstract boolean pass(JCas aJCas); - - /** - * Creates a new AnalysisEngineDescription from an Aggregrator that contains all input - * AnalysisEngineDescriptions in given order. This is intended for the use of a filter like - * {@link CasFilter_ImplBase}; all subsequent analysis engines will only see the documents that - * have passed the filter. - * - * @param aEngines - * {@link AnalysisEngineDescription}s that should be aggregated. - * @return a single {@link AnalysisEngineDescription} aggregating all the input engines. - * @throws ResourceInitializationException - * if any input analysis engine cannot be initialized - */ - public static AnalysisEngineDescription createAggregateBuilderDescription( - AnalysisEngineDescription... aEngines) - throws ResourceInitializationException - { - AggregateBuilder aggregateBuilder = new AggregateBuilder(); - aggregateBuilder.setFlowControllerDescription(FlowControllerFactory - .createFlowControllerDescription(FixedFlowController.class, - FixedFlowController.PARAM_ACTION_AFTER_CAS_MULTIPLIER, "drop")); - - for (AnalysisEngineDescription aEngine : aEngines) { - aggregateBuilder.add(aEngine); - } - return aggregateBuilder.createAggregateDescription(); - } - - /** - * @see CasFilter_ImplBase#createAggregateBuilderDescription(AnalysisEngineDescription...) - * @param aEngines - * a list of {@link AnalysisEngineDescription}s - * @return a single {@link AnalysisEngineDescription} aggregating all the input engines. - * @throws ResourceInitializationException - * if any input analysis engine cannot be initialized - */ - public static AnalysisEngineDescription createAggregateBuilderDescription( - List<AnalysisEngineDescription> aEngines) - throws ResourceInitializationException - { - return createAggregateBuilderDescription(aEngines - .toArray(new AnalysisEngineDescription[aEngines.size()])); - } -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.textnormalizer.casfilter; + +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.component.JCasMultiplier_ImplBase; +import org.apache.uima.fit.factory.AggregateBuilder; +import org.apache.uima.fit.factory.FlowControllerFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.flow.FlowController; +import org.apache.uima.flow.impl.FixedFlowController; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; + +/** + * This class calls the {@code pass()} methods to determine whether a JCas should be filtered out or + * passed on in a pipeline. Therefore, the {@code pass()} method has to be implemented, returning + * true if a document should be passed on in the pipeline and false if it should be filtered out. + * <p> + * The filter (sub-)classes should be applied within a an {@link AggregateBuilder} applying a + * {@link FlowController} as in the + * {@link CasFilter_ImplBase#createAggregateBuilderDescription(AnalysisEngineDescription...)} + * method. + * <p> + * Note that methods such as + * {@link SimplePipeline#runPipeline(org.apache.uima.cas.CAS, + * org.apache.uima.analysis_engine.AnalysisEngine...)} and {@link + * SimplePipeline#iteratePipeline(org.apache.uima.collection.CollectionReaderDescription, + * AnalysisEngineDescription...)} + * do not allow direct access to the JCas' produced by a JCasMultiplier. + */ +public abstract class CasFilter_ImplBase + extends JCasMultiplier_ImplBase +{ + private JCas current = null; + + @Override + public boolean hasNext() + throws AnalysisEngineProcessException + { + return current != null; + } + + @Override + public AbstractCas next() + throws AnalysisEngineProcessException + { + JCas result = current; + current = null; + return result; + } + + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + current = pass(aJCas) ? aJCas : null; + } + + /** + * This method determines whether a document / JCas is removed or retained. If this method + * returns true, the document is retained, if it returns false, it is removed. + * + * @param aJCas + * the currently processed JCas + * @return true if the document is to be retained, false if it is to be removed + */ + protected abstract boolean pass(JCas aJCas); + + /** + * Creates a new AnalysisEngineDescription from an Aggregrator that contains all input + * AnalysisEngineDescriptions in given order. This is intended for the use of a filter like + * {@link CasFilter_ImplBase}; all subsequent analysis engines will only see the documents that + * have passed the filter. + * + * @param aEngines + * {@link AnalysisEngineDescription}s that should be aggregated. + * @return a single {@link AnalysisEngineDescription} aggregating all the input engines. + * @throws ResourceInitializationException + * if any input analysis engine cannot be initialized + */ + public static AnalysisEngineDescription createAggregateBuilderDescription( + AnalysisEngineDescription... aEngines) + throws ResourceInitializationException + { + AggregateBuilder aggregateBuilder = new AggregateBuilder(); + aggregateBuilder.setFlowControllerDescription(FlowControllerFactory + .createFlowControllerDescription(FixedFlowController.class, + FixedFlowController.PARAM_ACTION_AFTER_CAS_MULTIPLIER, "drop")); + + for (AnalysisEngineDescription aEngine : aEngines) { + aggregateBuilder.add(aEngine); + } + return aggregateBuilder.createAggregateDescription(); + } + + /** + * @see CasFilter_ImplBase#createAggregateBuilderDescription(AnalysisEngineDescription...) + * @param aEngines + * a list of {@link AnalysisEngineDescription}s + * @return a single {@link AnalysisEngineDescription} aggregating all the input engines. + * @throws ResourceInitializationException + * if any input analysis engine cannot be initialized + */ + public static AnalysisEngineDescription createAggregateBuilderDescription( + List<AnalysisEngineDescription> aEngines) + throws ResourceInitializationException + { + return createAggregateBuilderDescription(aEngines + .toArray(new AnalysisEngineDescription[aEngines.size()])); + } +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/factory/NormalizerFactory.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/factory/NormalizerFactory.java new file mode 100644 index 0000000000..bf3c4f2341 --- /dev/null +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/factory/NormalizerFactory.java @@ -0,0 +1,155 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textnormalizer.factory; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.factory.AggregateBuilder; +import org.apache.uima.resource.ExternalResourceDescription; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.castransformation.ApplyChangesAnnotator; +import org.dkpro.core.jazzy.JazzyChecker; +import org.dkpro.core.textnormalizer.ReplacementFileNormalizer; +import org.dkpro.core.textnormalizer.ReplacementFileNormalizer.SrcSurroundings; +import org.dkpro.core.textnormalizer.ReplacementFileNormalizer.TargetSurroundings; +import org.dkpro.core.textnormalizer.SpellingNormalizer; +import org.dkpro.core.textnormalizer.frequency.CapitalizationNormalizer; +import org.dkpro.core.textnormalizer.frequency.ExpressiveLengtheningNormalizer; +import org.dkpro.core.textnormalizer.frequency.SharpSNormalizer; +import org.dkpro.core.tokit.BreakIteratorSegmenter; + +@Deprecated +public class NormalizerFactory +{ + private int view_counter = 0; + + public AnalysisEngineDescription getSpellcorrection(String aModelLocation) + throws ResourceInitializationException + { + AggregateBuilder ab = new AggregateBuilder(); + ab.add(createEngineDescription(BreakIteratorSegmenter.class), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(JazzyChecker.class, + JazzyChecker.PARAM_MODEL_LOCATION, aModelLocation), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(SpellingNormalizer.class), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(ApplyChangesAnnotator.class), + ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), + ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); + AnalysisEngineDescription aed = ab.createAggregateDescription(); + aed.setAnnotatorImplementationName("Spell"); + + return aed; + } + + public AnalysisEngineDescription getUmlautSharpSNormalization( + ExternalResourceDescription aFrequencyProvider, int aMinFrequency) + throws ResourceInitializationException + { + AggregateBuilder ab = new AggregateBuilder(); + ab.add(createEngineDescription(BreakIteratorSegmenter.class), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription( SharpSNormalizer.class, + SharpSNormalizer.RES_FREQUENCY_PROVIDER, aFrequencyProvider, + SharpSNormalizer.PARAM_MIN_FREQUENCY_THRESHOLD, aMinFrequency), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(ApplyChangesAnnotator.class), + ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), + ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); + AnalysisEngineDescription aed = ab.createAggregateDescription(); + aed.setAnnotatorImplementationName("Umlaute"); + + return aed; + } + + public AnalysisEngineDescription getReplacementNormalization(String aModelLocation, + SrcSurroundings aSrc, TargetSurroundings aTarget) + throws ResourceInitializationException + { + AggregateBuilder ab = new AggregateBuilder(); + ab.add(createEngineDescription( + ReplacementFileNormalizer.class, + ReplacementFileNormalizer.PARAM_MODEL_LOCATION, aModelLocation, + ReplacementFileNormalizer.PARAM_SRC_SURROUNDINGS, aSrc, + ReplacementFileNormalizer.PARAM_TARGET_SURROUNDINGS, aTarget), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(ApplyChangesAnnotator.class), + ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), + ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); + AnalysisEngineDescription aed = ab.createAggregateDescription(); + // aed.setAnnotatorImplementationName(new File(filepath).getName().split("\\")[0]); + + return aed; + } + + public AnalysisEngineDescription getExpressiveLengtheningNormalization( + ExternalResourceDescription aFrequencyProvider) + throws ResourceInitializationException + { + AggregateBuilder ab = new AggregateBuilder(); + ab.add(createEngineDescription(BreakIteratorSegmenter.class), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(ExpressiveLengtheningNormalizer.class, + ExpressiveLengtheningNormalizer.RES_FREQUENCY_PROVIDER, aFrequencyProvider), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(ApplyChangesAnnotator.class), + ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), + ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); + AnalysisEngineDescription aed = ab.createAggregateDescription(); + aed.setAnnotatorImplementationName("Lengthening"); + + return aed; + } + + public AnalysisEngineDescription getCapitalizationNormalization( + ExternalResourceDescription aFrequencyProvider) + throws ResourceInitializationException + { + AggregateBuilder ab = new AggregateBuilder(); + ab.add(createEngineDescription(BreakIteratorSegmenter.class), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(CapitalizationNormalizer.class, + CapitalizationNormalizer.RES_FREQUENCY_PROVIDER, aFrequencyProvider), + CAS.NAME_DEFAULT_SOFA, getSourceView()); + ab.add(createEngineDescription(ApplyChangesAnnotator.class), + ApplyChangesAnnotator.VIEW_SOURCE, getSourceView(), + ApplyChangesAnnotator.VIEW_TARGET, getTargetView()); + AnalysisEngineDescription aed = ab.createAggregateDescription(); + aed.setAnnotatorImplementationName("Capitalization"); + + return aed; + } + + protected String getSourceView() + { + return (view_counter > 0) ? "view" + view_counter : CAS.NAME_DEFAULT_SOFA; + } + + protected String getTargetView() + { + return "view" + ++view_counter; + } + + public String getOutputView() + { + return "view" + view_counter; + } +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/CapitalizationNormalizer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/CapitalizationNormalizer.java similarity index 82% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/CapitalizationNormalizer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/CapitalizationNormalizer.java index e7198dca50..d5f841da79 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/CapitalizationNormalizer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/CapitalizationNormalizer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; +package org.dkpro.core.textnormalizer.frequency; import static org.apache.uima.fit.util.JCasUtil.select; @@ -29,22 +29,30 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Takes a text and replaces wrong capitalization */ -@ResourceMetaData(name="Capitalization Normalizer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Capitalization Normalizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) public class CapitalizationNormalizer extends JCasTransformerChangeBased_ImplBase { - public static final String FREQUENCY_PROVIDER = "FrequencyProvider"; - @ExternalResource(key = FREQUENCY_PROVIDER, mandatory = true) + /** + * Resource providing the frequency counts. + */ + public static final String RES_FREQUENCY_PROVIDER = "FrequencyProvider"; + @ExternalResource(key = RES_FREQUENCY_PROVIDER, mandatory = true) protected FrequencyCountProvider frequencyProvider; @Override diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizer.java similarity index 88% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizer.java index ce461b087f..8c588f57b7 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; +package org.dkpro.core.textnormalizer.frequency; import static org.apache.uima.fit.util.JCasUtil.select; @@ -31,22 +31,30 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Takes a text and shortens extra long words */ -@ResourceMetaData(name="Expressive Lengthening Normalizer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Expressive Lengthening Normalizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) public class ExpressiveLengtheningNormalizer extends JCasTransformerChangeBased_ImplBase { - public static final String FREQUENCY_PROVIDER = "FrequencyProvider"; - @ExternalResource(key = FREQUENCY_PROVIDER, mandatory = true) + /** + * Resource providing the frequency counts. + */ + public static final String RES_FREQUENCY_PROVIDER = "FrequencyProvider"; + @ExternalResource(key = RES_FREQUENCY_PROVIDER, mandatory = true) protected FrequencyCountProvider frequencyProvider; @Override diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ReplacementFrequencyNormalizer_ImplBase.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/ReplacementFrequencyNormalizer_ImplBase.java similarity index 90% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ReplacementFrequencyNormalizer_ImplBase.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/ReplacementFrequencyNormalizer_ImplBase.java index dfe4f37893..f964fbd8b7 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ReplacementFrequencyNormalizer_ImplBase.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/ReplacementFrequencyNormalizer_ImplBase.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; +package org.dkpro.core.textnormalizer.frequency; import static org.apache.uima.fit.util.JCasUtil.select; @@ -28,10 +28,10 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; /** * This base class is for all normalizers that need a frequency provider and replace based on a @@ -42,10 +42,16 @@ public abstract class ReplacementFrequencyNormalizer_ImplBase extends JCasTransformerChangeBased_ImplBase { - public static final String FREQUENCY_PROVIDER = "frequencyProvider"; - @ExternalResource(key = FREQUENCY_PROVIDER, mandatory = true) + /** + * Resource providing the frequency counts. + */ + public static final String RES_FREQUENCY_PROVIDER = "frequencyProvider"; + @ExternalResource(key = RES_FREQUENCY_PROVIDER, mandatory = true) protected FrequencyCountProvider frequencyProvider; + /** + * Minimum frequency count. + */ public static final String PARAM_MIN_FREQUENCY_THRESHOLD = "minFrequencyThreshold"; @ConfigurationParameter(name = PARAM_MIN_FREQUENCY_THRESHOLD, mandatory = true, defaultValue = "100") private int minFrequencyThreshold; @@ -111,4 +117,4 @@ public void process(JCas aInput, JCas aOutput) replace(token.getBegin(), token.getEnd(), tokenString); } } -} \ No newline at end of file +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/SharpSNormalizer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/SharpSNormalizer.java similarity index 77% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/SharpSNormalizer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/SharpSNormalizer.java index f5d3f32352..62062bf66e 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/SharpSNormalizer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/SharpSNormalizer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; +package org.dkpro.core.textnormalizer.frequency; import java.util.HashMap; import java.util.Map; @@ -23,10 +23,16 @@ import org.apache.uima.fit.descriptor.LanguageCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + /** * Takes a text and replaces sharp s */ +@Component(OperationType.NORMALIZER) @ResourceMetaData(name = "Sharp S (ß) Normalizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @LanguageCapability("de") public class SharpSNormalizer extends ReplacementFrequencyNormalizer_ImplBase @@ -41,4 +47,4 @@ public Map<String, String> getReplacementMap() return map; } -} \ No newline at end of file +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/UmlautNormalizer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/UmlautNormalizer.java similarity index 81% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/UmlautNormalizer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/UmlautNormalizer.java index 5e5274e610..83cba9e029 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/UmlautNormalizer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/frequency/UmlautNormalizer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; +package org.dkpro.core.textnormalizer.frequency; import java.util.HashMap; import java.util.Map; @@ -24,11 +24,17 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + /** * Takes a text and checks for umlauts written as "ae", "oe", or "ue" and normalizes them if they * really are umlauts depending on a frequency model. */ +@Component(OperationType.NORMALIZER) @ResourceMetaData(name = "Umlaut Normalizer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @LanguageCapability("de") @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) public class UmlautNormalizer @@ -46,4 +52,4 @@ public Map<String, String> getReplacementMap() map.put("Ue", "Ü"); return map; } -} \ No newline at end of file +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/internal/AnnotationComparator.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/internal/AnnotationComparator.java similarity index 93% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/internal/AnnotationComparator.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/internal/AnnotationComparator.java index 62f180c1ed..010a7f7002 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/internal/AnnotationComparator.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/internal/AnnotationComparator.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.internal; +package org.dkpro.core.textnormalizer.internal; import java.util.Comparator; diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformer.java similarity index 83% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformer.java index 5178fa81eb..f25de12896 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformer.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; import static org.apache.commons.io.IOUtils.readLines; import static org.apache.uima.fit.util.JCasUtil.select; @@ -34,37 +34,49 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Reads a tab-separated file containing mappings from one token to another. All tokens that match * an entry in the first column are changed to the corresponding token in the second column. */ -@ResourceMetaData(name="Dictionary-based Token Transformer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Dictionary-based Token Transformer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class DictionaryBasedTokenTransformer extends JCasTransformerChangeBased_ImplBase { + /** + * Location from which the model is read. This is either a local path or a classpath location. + * In the latter case, the model artifact (if any) is searched as well. + */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) private String modelLocation; + /** + * The character encoding used by the model. + */ public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") private String modelEncoding; /** - * Lines starting with this character (or String) are ignored. Default: '#' + * Lines starting with this character (or String) are ignored. */ public static final String PARAM_COMMENT_MARKER = "commentMarker"; @ConfigurationParameter(name = PARAM_COMMENT_MARKER, mandatory = true, defaultValue = "#") private String commentMarker; /** - * Separator for mappings file. Default: "\t" (TAB). + * Separator for mappings file. */ public static final String PARAM_SEPARATOR = "separator"; @ConfigurationParameter(name = PARAM_SEPARATOR, mandatory = true, defaultValue = "\t") diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformer.java similarity index 77% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformer.java index 5c481927a3..e23d510fd8 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformer.java @@ -16,17 +16,9 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; +import static org.apache.uima.fit.util.JCasUtil.select; import java.io.File; import java.io.IOException; @@ -34,25 +26,49 @@ import java.util.Collection; import java.util.stream.Collectors; -import static org.apache.uima.fit.util.JCasUtil.select; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Replaces all tokens that are listed in the file in {@link #PARAM_MODEL_LOCATION} by the string * specified in {@link #PARAM_REPLACEMENT}. */ -@ResourceMetaData(name="File-based Token Transformer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "File-based Token Transformer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class FileBasedTokenTransformer extends JCasTransformerChangeBased_ImplBase { + /** + * Location from which the model is read. This is either a local path or a classpath location. + * In the latter case, the model artifact (if any) is searched as well. + */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) private String modelLocation; private Collection<String> tokensToReplace; + /** + * The value by which the matching tokens should be replaced. + */ public static final String PARAM_REPLACEMENT = "replacement"; @ConfigurationParameter(name = PARAM_REPLACEMENT, mandatory = true) private String replacement; + /** + * Match tokens against the dictionary without considering case. + */ public static final String PARAM_IGNORE_CASE = "ignoreCase"; @ConfigurationParameter(name = PARAM_IGNORE_CASE, mandatory = true, defaultValue = "false") private boolean ignoreCase; diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/HyphenationRemover.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/HyphenationRemover.java similarity index 80% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/HyphenationRemover.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/HyphenationRemover.java index 938f274fb4..f0987a6520 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/HyphenationRemover.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/HyphenationRemover.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; import java.io.IOException; import java.io.InputStream; @@ -33,15 +33,20 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Simple dictionary-based hyphenation remover. */ -@ResourceMetaData(name="Hyphenation Remover") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Hyphenation Remover") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class HyphenationRemover extends JCasTransformerChangeBased_ImplBase { @@ -53,10 +58,17 @@ public class HyphenationRemover private static final Pattern HYPHEN_PATTERN = Pattern.compile( "\\b(\\p{L}+)-[\\p{Space}]+(\\p{L}+)\\b"); + /** + * Location from which the model is read. This is either a local path or a classpath location. + * In the latter case, the model artifact (if any) is searched as well. + */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) private String modelLocation; + /** + * The character encoding used by the model. + */ public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") private String modelEncoding; diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformer.java similarity index 79% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformer.java index a83d84ecde..0d3fe484ad 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformer.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; import static org.apache.uima.fit.util.JCasUtil.select; @@ -26,11 +26,15 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * A {@link JCasTransformerChangeBased_ImplBase} implementation that replaces tokens based on a @@ -39,7 +43,12 @@ * The parameters {@link #PARAM_REGEX} defines the regular expression to be searcher, * {@link #PARAM_REPLACEMENT} defines the string with which matching patterns are replaces. */ -@ResourceMetaData(name="Regex-based Token Transformer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Regex-based Token Transformer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) public class RegexBasedTokenTransformer extends JCasTransformerChangeBased_ImplBase { diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/TokenCaseTransformer.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/TokenCaseTransformer.java similarity index 87% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/TokenCaseTransformer.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/TokenCaseTransformer.java index 772a7ace49..035287cf58 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/TokenCaseTransformer.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/transformation/TokenCaseTransformer.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; import static org.apache.uima.fit.util.JCasUtil.select; @@ -26,16 +26,21 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Change tokens to follow a specific casing: all upper case, all lower case, or 'normal case': * lowercase everything but the first character of a token and the characters immediately following * a hyphen. */ -@ResourceMetaData(name="Token Case Transformer") +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Token Case Transformer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") public class TokenCaseTransformer extends JCasTransformerChangeBased_ImplBase { @@ -51,7 +56,8 @@ public static enum Case * <ul> * <li>UPPERCASE: uppercase everything.</li> * <li>LOWERCASE: lowercase everything.</li> - * <li>NORMALCASE: retain first letter in word and after hyphens, lowercase everything else.</li> + * <li>NORMALCASE: retain first letter in word and after hyphens, lowercase everything + * else.</li> * </ul> */ public static final String PARAM_CASE = "tokenCase"; diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/CreateNormalizerModel.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/CreateNormalizerModel.java similarity index 75% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/CreateNormalizerModel.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/CreateNormalizerModel.java index 5965ef9b5d..10b6e15b1d 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/CreateNormalizerModel.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/CreateNormalizerModel.java @@ -1,63 +1,63 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.util; - -import java.io.File; -import java.util.Iterator; - -import org.apache.commons.lang3.StringUtils; - -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.ConditionalFrequencyDistribution; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; -import de.tudarmstadt.ukp.dkpro.core.api.resources.DkproContext; -import de.tudarmstadt.ukp.dkpro.core.frequency.Web1TFileAccessProvider; -import de.tudarmstadt.ukp.dkpro.core.io.web1t.util.Web1TConverter; - -public class CreateNormalizerModel -{ - - public static void main(String[] args) - throws Exception - { - FrequencyDistribution<String> freqDist = new FrequencyDistribution<String>(); - - File context = DkproContext.getContext().getWorkspace("web1t"); - FrequencyCountProvider provider = new Web1TFileAccessProvider("de", - new File(context, "de"), 1, 1); - - Iterator<String> ngramIterator = provider.getNgramIterator(1); - while (ngramIterator.hasNext()) { - String unigram = ngramIterator.next(); - - if (StringUtils.containsAny(unigram, new char[] { 'ä', 'ö', 'ü', }) - || unigram.contains("ae") || unigram.contains("ae") || unigram.contains("ae")) { - freqDist.addSample(unigram, provider.getFrequency(unigram)); - } - } - - ConditionalFrequencyDistribution<Integer, String> cfd = new ConditionalFrequencyDistribution<Integer, String>(); - cfd.setFrequencyDistribution(1, freqDist); - - Web1TConverter converter = new Web1TConverter("target/model"); - converter.setMaxNgramLength(1); - converter.setSplitThreshold(10); - converter.add(cfd); - converter.createIndex(); - } -} +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textnormalizer.util; + +import java.io.File; +import java.util.Iterator; + +import org.apache.commons.lang3.StringUtils; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.api.frequency.util.ConditionalFrequencyDistribution; +import org.dkpro.core.api.frequency.util.FrequencyDistribution; +import org.dkpro.core.api.resources.DkproContext; +import org.dkpro.core.frequency.Web1TFileAccessProvider; +import org.dkpro.core.io.web1t.util.Web1TConverter; + +public class CreateNormalizerModel +{ + + public static void main(String[] args) + throws Exception + { + FrequencyDistribution<String> freqDist = new FrequencyDistribution<String>(); + + File context = DkproContext.getContext().getWorkspace("web1t"); + FrequencyCountProvider provider = new Web1TFileAccessProvider("de", + new File(context, "de"), 1, 1); + + Iterator<String> ngramIterator = provider.getNgramIterator(1); + while (ngramIterator.hasNext()) { + String unigram = ngramIterator.next(); + + if (StringUtils.containsAny(unigram, new char[] { 'ä', 'ö', 'ü', }) + || unigram.contains("ae") || unigram.contains("ae") || unigram.contains("ae")) { + freqDist.addSample(unigram, provider.getFrequency(unigram)); + } + } + + ConditionalFrequencyDistribution<Integer, String> cfd = + new ConditionalFrequencyDistribution<Integer, String>(); + cfd.setFrequencyDistribution(1, freqDist); + + Web1TConverter converter = new Web1TConverter("target/model"); + converter.setMaxNgramLength(1); + converter.setSplitThreshold(10); + converter.add(cfd); + converter.createIndex(); + } +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/JCasHolder.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/JCasHolder.java similarity index 96% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/JCasHolder.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/JCasHolder.java index 07d7101bbb..03b9b367df 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/JCasHolder.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/JCasHolder.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.util; +package org.dkpro.core.textnormalizer.util; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -52,4 +52,4 @@ public static JCas get() { return value; } -} \ No newline at end of file +} diff --git a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/NormalizationUtils.java b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/NormalizationUtils.java similarity index 83% rename from dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/NormalizationUtils.java rename to dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/NormalizationUtils.java index a4ed2b936e..00e9c8f320 100644 --- a/dkpro-core-textnormalizer-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/util/NormalizationUtils.java +++ b/dkpro-core-textnormalizer-asl/src/main/java/org/dkpro/core/textnormalizer/util/NormalizationUtils.java @@ -15,16 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.util; +package org.dkpro.core.textnormalizer.util; -import static de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator.OP_DELETE; -import static de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator.OP_INSERT; -import static de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator.OP_REPLACE; +import static org.dkpro.core.castransformation.ApplyChangesAnnotator.OP_DELETE; +import static org.dkpro.core.castransformation.ApplyChangesAnnotator.OP_INSERT; +import static org.dkpro.core.castransformation.ApplyChangesAnnotator.OP_REPLACE; import java.util.Collections; import java.util.List; -import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; +import org.dkpro.core.api.transform.alignment.AlignedString; + import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; public class NormalizationUtils { @@ -54,4 +55,4 @@ public static void applyChanges(AlignedString as, List<SofaChangeAnnotation> cha } } } -} \ No newline at end of file +} diff --git a/dkpro-core-textnormalizer-asl/src/scripts/build.xml b/dkpro-core-textnormalizer-asl/src/scripts/build.xml index 08e1506475..a1969b85e2 100644 --- a/dkpro-core-textnormalizer-asl/src/scripts/build.xml +++ b/dkpro-core-textnormalizer-asl/src/scripts/build.xml @@ -1,60 +1,60 @@ -<!-- - Copyright 2012 - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project basedir="../.." default="separate-jars"> - <property name="outputPackage" value="de/tudarmstadt/ukp/dkpro/core/umlautnormalizer/lib"/> - - <import> - <url url="https://raw.githubusercontent.com/dkpro/resource-packager/0.7.0/ant-macros.xml"/> - </import> - - <target name="local-maven"> - <property name="install-artifact-enable" value="true"/> - <antcall target="separate-jars"/> - </target> - - <target name="remote-maven"> - <property name="install-artifact-mode" value="remote"/> - <antcall target="separate-jars"/> - </target> - - <target name="separate-jars" depends="en"/> - - <target name="en"> - <mkdir dir="target/download"/> - - <get - src="http://dkpro-core-asl.googlecode.com/files/umlaut_web1t_model_20121116.zip" - dest="target/download/umlaut_web1t_model.zip" - skipexisting="true"/> - <unzip - dest="target/download/umlaut_web1t_model" - src="target/download/umlaut_web1t_model.zip"> - </unzip> - <install-stub-and-upstream-folder - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.umlautnormalizer" - folder="target/download/umlaut_web1t_model" - metaDataVersion="1" - upstreamVersion="20121116" - language="de" - tool="normalizer" - variant="default"> - </install-stub-and-upstream-folder> - <delete dir="target/model-staging"/> - </target> +<!-- + Copyright 2012 + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project basedir="../.." default="separate-jars"> + <property name="outputPackage" value="de/tudarmstadt/ukp/dkpro/core/umlautnormalizer/lib"/> + + <import> + <url url="https://raw.githubusercontent.com/dkpro/resource-packager/0.7.0/ant-macros.xml"/> + </import> + + <target name="local-maven"> + <property name="install-artifact-enable" value="true"/> + <antcall target="separate-jars"/> + </target> + + <target name="remote-maven"> + <property name="install-artifact-mode" value="remote"/> + <antcall target="separate-jars"/> + </target> + + <target name="separate-jars" depends="en"/> + + <target name="en"> + <mkdir dir="target/download"/> + + <get + src="http://dkpro-core-asl.googlecode.com/files/umlaut_web1t_model_20121116.zip" + dest="target/download/umlaut_web1t_model.zip" + skipexisting="true"/> + <unzip + dest="target/download/umlaut_web1t_model" + src="target/download/umlaut_web1t_model.zip"> + </unzip> + <install-stub-and-upstream-folder + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.umlautnormalizer" + folder="target/download/umlaut_web1t_model" + metaDataVersion="1" + upstreamVersion="20121116" + language="de" + tool="normalizer" + variant="default"> + </install-stub-and-upstream-folder> + <delete dir="target/model-staging"/> + </target> </project> \ No newline at end of file diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/ReplacementFileNormalizerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/ReplacementFileNormalizerTest.java deleted file mode 100644 index 0baceaae1d..0000000000 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/ReplacementFileNormalizerTest.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.junit.Assert.assertEquals; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.ReplacementFileNormalizer.SrcSurroundings; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.ReplacementFileNormalizer.TargetSurroundings; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -public class ReplacementFileNormalizerTest -{ - @Test - public void testReplacementNormalizer() throws Exception - { - testEmoticonReplacement(":-):-("," lächeln traurig "); - // testInternetslangReplacement("AKA hdl AKA.", "Also known as Hab' dich lieb Also known as."); - } - - public void testEmoticonReplacement(String input, String output) throws Exception - { - AnalysisEngineDescription replace = createEngineDescription( - ReplacementFileNormalizer.class, - ReplacementFileNormalizer.PARAM_MODEL_LOCATION, "src/main/resources/replaceLists/emoticons_de.txt", - ReplacementFileNormalizer.PARAM_TARGET_SURROUNDINGS, TargetSurroundings.WHITESPACE); - - AggregateBuilder ab = new AggregateBuilder(); - ab.add(replace); - ab.add(createEngineDescription(ApplyChangesAnnotator.class), - ApplyChangesAnnotator.VIEW_SOURCE, CAS.NAME_DEFAULT_SOFA, - ApplyChangesAnnotator.VIEW_TARGET, "view1"); - - AnalysisEngine engine = ab.createAggregate(); - JCas jcas = engine.newJCas(); - jcas.setDocumentText(input); - DocumentMetaData.create(jcas); - engine.process(jcas); - - JCas view0 = jcas.getView(CAS.NAME_DEFAULT_SOFA); - JCas view1 = jcas.getView("view1"); - - System.out.println(view0.getDocumentText()); - System.out.println(view1.getDocumentText()); - - assertEquals(output, view1.getDocumentText()); - } - - public void testInternetslangReplacement(String input, String output) throws Exception - { - AnalysisEngineDescription replace = createEngineDescription( - ReplacementFileNormalizer.class, - ReplacementFileNormalizer.PARAM_MODEL_LOCATION, "src/main/resources/replaceLists/internetslang.txt", - ReplacementFileNormalizer.PARAM_SRC_SURROUNDINGS, SrcSurroundings.ONLY_ALPHANIMERIC); - - AggregateBuilder ab = new AggregateBuilder(); - ab.add(createEngineDescription(BreakIteratorSegmenter.class), - CAS.NAME_DEFAULT_SOFA, CAS.NAME_DEFAULT_SOFA); - ab.add(replace); - ab.add(createEngineDescription( ApplyChangesAnnotator.class), - ApplyChangesAnnotator.VIEW_SOURCE, CAS.NAME_DEFAULT_SOFA, - ApplyChangesAnnotator.VIEW_TARGET, "view1"); - - AnalysisEngine engine = ab.createAggregate(); - JCas jcas = engine.newJCas(); - jcas.setDocumentText(input); - DocumentMetaData.create(jcas); - engine.process(jcas); - - JCas view0 = jcas.getView(CAS.NAME_DEFAULT_SOFA); - JCas view1 = jcas.getView("view1"); - - System.out.println(view0.getDocumentText()); - System.out.println(view1.getDocumentText()); - - assertEquals(output, view1.getDocumentText()); - } -} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/factory/NormalizerFactoryTest.java b/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/factory/NormalizerFactoryTest.java deleted file mode 100644 index 505bbd2f71..0000000000 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/factory/NormalizerFactoryTest.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.factory; - -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; -import static org.junit.Assert.assertEquals; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ExternalResourceDescription; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.frequency.resources.Web1TFrequencyCountResource; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.ReplacementFileNormalizer.SrcSurroundings; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.ReplacementFileNormalizer.TargetSurroundings; - -public class NormalizerFactoryTest -{ - - private ExternalResourceDescription frequencyProvider; - - @Before - public void init(){ - frequencyProvider = createExternalResourceDescription( - Web1TFrequencyCountResource.class, - Web1TFrequencyCountResource.PARAM_LANGUAGE, "de", - Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", - Web1TFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "1", - Web1TFrequencyCountResource.PARAM_INDEX_PATH, "src/test/resources/jweb1t"); - } - - @Ignore("This test will not work while the module is in transition - after we need to fix it") - @Test - public void testNormalizerFactory() throws Exception - { - test( - "GMBH +++ Gewerkschaftb +++ HDL +++ :-)", - "GmbH +++ Gewerkschaft +++ Hab' dich lieb +++ lächeln " - ); - } - - - public void test(String input, String output) throws Exception - { - NormalizerFactory nf = new NormalizerFactory(); - AnalysisEngineDescription normalizeSharpSUmlaute = nf.getUmlautSharpSNormalization(frequencyProvider,0); - AnalysisEngineDescription normalizeRepitions = nf.getExpressiveLengtheningNormalization(frequencyProvider); - AnalysisEngineDescription normalizeCapitalization = nf.getCapitalizationNormalization(frequencyProvider); - AnalysisEngineDescription normalizeInternetslang = nf.getReplacementNormalization("src/main/resources/replaceLists/internetslang.txt", SrcSurroundings.ONLY_ALPHANIMERIC, TargetSurroundings.NOTHING); - AnalysisEngineDescription normalizeSpelling = nf.getSpellcorrection("src/test/resources/dictionary/ngerman"); - AnalysisEngineDescription normalizeEmoticons = nf.getReplacementNormalization("src/main/resources/replaceLists/emoticons_de.txt", SrcSurroundings.IRRELEVANT, TargetSurroundings.WHITESPACE); - - AggregateBuilder ab = new AggregateBuilder(); - ab.add(normalizeSharpSUmlaute); - ab.add(normalizeRepitions); - ab.add(normalizeCapitalization); - ab.add(normalizeInternetslang); - ab.add(normalizeSpelling); - ab.add(normalizeEmoticons); - - AnalysisEngine engine = ab.createAggregate(); - JCas jcas = engine.newJCas(); - jcas.setDocumentText(input); - DocumentMetaData.create(jcas); - engine.process(jcas); - - JCas view0 = jcas.getView("_InitialView"); - JCas view1 = jcas.getView("view1"); - JCas view2 = jcas.getView("view2"); - JCas view3 = jcas.getView("view3"); - JCas view4 = jcas.getView("view4"); - JCas view5 = jcas.getView("view5"); - JCas view6 = jcas.getView("view6"); - - System.out.println("Original :" + view0.getDocumentText()); - System.out.println("Umlaute :" + view1.getDocumentText()); - System.out.println("Repitions :" + view2.getDocumentText()); - System.out.println("Capitalization :" + view3.getDocumentText()); - System.out.println("Internetslang :" + view4.getDocumentText()); - System.out.println("Spelling :" + view5.getDocumentText()); - System.out.println("Emoticons :" + view6.getDocumentText()); - System.out.println("Perfect :" + output); - - assertEquals(output, view6.getDocumentText()); - - - } - -} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizerTest.java deleted file mode 100644 index ce174da82b..0000000000 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizerTest.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; - -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.frequency.resources.Web1TFrequencyCountResource; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -public class ExpressiveLengtheningNormalizerTest -{ - @Test - public void test() - throws Exception - { - runTest("süüüüß", "süß"); - runTest("Kresssse", "Kresse"); - runTest("Kresseee", "Kresse"); - } - - public void runTest(String inputText, String normalizedText) throws Exception - { - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription normalizer = createEngineDescription( - ExpressiveLengtheningNormalizer.class, - ExpressiveLengtheningNormalizer.FREQUENCY_PROVIDER, createExternalResourceDescription( - Web1TFrequencyCountResource.class, - Web1TFrequencyCountResource.PARAM_LANGUAGE, "de", - Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", - Web1TFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "1", - Web1TFrequencyCountResource.PARAM_INDEX_PATH, "src/test/resources/jweb1t")); - - assertTransformedText(normalizedText, inputText, "de", segmenter, normalizer); - } -} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/JWeb1TFrequencyProviderTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/JWeb1TFrequencyProviderTest.java similarity index 84% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/JWeb1TFrequencyProviderTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/JWeb1TFrequencyProviderTest.java index 7a58936d6b..5454389d34 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/JWeb1TFrequencyProviderTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/JWeb1TFrequencyProviderTest.java @@ -15,18 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer; +package org.dkpro.core.textnormalizer; import static org.junit.Assert.assertEquals; import java.io.File; import java.io.IOException; +import org.dkpro.core.api.frequency.provider.FrequencyCountProvider; +import org.dkpro.core.frequency.Web1TFileAccessProvider; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; -import de.tudarmstadt.ukp.dkpro.core.frequency.Web1TFileAccessProvider; - public class JWeb1TFrequencyProviderTest { diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/MultiNormalizationTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/MultiNormalizationTest.java similarity index 81% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/MultiNormalizationTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/MultiNormalizationTest.java index 9c19fb3597..010aeafaa8 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/MultiNormalizationTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/MultiNormalizationTest.java @@ -15,18 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer; +package org.dkpro.core.textnormalizer; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.jazzy.JazzyChecker; +import org.dkpro.core.textnormalizer.transformation.HyphenationRemover; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.jazzy.JazzyChecker; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation.HyphenationRemover; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - public class MultiNormalizationTest { @Test @@ -40,12 +39,14 @@ public void runTest(String inputText, String normalizedText) throws Exception { - AnalysisEngineDescription hyphens = createEngineDescription(HyphenationRemover.class, + AnalysisEngineDescription hyphens = createEngineDescription( + HyphenationRemover.class, HyphenationRemover.PARAM_MODEL_LOCATION, "src/test/resources/dictionary/ngerman"); AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription spellchecker = createEngineDescription(JazzyChecker.class, + AnalysisEngineDescription spellchecker = createEngineDescription( + JazzyChecker.class, JazzyChecker.PARAM_MODEL_LOCATION, "src/test/resources/dictionary/ngerman"); AnalysisEngineDescription spelling = createEngineDescription(SpellingNormalizer.class); diff --git a/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/ReplacementFileNormalizerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/ReplacementFileNormalizerTest.java new file mode 100644 index 0000000000..f28e1eaf40 --- /dev/null +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/ReplacementFileNormalizerTest.java @@ -0,0 +1,102 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textnormalizer; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.junit.Assert.assertEquals; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.cas.CAS; +import org.apache.uima.fit.factory.AggregateBuilder; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.castransformation.ApplyChangesAnnotator; +import org.dkpro.core.textnormalizer.ReplacementFileNormalizer.SrcSurroundings; +import org.dkpro.core.textnormalizer.ReplacementFileNormalizer.TargetSurroundings; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class ReplacementFileNormalizerTest +{ + @Test + public void testReplacementNormalizer() throws Exception + { + testEmoticonReplacement(":-):-(", " lächeln traurig "); + // testInternetslangReplacement("AKA hdl AKA.", "Also known as Hab' dich lieb Also known as."); + } + + public void testEmoticonReplacement(String input, String output) throws Exception + { + AnalysisEngineDescription replace = createEngineDescription( + ReplacementFileNormalizer.class, + ReplacementFileNormalizer.PARAM_MODEL_LOCATION, "src/main/resources/replaceLists/emoticons_de.txt", + ReplacementFileNormalizer.PARAM_TARGET_SURROUNDINGS, TargetSurroundings.WHITESPACE); + + AggregateBuilder ab = new AggregateBuilder(); + ab.add(replace); + ab.add(createEngineDescription(ApplyChangesAnnotator.class), + ApplyChangesAnnotator.VIEW_SOURCE, CAS.NAME_DEFAULT_SOFA, + ApplyChangesAnnotator.VIEW_TARGET, "view1"); + + AnalysisEngine engine = ab.createAggregate(); + JCas jcas = engine.newJCas(); + jcas.setDocumentText(input); + DocumentMetaData.create(jcas); + engine.process(jcas); + + JCas view0 = jcas.getView(CAS.NAME_DEFAULT_SOFA); + JCas view1 = jcas.getView("view1"); + + System.out.println(view0.getDocumentText()); + System.out.println(view1.getDocumentText()); + + assertEquals(output, view1.getDocumentText()); + } + + public void testInternetslangReplacement(String input, String output) throws Exception + { + AnalysisEngineDescription replace = createEngineDescription( + ReplacementFileNormalizer.class, + ReplacementFileNormalizer.PARAM_MODEL_LOCATION, "src/main/resources/replaceLists/internetslang.txt", + ReplacementFileNormalizer.PARAM_SRC_SURROUNDINGS, SrcSurroundings.ONLY_ALPHANIMERIC); + + AggregateBuilder ab = new AggregateBuilder(); + ab.add(createEngineDescription(BreakIteratorSegmenter.class), + CAS.NAME_DEFAULT_SOFA, CAS.NAME_DEFAULT_SOFA); + ab.add(replace); + ab.add(createEngineDescription( ApplyChangesAnnotator.class), + ApplyChangesAnnotator.VIEW_SOURCE, CAS.NAME_DEFAULT_SOFA, + ApplyChangesAnnotator.VIEW_TARGET, "view1"); + + AnalysisEngine engine = ab.createAggregate(); + JCas jcas = engine.newJCas(); + jcas.setDocumentText(input); + DocumentMetaData.create(jcas); + engine.process(jcas); + + JCas view0 = jcas.getView(CAS.NAME_DEFAULT_SOFA); + JCas view1 = jcas.getView("view1"); + + System.out.println(view0.getDocumentText()); + System.out.println(view1.getDocumentText()); + + assertEquals(output, view1.getDocumentText()); + } +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/SpellingNormalizerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/SpellingNormalizerTest.java similarity index 85% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/SpellingNormalizerTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/SpellingNormalizerTest.java index 42a457ba9f..d5d766b550 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/SpellingNormalizerTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/SpellingNormalizerTest.java @@ -15,17 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer; +package org.dkpro.core.textnormalizer; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.jazzy.JazzyChecker; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.jazzy.JazzyChecker; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - public class SpellingNormalizerTest { @Test @@ -44,7 +43,8 @@ public void test(String inputText, String normalizedText) { AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription spellchecker = createEngineDescription(JazzyChecker.class, + AnalysisEngineDescription spellchecker = createEngineDescription( + JazzyChecker.class, JazzyChecker.PARAM_MODEL_LOCATION, "src/test/resources/dictionary/ngerman"); AnalysisEngineDescription normalizer = createEngineDescription(SpellingNormalizer.class); diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/AnnotationByTextFilterTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/AnnotationByTextFilterTest.java similarity index 88% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/AnnotationByTextFilterTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/AnnotationByTextFilterTest.java index 24f653b276..ad26801752 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/AnnotationByTextFilterTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/AnnotationByTextFilterTest.java @@ -1,62 +1,62 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.annotations; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -public class AnnotationByTextFilterTest -{ - private static final String LEXICON_FILE = "src/test/resources/sentiws_100.txt"; - - @Test - public void testSentiTokens() - throws ResourceInitializationException - { - String inputText = "Ich begegne dem Abbau mit abfälligen Gedanken ."; - String[] expectedTokens = new String[] { "Abbau", "abfälligen" }; - - CollectionReaderDescription stringReader = createReaderDescription(StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, inputText, - StringReader.PARAM_LANGUAGE, "de"); - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription wordsFilter = createEngineDescription( - AnnotationByTextFilter.class, - AnnotationByTextFilter.PARAM_MODEL_LOCATION, LEXICON_FILE); - - for (JCas jcas : SimplePipeline.iteratePipeline(stringReader, segmenter, wordsFilter)) { - AssertAnnotations.assertToken(expectedTokens, select(jcas, Token.class)); - } - } - -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.textnormalizer.annotations; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.text.StringReader; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class AnnotationByTextFilterTest +{ + private static final String LEXICON_FILE = "src/test/resources/sentiws_100.txt"; + + @Test + public void testSentiTokens() + throws ResourceInitializationException + { + String inputText = "Ich begegne dem Abbau mit abfälligen Gedanken ."; + String[] expectedTokens = new String[] { "Abbau", "abfälligen" }; + + CollectionReaderDescription stringReader = createReaderDescription(StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, inputText, + StringReader.PARAM_LANGUAGE, "de"); + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription wordsFilter = createEngineDescription( + AnnotationByTextFilter.class, + AnnotationByTextFilter.PARAM_MODEL_LOCATION, LEXICON_FILE); + + for (JCas jcas : SimplePipeline.iteratePipeline(stringReader, segmenter, wordsFilter)) { + AssertAnnotations.assertToken(expectedTokens, select(jcas, Token.class)); + } + } + +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/RegexTokenFilterTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/RegexTokenFilterTest.java similarity index 91% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/RegexTokenFilterTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/RegexTokenFilterTest.java index c27f82d69c..669636bd8c 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/RegexTokenFilterTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/RegexTokenFilterTest.java @@ -1,95 +1,95 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.annotations; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.util.JCasHolder; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -public class RegexTokenFilterTest -{ - - private static final String FILTER_REGEX = "^[A-ZÖÜÄ].{2,}"; - - @Test - public void testNoMatch() - throws Exception - { - String inputText = "Ich lebe in Braunschweig."; - String filteredText = "Ich Braunschweig"; - boolean mustMatch = true; - - CollectionReaderDescription reader = createReaderDescription(StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, inputText, StringReader.PARAM_LANGUAGE, "de"); - - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription filter = createEngineDescription(RegexTokenFilter.class, - RegexTokenFilter.PARAM_REGEX, FILTER_REGEX, - RegexTokenFilter.PARAM_MUST_MATCH, mustMatch); - - AnalysisEngineDescription holder = createEngineDescription(JCasHolder.class); - - StringBuilder outputText = new StringBuilder(); - for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter, filter, holder)) { - for (Token token : select(jcas, Token.class)) { - outputText.append(token.getCoveredText() + " "); - } - assertEquals(filteredText, outputText.toString().trim()); - } - } - - @Test - public void testMatch() - throws Exception - { - String inputText = "Ich lebe in Braunschweig."; - String filteredText = "lebe in ."; - boolean mustMatch = false; - - CollectionReaderDescription reader = createReaderDescription(StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, inputText, StringReader.PARAM_LANGUAGE, "de"); - - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - AnalysisEngineDescription filter = createEngineDescription(RegexTokenFilter.class, - RegexTokenFilter.PARAM_REGEX, FILTER_REGEX, - RegexTokenFilter.PARAM_MUST_MATCH, mustMatch); - - AnalysisEngineDescription holder = createEngineDescription(JCasHolder.class); - - StringBuilder outputText = new StringBuilder(); - for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter, filter, holder)) { - for (Token token : select(jcas, Token.class)) { - outputText.append(token.getCoveredText() + " "); - } - assertEquals(filteredText, outputText.toString().trim()); - } - } -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.textnormalizer.annotations; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.StringReader; +import org.dkpro.core.textnormalizer.util.JCasHolder; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class RegexTokenFilterTest +{ + + private static final String FILTER_REGEX = "^[A-ZÖÜÄ].{2,}"; + + @Test + public void testNoMatch() + throws Exception + { + String inputText = "Ich lebe in Braunschweig."; + String filteredText = "Ich Braunschweig"; + boolean mustMatch = true; + + CollectionReaderDescription reader = createReaderDescription(StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, inputText, StringReader.PARAM_LANGUAGE, "de"); + + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + AnalysisEngineDescription filter = createEngineDescription(RegexTokenFilter.class, + RegexTokenFilter.PARAM_REGEX, FILTER_REGEX, + RegexTokenFilter.PARAM_MUST_MATCH, mustMatch); + + AnalysisEngineDescription holder = createEngineDescription(JCasHolder.class); + + StringBuilder outputText = new StringBuilder(); + for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter, filter, holder)) { + for (Token token : select(jcas, Token.class)) { + outputText.append(token.getCoveredText() + " "); + } + assertEquals(filteredText, outputText.toString().trim()); + } + } + + @Test + public void testMatch() + throws Exception + { + String inputText = "Ich lebe in Braunschweig."; + String filteredText = "lebe in ."; + boolean mustMatch = false; + + CollectionReaderDescription reader = createReaderDescription(StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, inputText, StringReader.PARAM_LANGUAGE, "de"); + + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + AnalysisEngineDescription filter = createEngineDescription(RegexTokenFilter.class, + RegexTokenFilter.PARAM_REGEX, FILTER_REGEX, + RegexTokenFilter.PARAM_MUST_MATCH, mustMatch); + + AnalysisEngineDescription holder = createEngineDescription(JCasHolder.class); + + StringBuilder outputText = new StringBuilder(); + for (JCas jcas : SimplePipeline.iteratePipeline(reader, segmenter, filter, holder)) { + for (Token token : select(jcas, Token.class)) { + outputText.append(token.getCoveredText() + " "); + } + assertEquals(filteredText, outputText.toString().trim()); + } + } +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/TrailingCharacterRemoverTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/TrailingCharacterRemoverTest.java similarity index 88% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/TrailingCharacterRemoverTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/TrailingCharacterRemoverTest.java index 5bcdb1eb0e..a2dd60abba 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/annotations/TrailingCharacterRemoverTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/annotations/TrailingCharacterRemoverTest.java @@ -1,68 +1,68 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.annotations; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public class TrailingCharacterRemoverTest -{ - @Test - public void basicTest() - throws UIMAException - { - String inputText = "Ein Text)( mitA Fehlern( . und9-9 a komischen Tokens."; - String[] tokensExpected = { "Ein", "Text", "mit", "Fehlern", ".", "und", "a", "komischen", - "Tokens." }; - - AnalysisEngineDescription engine = createEngineDescription(TrailingCharacterRemover.class); - JCas jcas = TestRunner.runTest(engine, "de", inputText); - - AssertAnnotations.assertToken(tokensExpected, select(jcas, Token.class)); - } - - @Test - public void minTokenLengthTest() - throws UIMAException - { - int minimumTokenLength = 3; - String inputText = "Ein T-- mit komischen) To. a"; - String[] tokensExpected = { "Ein", "mit", "komischen", "To.", "a" }; - - AnalysisEngineDescription engine = createEngineDescription(TrailingCharacterRemover.class, - TrailingCharacterRemover.PARAM_MIN_TOKEN_LENGTH, minimumTokenLength); - JCas jcas = TestRunner.runTest(engine, "de", inputText); - - AssertAnnotations.assertToken(tokensExpected, select(jcas, Token.class)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.textnormalizer.annotations; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class TrailingCharacterRemoverTest +{ + @Test + public void basicTest() + throws UIMAException + { + String inputText = "Ein Text)( mitA Fehlern( . und9-9 a komischen Tokens."; + String[] tokensExpected = { "Ein", "Text", "mit", "Fehlern", ".", "und", "a", "komischen", + "Tokens." }; + + AnalysisEngineDescription engine = createEngineDescription(TrailingCharacterRemover.class); + JCas jcas = TestRunner.runTest(engine, "de", inputText); + + AssertAnnotations.assertToken(tokensExpected, select(jcas, Token.class)); + } + + @Test + public void minTokenLengthTest() + throws UIMAException + { + int minimumTokenLength = 3; + String inputText = "Ein T-- mit komischen) To. a"; + String[] tokensExpected = { "Ein", "mit", "komischen", "To.", "a" }; + + AnalysisEngineDescription engine = createEngineDescription(TrailingCharacterRemover.class, + TrailingCharacterRemover.PARAM_MIN_TOKEN_LENGTH, minimumTokenLength); + JCas jcas = TestRunner.runTest(engine, "de", inputText); + + AssertAnnotations.assertToken(tokensExpected, select(jcas, Token.class)); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBaseTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBaseTest.java similarity index 95% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBaseTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBaseTest.java index 4d50b31c9b..d785bf7dab 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBaseTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/casfilter/CasFilter_ImplBaseTest.java @@ -1,250 +1,250 @@ -/* - * Copyright 2014 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.casfilter; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Set; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.pipeline.SimplePipeline; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.dumper.CasDumpWriter; - -public class CasFilter_ImplBaseTest -{ - @Test - public void testAnnotationFilterPass() - throws UIMAException, IOException - { - File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); - - String input = "test"; - String expectedFirstLine = "======== CAS 0 begin =================================="; - - CollectionReaderDescription reader = createReaderDescription( - StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, input, - StringReader.PARAM_LANGUAGE, "en"); - AnalysisEngineDescription filter = createEngineDescription(AnnotationBasedFilter.class); - AnalysisEngineDescription annotator = createEngineDescription(TestAnnotator.class); - AnalysisEngineDescription writer = createEngineDescription( - CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); - AnalysisEngineDescription aggregator = CasFilter_ImplBase - .createAggregateBuilderDescription(filter, writer); - - SimplePipeline.runPipeline(reader, annotator, aggregator); - - List<String> output = FileUtils.readLines(tmpFile); - assertEquals(expectedFirstLine, output.get(0)); - assertEquals(input, output.get(13)); - assertEquals("Sentence", output.get(15)); - } - - @Test - public void testAnnotationFilterRemove() - throws UIMAException, IOException - { - File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); - - String input = ""; - - CollectionReaderDescription reader = createReaderDescription(StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, input, - StringReader.PARAM_LANGUAGE, "en"); - AnalysisEngineDescription filter = createEngineDescription(AnnotationBasedFilter.class); - AnalysisEngineDescription annotator = createEngineDescription(TestAnnotator.class); - AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); - AnalysisEngineDescription aggregator = CasFilter_ImplBase - .createAggregateBuilderDescription(filter, writer); - - SimplePipeline.runPipeline(reader, annotator, aggregator); - assertTrue(FileUtils.readFileToString(tmpFile).isEmpty()); - } - - @Test - public void testEmptyDocumentFilterRemove() - throws UIMAException, IOException - { - File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); - - String input = ""; - - CollectionReaderDescription reader = createReaderDescription(StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, input, - StringReader.PARAM_LANGUAGE, "en"); - AnalysisEngineDescription filter = createEngineDescription(EmptyDocumentFilter.class); - AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); - AnalysisEngineDescription aggregator = CasFilter_ImplBase - .createAggregateBuilderDescription(filter, writer); - - SimplePipeline.runPipeline(reader, aggregator); - assertTrue(FileUtils.readFileToString(tmpFile).isEmpty()); - } - - @Test - public void testEmptyDocumentFilterPass() - throws UIMAException, IOException - { - File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); - - String input = "test"; - - CollectionReaderDescription reader = createReaderDescription(StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, input, - StringReader.PARAM_LANGUAGE, "en"); - AnalysisEngineDescription filter = createEngineDescription(EmptyDocumentFilter.class); - AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); - AnalysisEngineDescription aggregator = CasFilter_ImplBase - .createAggregateBuilderDescription(filter, writer); - - SimplePipeline.runPipeline(reader, aggregator); - assertFalse(FileUtils.readFileToString(tmpFile).isEmpty()); - } - - @Test - public void testLanguageFilterPass() - throws UIMAException, IOException - { - File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); - - String input = "test"; - - CollectionReaderDescription reader = createReaderDescription(StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, input, - StringReader.PARAM_LANGUAGE, "en"); - AnalysisEngineDescription filter = createEngineDescription(LanguageFilter.class, - LanguageFilter.PARAM_REQUIRED_LANGUAGES, new String[] { "de", "en" }); - AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); - AnalysisEngineDescription aggregator = CasFilter_ImplBase - .createAggregateBuilderDescription(filter, writer); - - SimplePipeline.runPipeline(reader, aggregator); - assertFalse(FileUtils.readFileToString(tmpFile).isEmpty()); - } - - @Test - public void testLanguageFilterRemove() - throws UIMAException, IOException - { - File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); - - String input = "test"; - - CollectionReaderDescription reader = createReaderDescription(StringReader.class, - StringReader.PARAM_DOCUMENT_TEXT, input, - StringReader.PARAM_LANGUAGE, "ch"); - AnalysisEngineDescription filter = createEngineDescription(LanguageFilter.class, - LanguageFilter.PARAM_REQUIRED_LANGUAGES, new String[] { "de", "en" }); - AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, - CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); - AnalysisEngineDescription aggregator = CasFilter_ImplBase - .createAggregateBuilderDescription(filter, writer); - - SimplePipeline.runPipeline(reader, aggregator); - assertTrue(FileUtils.readFileToString(tmpFile).isEmpty()); - } - - public static class TestAnnotator - extends JCasAnnotator_ImplBase - { - /** - * Create one sentence over the full text unless the text is empty. - * - * @param aJCas - * @throws AnalysisEngineProcessException - */ - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - String text = aJCas.getDocumentText(); - if (text.length() > 0) { - Sentence sentence = new Sentence(aJCas); - sentence.setBegin(0); - sentence.setEnd(text.length()); - sentence.addToIndexes(aJCas); - } - } - } - - public static class AnnotationBasedFilter - extends CasFilter_ImplBase - { - /** - * filter out documents that do not contain any sentence annotation. - */ - @Override - protected boolean pass(JCas aJCas) - { - return select(aJCas, Sentence.class).size() > 0; - } - } - - public static class EmptyDocumentFilter - extends CasFilter_ImplBase - { - @Override - protected boolean pass(JCas aJCas) - { - return aJCas.getDocumentText().length() > 0; - } - } - - public static class LanguageFilter - extends CasFilter_ImplBase - { - public static final String PARAM_REQUIRED_LANGUAGES = "requiredLanguages"; - @ConfigurationParameter(name = PARAM_REQUIRED_LANGUAGES, mandatory = true) - Set<String> requiredLanguages; - - @Override - protected boolean pass(JCas aJCas) - { - return requiredLanguages.contains(aJCas.getDocumentLanguage()); - } - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} +/* + * Copyright 2014 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textnormalizer.casfilter; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.io.text.StringReader; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.dumper.CasDumpWriter; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; + +public class CasFilter_ImplBaseTest +{ + @Test + public void testAnnotationFilterPass() + throws UIMAException, IOException + { + File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); + + String input = "test"; + String expectedFirstLine = "======== CAS 0 begin =================================="; + + CollectionReaderDescription reader = createReaderDescription( + StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, input, + StringReader.PARAM_LANGUAGE, "en"); + AnalysisEngineDescription filter = createEngineDescription(AnnotationBasedFilter.class); + AnalysisEngineDescription annotator = createEngineDescription(TestAnnotator.class); + AnalysisEngineDescription writer = createEngineDescription( + CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); + AnalysisEngineDescription aggregator = CasFilter_ImplBase + .createAggregateBuilderDescription(filter, writer); + + SimplePipeline.runPipeline(reader, annotator, aggregator); + + List<String> output = FileUtils.readLines(tmpFile); + assertEquals(expectedFirstLine, output.get(0)); + assertEquals(input, output.get(13)); + assertEquals("Sentence", output.get(15)); + } + + @Test + public void testAnnotationFilterRemove() + throws UIMAException, IOException + { + File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); + + String input = ""; + + CollectionReaderDescription reader = createReaderDescription(StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, input, + StringReader.PARAM_LANGUAGE, "en"); + AnalysisEngineDescription filter = createEngineDescription(AnnotationBasedFilter.class); + AnalysisEngineDescription annotator = createEngineDescription(TestAnnotator.class); + AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); + AnalysisEngineDescription aggregator = CasFilter_ImplBase + .createAggregateBuilderDescription(filter, writer); + + SimplePipeline.runPipeline(reader, annotator, aggregator); + assertTrue(FileUtils.readFileToString(tmpFile).isEmpty()); + } + + @Test + public void testEmptyDocumentFilterRemove() + throws UIMAException, IOException + { + File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); + + String input = ""; + + CollectionReaderDescription reader = createReaderDescription(StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, input, + StringReader.PARAM_LANGUAGE, "en"); + AnalysisEngineDescription filter = createEngineDescription(EmptyDocumentFilter.class); + AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); + AnalysisEngineDescription aggregator = CasFilter_ImplBase + .createAggregateBuilderDescription(filter, writer); + + SimplePipeline.runPipeline(reader, aggregator); + assertTrue(FileUtils.readFileToString(tmpFile).isEmpty()); + } + + @Test + public void testEmptyDocumentFilterPass() + throws UIMAException, IOException + { + File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); + + String input = "test"; + + CollectionReaderDescription reader = createReaderDescription(StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, input, + StringReader.PARAM_LANGUAGE, "en"); + AnalysisEngineDescription filter = createEngineDescription(EmptyDocumentFilter.class); + AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); + AnalysisEngineDescription aggregator = CasFilter_ImplBase + .createAggregateBuilderDescription(filter, writer); + + SimplePipeline.runPipeline(reader, aggregator); + assertFalse(FileUtils.readFileToString(tmpFile).isEmpty()); + } + + @Test + public void testLanguageFilterPass() + throws UIMAException, IOException + { + File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); + + String input = "test"; + + CollectionReaderDescription reader = createReaderDescription(StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, input, + StringReader.PARAM_LANGUAGE, "en"); + AnalysisEngineDescription filter = createEngineDescription(LanguageFilter.class, + LanguageFilter.PARAM_REQUIRED_LANGUAGES, new String[] { "de", "en" }); + AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); + AnalysisEngineDescription aggregator = CasFilter_ImplBase + .createAggregateBuilderDescription(filter, writer); + + SimplePipeline.runPipeline(reader, aggregator); + assertFalse(FileUtils.readFileToString(tmpFile).isEmpty()); + } + + @Test + public void testLanguageFilterRemove() + throws UIMAException, IOException + { + File tmpFile = new File(testContext.getTestOutputFolder(), "output.dump"); + + String input = "test"; + + CollectionReaderDescription reader = createReaderDescription(StringReader.class, + StringReader.PARAM_DOCUMENT_TEXT, input, + StringReader.PARAM_LANGUAGE, "ch"); + AnalysisEngineDescription filter = createEngineDescription(LanguageFilter.class, + LanguageFilter.PARAM_REQUIRED_LANGUAGES, new String[] { "de", "en" }); + AnalysisEngineDescription writer = createEngineDescription(CasDumpWriter.class, + CasDumpWriter.PARAM_TARGET_LOCATION, tmpFile); + AnalysisEngineDescription aggregator = CasFilter_ImplBase + .createAggregateBuilderDescription(filter, writer); + + SimplePipeline.runPipeline(reader, aggregator); + assertTrue(FileUtils.readFileToString(tmpFile).isEmpty()); + } + + public static class TestAnnotator + extends JCasAnnotator_ImplBase + { + /** + * Create one sentence over the full text unless the text is empty. + * + * @param aJCas + * @throws AnalysisEngineProcessException + */ + @Override + public void process(JCas aJCas) + throws AnalysisEngineProcessException + { + String text = aJCas.getDocumentText(); + if (text.length() > 0) { + Sentence sentence = new Sentence(aJCas); + sentence.setBegin(0); + sentence.setEnd(text.length()); + sentence.addToIndexes(aJCas); + } + } + } + + public static class AnnotationBasedFilter + extends CasFilter_ImplBase + { + /** + * filter out documents that do not contain any sentence annotation. + */ + @Override + protected boolean pass(JCas aJCas) + { + return select(aJCas, Sentence.class).size() > 0; + } + } + + public static class EmptyDocumentFilter + extends CasFilter_ImplBase + { + @Override + protected boolean pass(JCas aJCas) + { + return aJCas.getDocumentText().length() > 0; + } + } + + public static class LanguageFilter + extends CasFilter_ImplBase + { + public static final String PARAM_REQUIRED_LANGUAGES = "requiredLanguages"; + @ConfigurationParameter(name = PARAM_REQUIRED_LANGUAGES, mandatory = true) + Set<String> requiredLanguages; + + @Override + protected boolean pass(JCas aJCas) + { + return requiredLanguages.contains(aJCas.getDocumentLanguage()); + } + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/factory/NormalizerFactoryTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/factory/NormalizerFactoryTest.java new file mode 100644 index 0000000000..9d5787583d --- /dev/null +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/factory/NormalizerFactoryTest.java @@ -0,0 +1,111 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textnormalizer.factory; + +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; +import static org.junit.Assert.assertEquals; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.factory.AggregateBuilder; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ExternalResourceDescription; +import org.dkpro.core.frequency.resources.Web1TFrequencyCountResource; +import org.dkpro.core.textnormalizer.ReplacementFileNormalizer.SrcSurroundings; +import org.dkpro.core.textnormalizer.ReplacementFileNormalizer.TargetSurroundings; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +public class NormalizerFactoryTest +{ + private ExternalResourceDescription frequencyProvider; + + @Before + public void init() { + frequencyProvider = createResourceDescription( + Web1TFrequencyCountResource.class, + Web1TFrequencyCountResource.PARAM_LANGUAGE, "de", + Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", + Web1TFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "1", + Web1TFrequencyCountResource.PARAM_INDEX_PATH, "src/test/resources/jweb1t"); + } + + @Ignore("This test will not work while the module is in transition - after we need to fix it") + @Test + public void testNormalizerFactory() throws Exception + { + test("GMBH +++ Gewerkschaftb +++ HDL +++ :-)", + "GmbH +++ Gewerkschaft +++ Hab' dich lieb +++ lächeln "); + } + + + public void test(String input, String output) throws Exception + { + NormalizerFactory nf = new NormalizerFactory(); + AnalysisEngineDescription normalizeSharpSUmlaute = nf + .getUmlautSharpSNormalization(frequencyProvider, 0); + AnalysisEngineDescription normalizeRepitions = nf + .getExpressiveLengtheningNormalization(frequencyProvider); + AnalysisEngineDescription normalizeCapitalization = nf + .getCapitalizationNormalization(frequencyProvider); + AnalysisEngineDescription normalizeInternetslang = nf.getReplacementNormalization( + "src/main/resources/replaceLists/internetslang.txt", + SrcSurroundings.ONLY_ALPHANIMERIC, TargetSurroundings.NOTHING); + AnalysisEngineDescription normalizeSpelling = nf + .getSpellcorrection("src/test/resources/dictionary/ngerman"); + AnalysisEngineDescription normalizeEmoticons = nf.getReplacementNormalization( + "src/main/resources/replaceLists/emoticons_de.txt", SrcSurroundings.IRRELEVANT, + TargetSurroundings.WHITESPACE); + + AggregateBuilder ab = new AggregateBuilder(); + ab.add(normalizeSharpSUmlaute); + ab.add(normalizeRepitions); + ab.add(normalizeCapitalization); + ab.add(normalizeInternetslang); + ab.add(normalizeSpelling); + ab.add(normalizeEmoticons); + + AnalysisEngine engine = ab.createAggregate(); + JCas jcas = engine.newJCas(); + jcas.setDocumentText(input); + DocumentMetaData.create(jcas); + engine.process(jcas); + + JCas view0 = jcas.getView("_InitialView"); + JCas view1 = jcas.getView("view1"); + JCas view2 = jcas.getView("view2"); + JCas view3 = jcas.getView("view3"); + JCas view4 = jcas.getView("view4"); + JCas view5 = jcas.getView("view5"); + JCas view6 = jcas.getView("view6"); + + System.out.println("Original :" + view0.getDocumentText()); + System.out.println("Umlaute :" + view1.getDocumentText()); + System.out.println("Repitions :" + view2.getDocumentText()); + System.out.println("Capitalization :" + view3.getDocumentText()); + System.out.println("Internetslang :" + view4.getDocumentText()); + System.out.println("Spelling :" + view5.getDocumentText()); + System.out.println("Emoticons :" + view6.getDocumentText()); + System.out.println("Perfect :" + output); + + assertEquals(output, view6.getDocumentText()); + } +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/CapitalizationNormalizerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/CapitalizationNormalizerTest.java similarity index 80% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/CapitalizationNormalizerTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/CapitalizationNormalizerTest.java index b2e04f4ae0..01b069772e 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/CapitalizationNormalizerTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/CapitalizationNormalizerTest.java @@ -15,18 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; +package org.dkpro.core.textnormalizer.frequency; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.frequency.resources.Web1TFrequencyCountResource; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.frequency.resources.Web1TFrequencyCountResource; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - public class CapitalizationNormalizerTest { @Test @@ -40,7 +39,7 @@ public void test() AnalysisEngineDescription normalizer = createEngineDescription( CapitalizationNormalizer.class, - CapitalizationNormalizer.FREQUENCY_PROVIDER, createExternalResourceDescription( + CapitalizationNormalizer.RES_FREQUENCY_PROVIDER, createResourceDescription( Web1TFrequencyCountResource.class, Web1TFrequencyCountResource.PARAM_LANGUAGE, "de", Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", diff --git a/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizerTest.java new file mode 100644 index 0000000000..c473b3ebda --- /dev/null +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/ExpressiveLengtheningNormalizerTest.java @@ -0,0 +1,56 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textnormalizer.frequency; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.frequency.resources.Web1TFrequencyCountResource; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Test; + +public class ExpressiveLengtheningNormalizerTest +{ + @Test + public void test() + throws Exception + { + runTest("süüüüß", "süß"); + runTest("Kresssse", "Kresse"); + runTest("Kresseee", "Kresse"); + } + + public void runTest(String inputText, String normalizedText) throws Exception + { + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription normalizer = createEngineDescription( + ExpressiveLengtheningNormalizer.class, + ExpressiveLengtheningNormalizer.RES_FREQUENCY_PROVIDER, + createResourceDescription( + Web1TFrequencyCountResource.class, + Web1TFrequencyCountResource.PARAM_LANGUAGE, "de", + Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", + Web1TFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "1", + Web1TFrequencyCountResource.PARAM_INDEX_PATH, "src/test/resources/jweb1t")); + + assertTransformedText(normalizedText, inputText, "de", segmenter, normalizer); + } +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/SharpSNormalizerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/SharpSNormalizerTest.java similarity index 81% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/SharpSNormalizerTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/SharpSNormalizerTest.java index be7c6b60e4..ec61d9d52c 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/SharpSNormalizerTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/SharpSNormalizerTest.java @@ -15,18 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; +package org.dkpro.core.textnormalizer.frequency; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.frequency.resources.Web1TFrequencyCountResource; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.frequency.resources.Web1TFrequencyCountResource; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - public class SharpSNormalizerTest { @Test @@ -44,7 +43,7 @@ public void test(String inputText, String normalizedText) AnalysisEngineDescription normalizer = createEngineDescription( SharpSNormalizer.class, SharpSNormalizer.PARAM_MIN_FREQUENCY_THRESHOLD,0, - SharpSNormalizer.FREQUENCY_PROVIDER, createExternalResourceDescription( + SharpSNormalizer.RES_FREQUENCY_PROVIDER, createResourceDescription( Web1TFrequencyCountResource.class, Web1TFrequencyCountResource.PARAM_LANGUAGE, "de", Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/UmlautNormalizerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/UmlautNormalizerTest.java similarity index 79% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/UmlautNormalizerTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/UmlautNormalizerTest.java index 44748c06b5..81af73d08b 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/frequency/UmlautNormalizerTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/frequency/UmlautNormalizerTest.java @@ -1,55 +1,54 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; - -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.ExternalResourceFactory.createExternalResourceDescription; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.frequency.resources.Web1TFrequencyCountResource; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - -public class UmlautNormalizerTest -{ - @Test - public void test() throws Exception - { - test("Der Apfel ist sueß und lecker", "Der Apfel ist süß und lecker"); - } - - public void test(String inputText, String normalizedText) - throws Exception - { - AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); - - AnalysisEngineDescription normalizer = createEngineDescription( - UmlautNormalizer.class, - UmlautNormalizer.PARAM_MIN_FREQUENCY_THRESHOLD,0, - UmlautNormalizer.FREQUENCY_PROVIDER, createExternalResourceDescription( - Web1TFrequencyCountResource.class, - Web1TFrequencyCountResource.PARAM_LANGUAGE, "de", - Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", - Web1TFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "1", - Web1TFrequencyCountResource.PARAM_INDEX_PATH, "src/test/resources/jweb1t")); - - assertTransformedText(normalizedText, inputText, "de", segmenter, normalizer); - } -} +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.textnormalizer.frequency; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.factory.ExternalResourceFactory.createResourceDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; + +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.dkpro.core.frequency.resources.Web1TFrequencyCountResource; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.junit.Test; + +public class UmlautNormalizerTest +{ + @Test + public void test() throws Exception + { + test("Der Apfel ist sueß und lecker", "Der Apfel ist süß und lecker"); + } + + public void test(String inputText, String normalizedText) + throws Exception + { + AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class); + + AnalysisEngineDescription normalizer = createEngineDescription( + UmlautNormalizer.class, + UmlautNormalizer.PARAM_MIN_FREQUENCY_THRESHOLD,0, + UmlautNormalizer.RES_FREQUENCY_PROVIDER, createResourceDescription( + Web1TFrequencyCountResource.class, + Web1TFrequencyCountResource.PARAM_LANGUAGE, "de", + Web1TFrequencyCountResource.PARAM_MIN_NGRAM_LEVEL, "1", + Web1TFrequencyCountResource.PARAM_MAX_NGRAM_LEVEL, "1", + Web1TFrequencyCountResource.PARAM_INDEX_PATH, "src/test/resources/jweb1t")); + + assertTransformedText(normalizedText, inputText, "de", segmenter, normalizer); + } +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformerTest.java similarity index 89% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformerTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformerTest.java index ebb03c25c5..c749be4abc 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformerTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/DictionaryBasedTokenTransformerTest.java @@ -16,16 +16,15 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - public class DictionaryBasedTokenTransformerTest { private static final String MAPPINGS_FILE = "src/test/resources/mappings.txt"; diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformerTest.java similarity index 90% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformerTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformerTest.java index 23dfedb74f..cc9e55bfb3 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformerTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/FileBasedTokenTransformerTest.java @@ -16,15 +16,14 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - /** * * @see FileBasedTokenTransformer diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/HyphenationRemoverTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/HyphenationRemoverTest.java similarity index 86% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/HyphenationRemoverTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/HyphenationRemoverTest.java index 765a5ec9eb..dc3ff6c14f 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/HyphenationRemoverTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/HyphenationRemoverTest.java @@ -16,28 +16,27 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; +import static org.apache.commons.io.FileUtils.readFileToString; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.junit.Assert.assertEquals; import java.io.File; -import static org.apache.commons.io.FileUtils.*; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.pipeline.SimplePipeline; +import org.dkpro.core.io.text.TextReader; +import org.dkpro.core.io.text.TokenizedTextWriter; +import org.dkpro.core.opennlp.OpenNlpSegmenter; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.EOLUtils; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; -import de.tudarmstadt.ukp.dkpro.core.io.text.TokenizedTextWriter; -import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.EOLUtils; - public class HyphenationRemoverTest { private static final String RESOURCE_GERMAN_DICTIONARY = "src/test/resources/dictionary/ngerman"; @@ -65,7 +64,7 @@ public void testHyphenationRemoverInPipelineReaderWriter() final String variant = "maxent"; String sourcePath = "src/test/resources/texts/test3.txt"; - final String expected = "Ich habe einen super-tollen Bären .\n"+ + final String expected = "Ich habe einen super-tollen Bären .\n" + "Für eine Registrierung einer Organisation und eine EMail Adresse .\n"; /* process input file */ @@ -96,4 +95,4 @@ TokenizedTextWriter.PARAM_TARGET_LOCATION, new File(outputPath, "test3.txt"), @Rule public DkproTestContext testContext = new DkproTestContext(); -} \ No newline at end of file +} diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/JCasTransformerChangeBased_ImplBaseTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/JCasTransformerChangeBased_ImplBaseTest.java similarity index 85% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/JCasTransformerChangeBased_ImplBaseTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/JCasTransformerChangeBased_ImplBaseTest.java index d051962fc3..90d4985214 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/JCasTransformerChangeBased_ImplBaseTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/JCasTransformerChangeBased_ImplBaseTest.java @@ -16,17 +16,16 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; - public class JCasTransformerChangeBased_ImplBaseTest { @Test(expected = IllegalStateException.class) diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/JCasTransformer_ImplBaseTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/JCasTransformer_ImplBaseTest.java similarity index 95% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/JCasTransformer_ImplBaseTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/JCasTransformer_ImplBaseTest.java index c97dcf8ca6..f8d1b1f894 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/JCasTransformer_ImplBaseTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/JCasTransformer_ImplBaseTest.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -32,14 +32,14 @@ import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.transform.JCasTransformer_ImplBase; +import org.dkpro.core.io.text.StringReader; +import org.dkpro.core.textnormalizer.util.JCasHolder; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformer_ImplBase; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; -import de.tudarmstadt.ukp.dkpro.core.textnormalizer.util.JCasHolder; public class JCasTransformer_ImplBaseTest { diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformerTest.java similarity index 90% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformerTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformerTest.java index 53df912544..28d11a8ed0 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformerTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/RegexBasedTokenTransformerTest.java @@ -16,15 +16,14 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; - /** * * @see RegexBasedTokenTransformer diff --git a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/TokenCaseTransformerTest.java b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/TokenCaseTransformerTest.java similarity index 93% rename from dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/TokenCaseTransformerTest.java rename to dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/TokenCaseTransformerTest.java index 5ab396e551..593b60ee6d 100644 --- a/dkpro-core-textnormalizer-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/textnormalizer/transformation/TokenCaseTransformerTest.java +++ b/dkpro-core-textnormalizer-asl/src/test/java/org/dkpro/core/textnormalizer/transformation/TokenCaseTransformerTest.java @@ -16,22 +16,22 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; +package org.dkpro.core.textnormalizer.transformation; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTransformedText; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertTransformedText; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.TestRunner; +import org.dkpro.core.tokit.BreakIteratorSegmenter; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; public class TokenCaseTransformerTest { diff --git a/dkpro-core-textnormalizer-asl/src/test/resources/log4j.properties b/dkpro-core-textnormalizer-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-textnormalizer-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-textnormalizer-asl/src/test/resources/log4j2.xml b/dkpro-core-textnormalizer-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-textnormalizer-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-textnormalizer-asl/src/test/resources/mappings.txt b/dkpro-core-textnormalizer-asl/src/test/resources/mappings.txt index 2ca9b88e20..7ce4a7dbd5 100644 --- a/dkpro-core-textnormalizer-asl/src/test/resources/mappings.txt +++ b/dkpro-core-textnormalizer-asl/src/test/resources/mappings.txt @@ -1,2 +1,2 @@ -Brannfchweig Braunschweig -test1 test +Brannfchweig Braunschweig +test1 test diff --git a/dkpro-core-textnormalizer-asl/src/test/resources/testmessages_sports/1.txt b/dkpro-core-textnormalizer-asl/src/test/resources/testmessages_sports/1.txt index 8b6d067e8f..774eb1aaf7 100644 --- a/dkpro-core-textnormalizer-asl/src/test/resources/testmessages_sports/1.txt +++ b/dkpro-core-textnormalizer-asl/src/test/resources/testmessages_sports/1.txt @@ -1,5 +1,5 @@ -Spelling: Haustühr Konferenzf -Umlaut: Haustuer Mueller -Emoticons: :-D ;-) -SharpS: Grüße weiße +Spelling: Haustühr Konferenzf +Umlaut: Haustuer Mueller +Emoticons: :-D ;-) +SharpS: Grüße weiße Internet Slang: idk afk \ No newline at end of file diff --git a/dkpro-core-textnormalizer-asl/src/test/resources/testmessages_sports/2.txt b/dkpro-core-textnormalizer-asl/src/test/resources/testmessages_sports/2.txt index 9fed2ed2b4..1c06f0cde8 100644 --- a/dkpro-core-textnormalizer-asl/src/test/resources/testmessages_sports/2.txt +++ b/dkpro-core-textnormalizer-asl/src/test/resources/testmessages_sports/2.txt @@ -1,5 +1,5 @@ -Spelling: Werkstoffg Telefonzellle -Umlaut: Einkaufstuete Hoehe -Emoticons: :-P :-* -SharpS: heiße beißen +Spelling: Werkstoffg Telefonzellle +Umlaut: Einkaufstuete Hoehe +Emoticons: :-P :-* +SharpS: heiße beißen Internet Slang: thx afaik \ No newline at end of file diff --git a/dkpro-core-tokit-asl/pom.xml b/dkpro-core-tokit-asl/pom.xml index 216e3c58a2..b44e59bbba 100644 --- a/dkpro-core-tokit-asl/pom.xml +++ b/dkpro-core-tokit-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.tokit-asl</artifactId> + <artifactId>dkpro-core-tokit-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - Tokenization Toolkit</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -39,30 +40,38 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + </dependency> <dependency> <groupId>commons-jxpath</groupId> <artifactId>commons-jxpath</artifactId> <version>1.3</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> + </dependency> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-featurepath-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.featurepath-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -70,18 +79,18 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.ner-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-ner-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.io.text-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-io-text-asl</artifactId> <scope>test</scope> </dependency> </dependencies> diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/AnnotationByLengthFilter.java b/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/AnnotationByLengthFilter.java deleted file mode 100644 index 4910fd42ce..0000000000 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/AnnotationByLengthFilter.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Map.Entry; -import java.util.Set; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; - -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; -import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory; - -/** - * Removes annotations that do not conform to minimum or maximum length constraints. - * - * (This was previously called TokenFilter). - * - */ -@ResourceMetaData(name="Annotation-By-Length Filter") -public class AnnotationByLengthFilter - extends JCasAnnotator_ImplBase -{ - - /** - * A set of annotation types that should be filtered. - */ - public static final String PARAM_FILTER_ANNOTATION_TYPES = "FilterTypes"; - @ConfigurationParameter(name=PARAM_FILTER_ANNOTATION_TYPES, mandatory=true, defaultValue={}) - private Set<String> filterTypes; - - /** - * Any annotation in filterTypes shorter than this value will be removed. - */ - public static final String PARAM_MIN_LENGTH = "MinLengthFilter"; - @ConfigurationParameter(name=PARAM_MIN_LENGTH, mandatory=true, defaultValue="0") - private int minTokenLength; - - /** - * Any annotation in filterAnnotations shorter than this value will be removed. - */ - public static final String PARAM_MAX_LENGTH = "MaxLengthFilter"; - @ConfigurationParameter(name=PARAM_MAX_LENGTH, mandatory=true, defaultValue="1000") - private int maxTokenLength; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - - for (String filterType : filterTypes) { - try { - Collection<Annotation> toRemove = new ArrayList<Annotation>(); - for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(aJCas.getCas(), filterType)) { - int length = entry.getKey().getCoveredText().length(); - if (length < minTokenLength || length > maxTokenLength) { - toRemove.add((Annotation)entry.getKey()); - } - } - for (Annotation anno : toRemove) { - anno.removeFromIndexes(); - } - } - catch (FeaturePathException e) { - throw new AnalysisEngineProcessException(e); - } - } - } -} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/CamelCaseTokenSegmenter.java b/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/CamelCaseTokenSegmenter.java deleted file mode 100644 index 2eb4728a2a..0000000000 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/CamelCaseTokenSegmenter.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.CasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; - -import java.util.ArrayList; -import java.util.List; - -import static org.apache.uima.fit.util.JCasUtil.select; - -/** - * Split up existing tokens again if they are camel-case text. - */ -@ResourceMetaData(name="CamelCase Token Segmenter") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}) -public class CamelCaseTokenSegmenter - extends JCasAnnotator_ImplBase -{ - /** - * Whether to remove the original token. - * - * Default: {@code true} - */ - public static final String PARAM_DELETE_COVER = ComponentParameters.PARAM_DELETE_COVER; - @ConfigurationParameter(name = PARAM_DELETE_COVER, mandatory = true, defaultValue = "true") - private boolean deleteCover; - - /** - * Optional annotation type to markup the original covered token area when specified. This type must be a subtype of - * {@link Annotation}. - */ - public static final String PARAM_MARKUP_TYPE = "markupType"; - @ConfigurationParameter(name = PARAM_MARKUP_TYPE, mandatory = false) - private String markupType; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - List<Token> toAdd = new ArrayList<Token>(); - List<Token> toRemove = new ArrayList<Token>(); - - for (Token t : select(aJCas, Token.class)) { - if ((t.getEnd() - t.getBegin()) < 2) { - continue; - } - - String text = t.getCoveredText(); - int offset = t.getBegin(); - int start = 0; - boolean seenLower = Character.isLowerCase(text.charAt(0)); - for (int i = 1; i < text.length(); i++) { - // Upper-case means a new token is starting if we are at a lower-case/upper-case - // boundary. This allows us to properly treat "GetFileUploadURLRequest" - boolean nextIsLower = i + 1 < text.length() - && Character.isLowerCase(text.charAt(i + 1)); - if (Character.isUpperCase(text.charAt(i)) && (seenLower || nextIsLower)) { - toAdd.add(new Token(aJCas, offset + start, offset + i)); - start = i; - } - seenLower = Character.isLowerCase(text.charAt(i)); - } - - // If we would just create the same token again, better do nothing - if (start == 0) { - continue; - } - - // The rest goes into the final token - toAdd.add(new Token(aJCas, offset + start, offset + text.length())); - - if (deleteCover) { - toRemove.add(t); - } - - if(markupType != null) { - CAS cas = aJCas.getCas(); - AnnotationFS annotation = cas.createAnnotation(CasUtil.getType(cas, markupType), t.getBegin(), t.getEnd()); - cas.addFsToIndexes(annotation); - } - } - - for (Token t : toAdd) { - t.addToIndexes(); - } - - for (Token t : toRemove) { - t.removeFromIndexes(); - } - } -} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/GermanSeparatedParticleAnnotator.java b/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/GermanSeparatedParticleAnnotator.java deleted file mode 100644 index e8e252a3e3..0000000000 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/GermanSeparatedParticleAnnotator.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import java.util.List; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.LanguageCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Annotator to be used for post-processing of German corpora that have been lemmatized and - * POS-tagged with the TreeTagger, based on the STTS tagset. - * - * This Annotator deals with German particle verbs. Particle verbs consist of a particle and a stem, - * e.g. anfangen = an+fangen There are many usages of German particle verbs where the stem and the - * particle are separated, e.g., Wir fangen gleich an. The TreeTagger lemmatizes the verb stem as - * "fangen" and the separated particle as "an", the proper verblemma "anfangen" is thus not - * available as an annotation. The GermanSeparatedParticleAnnotator replaces the lemma of the stem - * of particle-verbs (e.g., fangen) by the proper verb lemma (e.g. anfangen) and leaves the lemma of - * the separated particle unchanged. - */ -@ResourceMetaData(name="German Separated Particle Annotator") -@LanguageCapability("de") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}) - -public class GermanSeparatedParticleAnnotator extends JCasAnnotator_ImplBase { - - @Override - public void process(JCas jcas) throws AnalysisEngineProcessException { - - for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { - List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, sentence); - for (int i = 0; i < tokens.size(); i++) { - Token token = tokens.get(i); - if (token.getPos() != null) { - if (token.getPos().getPosValue().matches("PTKVZ.*")) { - // go back and find the next finite verb - String particle = token.getText(); - String verblemma = ""; - - int j = i-1; - while (j >= 0){ - Token t = tokens.get(j); - if (t.getLemma() != null && t.getPos() != null) { - if (t.getPos().getPosValue().matches("V.*FIN")) { - verblemma = t.getLemma().getValue(); - Lemma l = t.getLemma(); - l.setValue(particle +verblemma); - break; - //l.addToIndexes(); // do not add to indexes: creates Lemma twice - } - } - j--; - } - - } - } - - } // for all tokens in the sentence - - } // for all sentences - - } // process - - -} // class diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/ParagraphSplitter.java b/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/ParagraphSplitter.java deleted file mode 100644 index 3eb32e649d..0000000000 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/ParagraphSplitter.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; - -/** - * This class creates paragraph annotations for the given input document. It searches for the - * occurrence of two or more line-breaks (Unix and Windows) and regards this as the boundary between - * paragraphs. - */ -@ResourceMetaData(name="Paragraph Splitter") -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph"}) - -public class ParagraphSplitter - extends JCasAnnotator_ImplBase -{ - public static final String SINGLE_LINE_BREAKS_PATTERN = "((\r\n)+(\r\n)*)|((\n)+(\n)*)"; - public static final String DOUBLE_LINE_BREAKS_PATTERN = "((\r\n\r\n)+(\r\n)*)|((\n\n)+(\n)*)"; - - /** - * A regular expression used to detect paragraph splits. - * - * Default: {@link #DOUBLE_LINE_BREAKS_PATTERN} (split on two consecutive line breaks) - */ - public static final String PARAM_SPLIT_PATTERN = "splitPattern"; - @ConfigurationParameter(name = PARAM_SPLIT_PATTERN, defaultValue = DOUBLE_LINE_BREAKS_PATTERN) - private Pattern splitPattern; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - String input = aJCas.getDocumentText(); - - if (input.length() < 1) { - throw new AnalysisEngineProcessException(new Throwable("Document text is empty.")); - } - - Pattern ParagraphPattern = splitPattern; - Matcher matcher = ParagraphPattern.matcher(input); - int pos = 0; - int nextBeginning = 0; - while (matcher.find(pos)) { - Paragraph paragraph = new Paragraph(aJCas, nextBeginning, matcher.start()); - paragraph.addToIndexes(); - nextBeginning = matcher.end(); - pos = matcher.end(); - } - if (pos < input.length()) { - Paragraph paragraph = new Paragraph(aJCas, nextBeginning, input.length()); - paragraph.addToIndexes(); - } - } -} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/PatternBasedTokenSegmenter.java b/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/PatternBasedTokenSegmenter.java deleted file mode 100644 index 1b3c54d6cd..0000000000 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/PatternBasedTokenSegmenter.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Split up existing tokens again at particular split-chars. - * The prefix states whether the split chars should be added as separate {@link Token Tokens}. - * If the {@link #INCLUDE_PREFIX} precedes the split pattern, the pattern is included. - * Consequently, patterns following the {@link #EXCLUDE_PREFIX}, will not be added as a Token. - */ -@ResourceMetaData(name="Pattern-based Token Segmenter") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}) - -public class PatternBasedTokenSegmenter - extends JCasAnnotator_ImplBase -{ - public static final String INCLUDE_PREFIX = "+|"; - public static final String EXCLUDE_PREFIX = "-|"; - - /** - * Wether to remove the original token. - * - * Default: {@code true} - */ - public static final String PARAM_DELETE_COVER = ComponentParameters.PARAM_DELETE_COVER; - @ConfigurationParameter(name = PARAM_DELETE_COVER, mandatory = true, defaultValue = "true") - private boolean deleteCover; - - /** - * A list of regular expressions, prefixed with {@link #INCLUDE_PREFIX} or - * {@link #EXCLUDE_PREFIX}. If neither of the prefixes is used, {@link #EXCLUDE_PREFIX} is - * assumed. - */ - public static final String PARAM_PATTERNS = "patterns"; - @ConfigurationParameter(name=PARAM_PATTERNS, mandatory=true) - private String[] rawPatterns; - - private StringBuilder buf; - - private SplitPattern[] patterns; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - patterns = new SplitPattern[rawPatterns.length]; - for (int i = 0; i < rawPatterns.length; i++) { - if (rawPatterns[i].startsWith(INCLUDE_PREFIX)) { - patterns[i] = new SplitPattern(rawPatterns[i].substring(INCLUDE_PREFIX.length()), true); - } - else if (rawPatterns[i].startsWith(EXCLUDE_PREFIX)) { - patterns[i] = new SplitPattern(rawPatterns[i].substring(EXCLUDE_PREFIX.length()), false); - } - else { - patterns[i] = new SplitPattern(rawPatterns[i], false); - } - } - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - buf = new StringBuilder(); - List<Token> toAdd = new ArrayList<Token>(); - List<Token> toRemove = new ArrayList<Token>(); - - for (Token t : select(aJCas, Token.class)) { - String text = t.getCoveredText(); - int offset = t.getBegin(); - int start = 0; - SplitPattern lastPattern = getPattern(text.charAt(0), null); - Token firstToken = null; - for (int i = 1; i < text.length(); i++) { - SplitPattern pattern = getPattern(text.charAt(i), lastPattern); - if (pattern != lastPattern) { - if (lastPattern == null || lastPattern.includeInOutput) { - Token nt = addToken(aJCas, offset, text, start, i, toAdd); - firstToken = (firstToken == null) ? nt : firstToken; - } - start = i; - } - lastPattern = pattern; - } - - // If we would just create the same token again, better do nothing - if (start == 0) { - // That is - if the whole token matches something to exclude, we remove it - if (lastPattern != null && !lastPattern.includeInOutput) { - toRemove.add(t); - } - continue; - } - - if (deleteCover) { - toRemove.add(t); - } - - // The rest goes into the final token - if (lastPattern == null || lastPattern.includeInOutput) { - addToken(aJCas, offset, text, start, text.length(), toAdd); - } - } - - for (Token t : toAdd) { - t.addToIndexes(); - } - - for (Token t : toRemove) { - t.removeFromIndexes(); - } - } - - private Token addToken(JCas aJCas, int offset, String text, int start, int end, List<Token> toAdd) - { - // No adding empty tokens - if (end == start) { - return null; - } - - Token t = new Token(aJCas, offset+start, offset+end); - toAdd.add(t); - return t; - } - - SplitPattern getPattern(char ch, SplitPattern aLastPattern) - { - buf.append(ch); - for (SplitPattern p : patterns) { - p.matchter.reset(buf); - if (p.matchter.matches()) { - if (p != aLastPattern) { - buf.setLength(0); - } - return p; - } - } - buf.setLength(0); - return null; - } - - private static class SplitPattern - { - final boolean includeInOutput; - final Matcher matchter; - - public SplitPattern(String aPattern, boolean aInclude) - { - includeInOutput = aInclude; - matchter = Pattern.compile(aPattern).matcher(""); - } - } -} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenMerger.java b/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenMerger.java deleted file mode 100644 index 76db7b7742..0000000000 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenMerger.java +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; - -import org.apache.commons.jxpath.JXPathContext; -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.CasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Merges any Tokens that are covered by a given annotation type. E.g. this component can be used - * to create a single tokens from all tokens that constitute a multi-token named entity. - */ -@ResourceMetaData(name="Token Merger") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}) -public class TokenMerger - extends JCasAnnotator_ImplBase -{ - public static enum LemmaMode { - JOIN, REMOVE, LEAVE - } - - /** - * Annotation type for which tokens should be merged. - */ - public static final String PARAM_ANNOTATION_TYPE = "annotationType"; - @ConfigurationParameter(name=PARAM_ANNOTATION_TYPE, mandatory=true) - private String annotationType; - - /** - * A constraint on the annotations that should be considered in form of a JXPath statement. - * Example: set {@link #PARAM_ANNOTATION_TYPE} to a {@code NamedEntity} type and set the - * {@link #PARAM_CONSTRAINT} to {@code ".[value = 'LOCATION']"} to merge only tokens that are - * part of a location named entity. - */ - public static final String PARAM_CONSTRAINT = "constraint"; - @ConfigurationParameter(name=PARAM_CONSTRAINT, mandatory=false) - private String constraint; - - /** - * Configure what should happen to the lemma of the merged tokens. It is possible to JOIN - * the lemmata to a single lemma (space separated), to REMOVE the lemma or LEAVE the lemma - * of the first token as-is. - */ - public static final String PARAM_LEMMA_MODE = "lemmaMode"; - @ConfigurationParameter(name=PARAM_LEMMA_MODE, mandatory=true, defaultValue="JOIN") - private LemmaMode lemmaMode; - - /** - * Set a new POS value for the new merged token. This is the actual tag set value and is subject - * to tagset mapping. For example when merging tokens for named entities, the new POS value - * may be set to "NNP" (English/Penn Treebank Tagset). - */ - public static final String PARAM_POS_VALUE = "posValue"; - @ConfigurationParameter(name=PARAM_POS_VALUE, mandatory=false) - private String posValue; - - /** - * Set a new coarse POS value for the new merged token. This is the actual tag set value and is - * subject to tagset mapping. For example when merging tokens for named entities, the new POS - * value may be set to "NNP" (English/Penn Treebank Tagset). - */ - public static final String PARAM_CPOS_VALUE = "cposValue"; - @ConfigurationParameter(name=PARAM_CPOS_VALUE, mandatory=false) - private String cposValue; - - /** - * Set a new POS tag for the new merged token. This is the mapped type. If this is specified, - * tag set mapping will not be performed. This parameter has no effect unless PARAM_POS_VALUE - * is also set. - */ - public static final String PARAM_POS_TYPE = "posType"; - @ConfigurationParameter(name=PARAM_POS_TYPE, mandatory=false) - private String posType; - - /** - * Use this language instead of the document language to resolve the model and tag set mapping. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the tagset mapping. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - mappingProvider = new MappingProvider(); - mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" + - "core/api/lexmorph/tagset/${language}-${pos.tagset}-pos.map"); - mappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName()); - mappingProvider.setDefault("pos.tagset", "default"); - mappingProvider.setOverride(MappingProvider.LOCATION, posMappingLocation); - mappingProvider.setOverride(MappingProvider.LANGUAGE, language); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - if (posValue != null) { - mappingProvider.configure(cas); - } - - List<AnnotationFS> covers = new ArrayList<>(CasUtil.select(cas, - CasUtil.getAnnotationType(cas, annotationType))); - Collection<Annotation> toRemove = new ArrayList<Annotation>(); - for (AnnotationFS cover : covers) { - List<Token> covered = selectCovered(Token.class, cover); - if (covered.size() < 2) { - continue; - } - - if (constraint != null) { - JXPathContext ctx = JXPathContext.newContext(cover); - boolean match = ctx.iterate(constraint).hasNext(); - if (!match) { - continue; - } - } - - Iterator<Token> i = covered.iterator(); - - // Extend first token - Token token = i.next(); - token.removeFromIndexes(); - token.setEnd(covered.get(covered.size() - 1).getEnd()); - token.addToIndexes(); - - // Optionally update the POS value - if (posValue != null) { - updatePos(token, toRemove); - } - - // Record lemma - may be needed for join later - List<String> lemmata = new ArrayList<String>(); - if (token.getLemma() != null) { - lemmata.add(token.getLemma().getValue()); - } - - // Mark the rest for deletion - record lemmata if desired for later join - while (i.hasNext()) { - Token t = i.next(); - - Lemma lemma = t.getLemma(); - if (lemma != null) { - lemmata.add(lemma.getValue()); - toRemove.add(lemma); - } - - POS pos = t.getPos(); - if (pos != null) { - toRemove.add(pos); - } - - toRemove.add(t); - } - - // Join lemmata if desired - if (lemmaMode == LemmaMode.JOIN) { - Lemma lemma = token.getLemma(); - if (!lemmata.isEmpty()) { - if (lemma == null) { - lemma = new Lemma(aJCas); - } - lemma.setValue(StringUtils.join(lemmata, " ")); - } - // Remove if there was nothing to join... I don't really ever expect to get here - else if (lemma != null) { - token.setLemma(null); - toRemove.add(lemma); - } - } - // Remove the lemma - if desired - else if (lemmaMode == LemmaMode.REMOVE) { - Lemma lemma = token.getLemma(); - if (lemma != null) { - token.setLemma(null); - toRemove.add(lemma); - } - } - - // Update offsets for lemma - if (token.getLemma() != null) { - Lemma lemma = token.getLemma(); - lemma.removeFromIndexes(); - lemma.setBegin(token.getBegin()); - lemma.setEnd(token.getEnd()); - lemma.addToIndexes(); - } - } - - // Remove tokens no longer needed - for (Annotation t : toRemove) { - t.removeFromIndexes(); - } - } - - private void updatePos(Token aToken, Collection<Annotation> aToRemove) - { - // Determine the mapped type - Type type; - if (posType != null) { - type = CasUtil.getType(aToken.getCAS(), posType); - } - else { - type = mappingProvider.getTagType(posValue); - } - - POS pos = aToken.getPos(); - if (pos != null && !pos.getType().equals(type)) { - // Remove wrong existing POS annotation - aToRemove.add(pos); - pos = null; - } - - if (pos == null) { - // Create correct annotation - pos = (POS) aToken.getCAS().createAnnotation(type, aToken.getBegin(), - aToken.getEnd()); - pos.addToIndexes(); - } - else { - // Update offsets - no need to add to indexes, was in CAS already - pos.setBegin(aToken.getBegin()); - pos.setEnd(aToken.getEnd()); - } - - // Update the POS value - pos.setPosValue(posValue); - pos.setCoarseValue(cposValue); - aToken.setPos(pos); - } -} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenTrimmer.java b/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenTrimmer.java deleted file mode 100644 index 1320d12a15..0000000000 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenTrimmer.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.util.ArrayList; -import java.util.Collection; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -/** - * Remove prefixes and suffixes from tokens. - */ -@ResourceMetaData(name="Token Trimmer") -@TypeCapability( - inputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}) - -public class TokenTrimmer - extends JCasAnnotator_ImplBase -{ - /** - * List of prefixes to remove. - */ - public static final String PARAM_PREFIXES = "prefixes"; - @ConfigurationParameter(name=PARAM_PREFIXES, mandatory=true) - private String[] prefixes; - - /** - * List of suffixes to remove. - */ - public static final String PARAM_SUFFIXES = "suffixes"; - @ConfigurationParameter(name=PARAM_SUFFIXES, mandatory=true) - private String[] suffixes; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - Collection<Token> toRemove = new ArrayList<Token>(); - for (Token t : select(aJCas, Token.class)) { - String text = t.getCoveredText(); - for (String prefix : prefixes) { - if (text.startsWith(prefix)) { - t.setBegin(t.getBegin()+prefix.length()); - break; - } - } - - text = t.getCoveredText(); - for (String suffix : suffixes) { - if (text.endsWith(suffix)) { - t.setEnd(t.getEnd()-suffix.length()); - break; - } - } - - if (t.getCoveredText().length() == 0) { - toRemove.add(t); - } - } - for (Token t : toRemove) { - t.removeFromIndexes(); - } - } -} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/package-info.java b/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/package-info.java deleted file mode 100644 index 1fccfef907..0000000000 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/package-info.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Collection of tokenization and segmentation components. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/AnnotationByLengthFilter.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/AnnotationByLengthFilter.java new file mode 100644 index 0000000000..7a945d2211 --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/AnnotationByLengthFilter.java @@ -0,0 +1,96 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.tokit; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.featurepath.FeaturePathFactory; + +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Removes annotations that do not conform to minimum or maximum length constraints. + * + * (This was previously called TokenFilter). + */ +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Annotation-By-Length Filter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +public class AnnotationByLengthFilter + extends JCasAnnotator_ImplBase +{ + /** + * A set of annotation types that should be filtered. + */ + public static final String PARAM_FILTER_ANNOTATION_TYPES = "FilterTypes"; + @ConfigurationParameter(name = PARAM_FILTER_ANNOTATION_TYPES, mandatory = true, + defaultValue = {}) + private Set<String> filterTypes; + + /** + * Any annotation in filterTypes shorter than this value will be removed. + */ + public static final String PARAM_MIN_LENGTH = "MinLengthFilter"; + @ConfigurationParameter(name = PARAM_MIN_LENGTH, mandatory = true, defaultValue = "0") + private int minTokenLength; + + /** + * Any annotation in filterAnnotations shorter than this value will be removed. + */ + public static final String PARAM_MAX_LENGTH = "MaxLengthFilter"; + @ConfigurationParameter(name = PARAM_MAX_LENGTH, mandatory = true, defaultValue = "1000") + private int maxTokenLength; + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + + for (String filterType : filterTypes) { + try { + Collection<Annotation> toRemove = new ArrayList<Annotation>(); + for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(aJCas.getCas(), + filterType)) { + int length = entry.getKey().getCoveredText().length(); + if (length < minTokenLength || length > maxTokenLength) { + toRemove.add((Annotation) entry.getKey()); + } + } + for (Annotation anno : toRemove) { + anno.removeFromIndexes(); + } + } + catch (FeaturePathException e) { + throw new AnalysisEngineProcessException(e); + } + } + } +} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/BreakIteratorSegmenter.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/BreakIteratorSegmenter.java similarity index 84% rename from dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/BreakIteratorSegmenter.java rename to dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/BreakIteratorSegmenter.java index 0f3f6c4935..4e2d7b7f35 100644 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/BreakIteratorSegmenter.java +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/BreakIteratorSegmenter.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.tokit; +package org.dkpro.core.tokit; import java.text.BreakIterator; @@ -26,18 +26,20 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; /** * BreakIterator segmenter. */ -@ResourceMetaData(name="Java BreakIterator Segmenter") +@ResourceMetaData(name = "Java BreakIterator Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @LanguageCapability({ "ar", "be", "bg", "ca", "cs", "da", "de", "el", "en", "es", "et", "fi", "fr", - "ga", "hi", "hr", "hu", "in", "is", "it", "iw", "ja", "ko", "lt", "lv", "mk", "ms", "mt", - "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "sv", "th", "tr", "uk", "vi", - "zh" }) + "ga", "hi", "hr", "hu", "is", "it", "ja", "ko", "lt", "lv", "mk", "ms", "mt", "nl", "no", + "pl", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "sv", "th", "tr", "uk", "vi", "zh" }) @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", @@ -72,8 +74,9 @@ protected void process(JCas aJCas, String text, int zoneBegin) } else { int[] span = new int[] { last, cur }; - trim(aJCas.getDocumentText(), span); - processSentence(aJCas, aJCas.getDocumentText().substring(span[0], span[1]), span[0]); + TrimUtils.trim(aJCas.getDocumentText(), span); + processSentence(aJCas, aJCas.getDocumentText().substring(span[0], span[1]), + span[0]); } last = cur; cur = bi.next(); diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/CamelCaseTokenSegmenter.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/CamelCaseTokenSegmenter.java new file mode 100644 index 0000000000..d47609d5f2 --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/CamelCaseTokenSegmenter.java @@ -0,0 +1,124 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.api.parameter.ComponentParameters; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Split up existing tokens again if they are camel-case text. + */ +@Component(OperationType.SEGMENTER) +@ResourceMetaData(name = "CamelCase Token Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability(inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) +public class CamelCaseTokenSegmenter + extends JCasAnnotator_ImplBase +{ + /** + * Whether to remove the original token. + */ + public static final String PARAM_DELETE_COVER = ComponentParameters.PARAM_DELETE_COVER; + @ConfigurationParameter(name = PARAM_DELETE_COVER, mandatory = true, defaultValue = "true") + private boolean deleteCover; + + /** + * Optional annotation type to markup the original covered token area when specified. This type + * must be a subtype of {@link Annotation}. + */ + public static final String PARAM_MARKUP_TYPE = "markupType"; + @ConfigurationParameter(name = PARAM_MARKUP_TYPE, mandatory = false) + private String markupType; + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + List<Token> toAdd = new ArrayList<Token>(); + List<Token> toRemove = new ArrayList<Token>(); + + for (Token t : select(aJCas, Token.class)) { + if ((t.getEnd() - t.getBegin()) < 2) { + continue; + } + + String text = t.getCoveredText(); + int offset = t.getBegin(); + int start = 0; + boolean seenLower = Character.isLowerCase(text.charAt(0)); + for (int i = 1; i < text.length(); i++) { + // Upper-case means a new token is starting if we are at a lower-case/upper-case + // boundary. This allows us to properly treat "GetFileUploadURLRequest" + boolean nextIsLower = i + 1 < text.length() + && Character.isLowerCase(text.charAt(i + 1)); + if (Character.isUpperCase(text.charAt(i)) && (seenLower || nextIsLower)) { + toAdd.add(new Token(aJCas, offset + start, offset + i)); + start = i; + } + seenLower = Character.isLowerCase(text.charAt(i)); + } + + // If we would just create the same token again, better do nothing + if (start == 0) { + continue; + } + + // The rest goes into the final token + toAdd.add(new Token(aJCas, offset + start, offset + text.length())); + + if (deleteCover) { + toRemove.add(t); + } + + if (markupType != null) { + CAS cas = aJCas.getCas(); + AnnotationFS annotation = cas.createAnnotation(CasUtil.getType(cas, markupType), + t.getBegin(), t.getEnd()); + cas.addFsToIndexes(annotation); + } + } + + for (Token t : toAdd) { + t.addToIndexes(); + } + + for (Token t : toRemove) { + t.removeFromIndexes(); + } + } +} diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/GermanSeparatedParticleAnnotator.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/GermanSeparatedParticleAnnotator.java new file mode 100644 index 0000000000..977c079d43 --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/GermanSeparatedParticleAnnotator.java @@ -0,0 +1,98 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.LanguageCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Annotator to be used for post-processing of German corpora that have been lemmatized and + * POS-tagged with the TreeTagger, based on the STTS tagset. + * + * This Annotator deals with German particle verbs. Particle verbs consist of a particle and a stem, + * e.g. anfangen = an+fangen There are many usages of German particle verbs where the stem and the + * particle are separated, e.g., Wir fangen gleich an. The TreeTagger lemmatizes the verb stem as + * "fangen" and the separated particle as "an", the proper verblemma "anfangen" is thus not + * available as an annotation. The GermanSeparatedParticleAnnotator replaces the lemma of the stem + * of particle-verbs (e.g., fangen) by the proper verb lemma (e.g. anfangen) and leaves the lemma of + * the separated particle unchanged. + */ +@Component(OperationType.SEGMENTER) +@ResourceMetaData(name = "German Separated Particle Annotator") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@LanguageCapability("de") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) +public class GermanSeparatedParticleAnnotator + extends JCasAnnotator_ImplBase +{ + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException + { + + for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { + List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, sentence); + for (int i = 0; i < tokens.size(); i++) { + Token token = tokens.get(i); + if (token.getPos() != null) { + if (token.getPos().getPosValue().matches("PTKVZ.*")) { + // go back and find the next finite verb + String particle = token.getText(); + String verblemma = ""; + + int j = i - 1; + while (j >= 0) { + Token t = tokens.get(j); + if (t.getLemma() != null && t.getPos() != null) { + if (t.getPos().getPosValue().matches("V.*FIN")) { + verblemma = t.getLemma().getValue(); + Lemma l = t.getLemma(); + l.setValue(particle + verblemma); + break; + // l.addToIndexes(); // do not add to indexes: creates Lemma + // twice + } + } + j--; + } + } + } + } // for all tokens in the sentence + } // for all sentences + } // process +} // class diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/LineBasedSentenceSegmenter.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/LineBasedSentenceSegmenter.java similarity index 84% rename from dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/LineBasedSentenceSegmenter.java rename to dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/LineBasedSentenceSegmenter.java index 06622cf9e9..790374d484 100644 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/LineBasedSentenceSegmenter.java +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/LineBasedSentenceSegmenter.java @@ -16,14 +16,13 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.tokit; +package org.dkpro.core.tokit; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; +import org.dkpro.core.api.segmentation.SegmenterBase; /** * Annotates each line in the source text as a sentence. This segmenter is not capable of creating @@ -31,13 +30,11 @@ * * @deprecated Use {@link RegexSegmenter} */ -@ResourceMetaData(name="Line-based Sentence Segmenter") -@TypeCapability( - outputs={ - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"}) +@ResourceMetaData(name = "Line-based Sentence Segmenter") +@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) @Deprecated public class LineBasedSentenceSegmenter - extends SegmenterBase + extends SegmenterBase { @Override protected void process(JCas aJCas, String aText, int aZoneBegin) @@ -52,12 +49,12 @@ protected void process(JCas aJCas, String aText, int aZoneBegin) createSentence(aJCas, aZoneBegin + begin, aZoneBegin + cursor); begin = cursor + 1; } - + // Stop at end of text if (cursor >= aText.length()) { break; } - + cursor++; } } diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/ParagraphSplitter.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/ParagraphSplitter.java new file mode 100644 index 0000000000..d387c59f0e --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/ParagraphSplitter.java @@ -0,0 +1,83 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * This class creates paragraph annotations for the given input document. It searches for the + * occurrence of two or more line-breaks (Unix and Windows) and regards this as the boundary between + * paragraphs. + */ +@Component(OperationType.SEGMENTER) +@ResourceMetaData(name = "Paragraph Splitter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph"}) +public class ParagraphSplitter + extends JCasAnnotator_ImplBase +{ + public static final String SINGLE_LINE_BREAKS_PATTERN = "((\r\n)+(\r\n)*)|((\n)+(\n)*)"; + public static final String DOUBLE_LINE_BREAKS_PATTERN = "((\r\n\r\n)+(\r\n)*)|((\n\n)+(\n)*)"; + + /** + * A regular expression used to detect paragraph splits. + */ + public static final String PARAM_SPLIT_PATTERN = "splitPattern"; + @ConfigurationParameter(name = PARAM_SPLIT_PATTERN, defaultValue = DOUBLE_LINE_BREAKS_PATTERN) + private Pattern splitPattern; + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + String input = aJCas.getDocumentText(); + + if (input.length() < 1) { + throw new AnalysisEngineProcessException(new Throwable("Document text is empty.")); + } + + Pattern ParagraphPattern = splitPattern; + Matcher matcher = ParagraphPattern.matcher(input); + int pos = 0; + int nextBeginning = 0; + while (matcher.find(pos)) { + Paragraph paragraph = new Paragraph(aJCas, nextBeginning, matcher.start()); + paragraph.addToIndexes(); + nextBeginning = matcher.end(); + pos = matcher.end(); + } + if (pos < input.length()) { + Paragraph paragraph = new Paragraph(aJCas, nextBeginning, input.length()); + paragraph.addToIndexes(); + } + } +} diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/PatternBasedTokenSegmenter.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/PatternBasedTokenSegmenter.java new file mode 100644 index 0000000000..5fc5dca819 --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/PatternBasedTokenSegmenter.java @@ -0,0 +1,196 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Split up existing tokens again at particular split-chars. + * The prefix states whether the split chars should be added as separate {@link Token Tokens}. + * If the {@link #INCLUDE_PREFIX} precedes the split pattern, the pattern is included. + * Consequently, patterns following the {@link #EXCLUDE_PREFIX}, will not be added as a Token. + */ +@Component(OperationType.SEGMENTER) +@ResourceMetaData(name = "Pattern-based Token Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}) +public class PatternBasedTokenSegmenter + extends JCasAnnotator_ImplBase +{ + public static final String INCLUDE_PREFIX = "+|"; + public static final String EXCLUDE_PREFIX = "-|"; + + /** + * Whether to remove the original token. + */ + public static final String PARAM_DELETE_COVER = ComponentParameters.PARAM_DELETE_COVER; + @ConfigurationParameter(name = PARAM_DELETE_COVER, mandatory = true, defaultValue = "true") + private boolean deleteCover; + + /** + * A list of regular expressions, prefixed with {@link #INCLUDE_PREFIX} or + * {@link #EXCLUDE_PREFIX}. If neither of the prefixes is used, {@link #EXCLUDE_PREFIX} is + * assumed. + */ + public static final String PARAM_PATTERNS = "patterns"; + @ConfigurationParameter(name = PARAM_PATTERNS, mandatory = true) + private String[] rawPatterns; + + private StringBuilder buf; + + private SplitPattern[] patterns; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + patterns = new SplitPattern[rawPatterns.length]; + for (int i = 0; i < rawPatterns.length; i++) { + if (rawPatterns[i].startsWith(INCLUDE_PREFIX)) { + patterns[i] = new SplitPattern(rawPatterns[i].substring(INCLUDE_PREFIX.length()), + true); + } + else if (rawPatterns[i].startsWith(EXCLUDE_PREFIX)) { + patterns[i] = new SplitPattern(rawPatterns[i].substring(EXCLUDE_PREFIX.length()), + false); + } + else { + patterns[i] = new SplitPattern(rawPatterns[i], false); + } + } + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + buf = new StringBuilder(); + List<Token> toAdd = new ArrayList<Token>(); + List<Token> toRemove = new ArrayList<Token>(); + + for (Token t : select(aJCas, Token.class)) { + String text = t.getCoveredText(); + int offset = t.getBegin(); + int start = 0; + SplitPattern lastPattern = getPattern(text.charAt(0), null); + Token firstToken = null; + for (int i = 1; i < text.length(); i++) { + SplitPattern pattern = getPattern(text.charAt(i), lastPattern); + if (pattern != lastPattern) { + if (lastPattern == null || lastPattern.includeInOutput) { + Token nt = addToken(aJCas, offset, text, start, i, toAdd); + firstToken = (firstToken == null) ? nt : firstToken; + } + start = i; + } + lastPattern = pattern; + } + + // If we would just create the same token again, better do nothing + if (start == 0) { + // That is - if the whole token matches something to exclude, we remove it + if (lastPattern != null && !lastPattern.includeInOutput) { + toRemove.add(t); + } + continue; + } + + if (deleteCover) { + toRemove.add(t); + } + + // The rest goes into the final token + if (lastPattern == null || lastPattern.includeInOutput) { + addToken(aJCas, offset, text, start, text.length(), toAdd); + } + } + + for (Token t : toAdd) { + t.addToIndexes(); + } + + for (Token t : toRemove) { + t.removeFromIndexes(); + } + } + + private Token addToken(JCas aJCas, int offset, String text, int start, int end, + List<Token> toAdd) + { + // No adding empty tokens + if (end == start) { + return null; + } + + Token t = new Token(aJCas, offset + start, offset + end); + toAdd.add(t); + return t; + } + + SplitPattern getPattern(char ch, SplitPattern aLastPattern) + { + buf.append(ch); + for (SplitPattern p : patterns) { + p.matchter.reset(buf); + if (p.matchter.matches()) { + if (p != aLastPattern) { + buf.setLength(0); + } + return p; + } + } + buf.setLength(0); + return null; + } + + private static class SplitPattern + { + final boolean includeInOutput; + final Matcher matchter; + + public SplitPattern(String aPattern, boolean aInclude) + { + includeInOutput = aInclude; + matchter = Pattern.compile(aPattern).matcher(""); + } + } +} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/RegexSegmenter.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/RegexSegmenter.java similarity index 88% rename from dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/RegexSegmenter.java rename to dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/RegexSegmenter.java index 983a7d98d5..441f7a1169 100644 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/RegexSegmenter.java +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/RegexSegmenter.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.tokit; +package org.dkpro.core.tokit; import static org.apache.commons.lang3.StringUtils.isBlank; @@ -30,20 +30,22 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; /** * This segmenter splits sentences and tokens based on regular expressions that define the sentence * and token boundaries. * <p> - * The default behaviour is to split sentences by a line break and tokens by whitespace. + * The default behavior is to split sentences by a line break and tokens by whitespace. */ -@ResourceMetaData(name="Regex Segmenter") +@ResourceMetaData(name = "Regex Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( - outputs={ + outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) public class RegexSegmenter @@ -53,25 +55,25 @@ public class RegexSegmenter private static final String WHITESPACE_PATTERN = "[\\s\n]+"; /** - * Defines the pattern that is used as token end boundary. Default: {@code [\s\n]+} (matching - * whitespace and linebreaks. + * Defines the pattern that is used as token end boundary. * <p> * When setting custom patterns, take into account that the final token is often terminated by a * linebreak rather than the boundary character. Therefore, the newline typically has to be * added to the group of matching characters, e.g. {@code "tokenized-text"} is correctly * tokenized with the pattern {@code [-\n]}. - * */ public static final String PARAM_TOKEN_BOUNDARY_REGEX = "tokenBoundaryRegex"; - @ConfigurationParameter(name = PARAM_TOKEN_BOUNDARY_REGEX, mandatory = true, defaultValue = WHITESPACE_PATTERN) + @ConfigurationParameter(name = PARAM_TOKEN_BOUNDARY_REGEX, mandatory = true, + defaultValue = WHITESPACE_PATTERN) private String tokenBoundaryRegex; private Pattern tokenBoundaryPattern; /** - * Define the sentence boundary. Default: {@code \n} (assume one sentence per line). + * Define the sentence boundary. */ public static final String PARAM_SENTENCE_BOUNDARY_REGEX = "sentenceBoundaryRegex"; - @ConfigurationParameter(name = PARAM_SENTENCE_BOUNDARY_REGEX, mandatory = true, defaultValue = LINEBREAK_PATTERN) + @ConfigurationParameter(name = PARAM_SENTENCE_BOUNDARY_REGEX, mandatory = true, + defaultValue = LINEBREAK_PATTERN) private String sentenceBoundaryRegex; private Pattern sentenceBoundaryPattern; diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/StopWordRemover.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/StopWordRemover.java new file mode 100644 index 0000000000..c5c70e0061 --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/StopWordRemover.java @@ -0,0 +1,327 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static org.apache.commons.io.IOUtils.closeQuietly; +import static org.apache.uima.fit.util.CasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.getView; +import static org.apache.uima.util.Level.FINE; +import static org.dkpro.core.api.resources.ResourceUtils.resolveLocation; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Logger; +import org.dkpro.core.api.featurepath.FeaturePathException; +import org.dkpro.core.api.featurepath.FeaturePathInfo; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.tokit.internal.StopWordSet; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Remove all of the specified types from the CAS if their covered text is in the stop word + * dictionary. Also remove any other of the specified types that is covered by a matching instance. + */ +@Component(OperationType.NORMALIZER) +@ResourceMetaData(name = "Stop Word Remover") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord" }) +public class StopWordRemover + extends JCasAnnotator_ImplBase +{ + // VIEW NAMES + private static final String TOPIC_VIEW = "topic"; + private static final String DOC_VIEW = "doc"; + + /** + * A list of URLs from which to load the stop word lists. If an URL is prefixed with a language + * code in square brackets, the stop word list is only used for documents in that language. + * Using no prefix or the prefix "[*]" causes the list to be used for every document. + * Example: "[de]classpath:/stopwords/en_articles.txt" + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) + private Set<String> swFileNames; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") + private String modelEncoding; + + /** + * Feature paths for annotations that should be matched/removed. The default is + * + * <pre> + * StopWord.class.getName() + * Token.class.getName() + * Lemma.class.getName()+"/value" + * </pre> + */ + public static final String PARAM_PATHS = "Paths"; + @ConfigurationParameter(name = PARAM_PATHS, mandatory = false) + private Set<String> paths; + + /** + * Anything annotated with this type will be removed even if it does not match any word in the + * lists. + */ + public static final String PARAM_STOP_WORD_TYPE = "StopWordType"; + @ConfigurationParameter(name = PARAM_STOP_WORD_TYPE, mandatory = false) + private String stopWordType; + + private Map<String, StopWordSet> stopWordSets; + + @Override + public void initialize(UimaContext context) + throws ResourceInitializationException + { + super.initialize(context); + + // Set default paths. This cannot be done in the annotation because we cannot call + // methods there. + if (paths == null || paths.size() == 0) { + paths = new HashSet<String>(); + paths.add(StopWord.class.getName()); + paths.add(Token.class.getName()); + paths.add(Lemma.class.getName() + "/value"); + } + + // Set default stop word type. This cannot be done in the annotation because we cannot call + // methods there. + if (stopWordType == null) { + stopWordType = StopWord.class.getName(); + } + + try { + stopWordSets = new HashMap<String, StopWordSet>(); + for (String swFileName : swFileNames) { + String fileLocale = "*"; + // Check if a locale is defined for the file + if (swFileName.startsWith("[")) { + fileLocale = swFileName.substring(1, swFileName.indexOf(']')); + swFileName = swFileName.substring(swFileName.indexOf(']') + 1); + } + + // Fetch the set for the specified locale + StopWordSet set = stopWordSets.get(fileLocale); + if (set == null) { + set = new StopWordSet(); + stopWordSets.put(fileLocale, set); + } + + // Load the set + URL source = resolveLocation(swFileName, this, context); + InputStream is = null; + try { + is = source.openStream(); + set.load(is, modelEncoding); + } + finally { + closeQuietly(is); + } + + getLogger().info( + "Loaded stopwords for locale [" + fileLocale + "] from [" + source + "]"); + } + } + catch (IOException e1) { + throw new ResourceInitializationException(e1); + } + } + + @Override + public void process(JCas jcas) + throws AnalysisEngineProcessException + { + JCas doc = getView(jcas, DOC_VIEW, null); + JCas topic = getView(jcas, TOPIC_VIEW, null); + + try { + if (doc != null) { + check(doc); + } + + if (topic != null) { + check(topic); + } + + if (topic == null && doc == null) { + check(jcas); + } + } + catch (FeaturePathException e) { + throw new AnalysisEngineProcessException(e); + } + } + + private void check(JCas aJCas) + throws FeaturePathException + { + Logger log = getContext().getLogger(); + + Locale casLocale = new Locale(aJCas.getDocumentLanguage()); + StopWordSet anyLocaleSet = stopWordSets.get("*"); + StopWordSet casLocaleSet = stopWordSets.get(aJCas.getDocumentLanguage()); + + // Now really to the removal part + FeaturePathInfo fp = new FeaturePathInfo(); + for (String path : paths) { + // Create a sorted list of annotations that we can quickly search on + AnnotationFS[] candidates = getCandidates(aJCas); + + // Initialize list of annotations to remove + List<AnnotationFS> toRemove = new ArrayList<AnnotationFS>(); + + // Separate Typename and featurepath + String[] segments = path.split("/", 2); + + String typeName = segments[0]; + boolean isStopWordType = stopWordType.equals(typeName); + Type t = aJCas.getTypeSystem().getType(typeName); + if (t == null) { + throw new IllegalStateException("Type [" + typeName + "] not found in type system"); + } + + // initialize the FeaturePathInfo with the corresponding part + if (segments.length > 1) { + fp.initialize(segments[1]); + } + else { + fp.initialize(""); + } + + int safeStart = 0; + Iterator<Annotation> i = aJCas.getAnnotationIndex(t).iterator(); + while (i.hasNext()) { + Annotation anno = i.next(); + + // Move the start of the containment scanning range ahead if possible + while ((safeStart + 1) < candidates.length + && candidates[safeStart + 1].getEnd() < anno.getBegin()) { + safeStart++; + } + + String candidate = fp.getValue(anno).toLowerCase(casLocale); + if (isStopWordType || ((anyLocaleSet != null) && anyLocaleSet.contains(candidate)) + || ((casLocaleSet != null) && casLocaleSet.contains(candidate))) { + // Remove the annotation that matched the stop word + toRemove.add(anno); + if (log.isLoggable(FINE)) { + log.log(FINE, "Removing [" + + typeName.substring(typeName.lastIndexOf('.') + 1) + + "] annotated as stop word [" + anno.getCoveredText() + "]@" + + anno.getBegin() + ".." + anno.getEnd()); + } + + // Scan all potential annotations that may be covered the current + // annotation and remove them as well + int n = safeStart; + while (n < candidates.length && candidates[n].getBegin() < anno.getEnd()) { + if ((anno.getBegin() <= candidates[n].getBegin()) + && (candidates[n].getEnd() <= anno.getEnd())) { + if (log.isLoggable(FINE)) { + log.log(FINE, "Removing as well [" + + candidates[n].getClass().getSimpleName() + + "] annotated as stop word [" + + candidates[n].getCoveredText() + "]@" + + candidates[n].getBegin() + ".." + candidates[n].getEnd()); + } + toRemove.add(candidates[n]); + } + n++; + } + } + } + + // Remove from the CAS + for (AnnotationFS anno : toRemove) { + aJCas.removeFsFromIndexes(anno); + } + } + } + + private AnnotationFS[] getCandidates(JCas aJCas) + { + // Make a list of all the annotations that can be matched by the given paths. If any one + // of the paths match, we want to remove instances of all others being covered by the + // match as well. + List<AnnotationFS> candidateList = new ArrayList<AnnotationFS>(); + for (String path : paths) { + String[] segments = path.split("/", 2); + String typeName = segments[0]; + Type t = aJCas.getTypeSystem().getType(typeName); + if (t == null) { + throw new IllegalStateException("Type [" + typeName + "] not found in type system"); + } + + for (AnnotationFS fs : select(aJCas.getCas(), t)) { + candidateList.add(fs); + } + } + AnnotationFS[] candidates = candidateList.toArray(new AnnotationFS[candidateList.size()]); + Arrays.sort(candidates, new BeginEndComparator()); + return candidates; + + } + + static class BeginEndComparator implements Comparator<AnnotationFS> + { + @Override + public int compare(AnnotationFS aO1, AnnotationFS aO2) + { + if (aO1.getBegin() == aO2.getBegin()) { + return aO1.getEnd() - aO2.getEnd(); + } + else { + return aO1.getBegin() - aO2.getBegin(); + } + } + } +} diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/TokenMerger.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/TokenMerger.java new file mode 100644 index 0000000000..8ada9f27e1 --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/TokenMerger.java @@ -0,0 +1,296 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.tokit; + +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.api.resources.MappingProvider.BASE_TYPE; +import static org.dkpro.core.api.resources.ResourceObjectProviderBase.LANGUAGE; +import static org.dkpro.core.api.resources.ResourceObjectProviderBase.LOCATION; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.jxpath.JXPathContext; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; + +/** + * Merges any Tokens that are covered by a given annotation type. E.g. this component can be used + * to create a single tokens from all tokens that constitute a multi-token named entity. + */ +@ResourceMetaData(name = "Token Merger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}) +public class TokenMerger + extends JCasAnnotator_ImplBase +{ + public static enum LemmaMode + { + JOIN, REMOVE, LEAVE + } + + /** + * Annotation type for which tokens should be merged. + */ + public static final String PARAM_ANNOTATION_TYPE = "annotationType"; + @ConfigurationParameter(name = PARAM_ANNOTATION_TYPE, mandatory = true) + private String annotationType; + + /** + * A constraint on the annotations that should be considered in form of a JXPath statement. + * Example: set {@link #PARAM_ANNOTATION_TYPE} to a {@code NamedEntity} type and set the + * {@link #PARAM_CONSTRAINT} to {@code ".[value = 'LOCATION']"} to merge only tokens that are + * part of a location named entity. + */ + public static final String PARAM_CONSTRAINT = "constraint"; + @ConfigurationParameter(name = PARAM_CONSTRAINT, mandatory = false) + private String constraint; + + /** + * Configure what should happen to the lemma of the merged tokens. It is possible to JOIN the + * lemmata to a single lemma (space separated), to REMOVE the lemma or LEAVE the lemma of the + * first token as-is. + */ + public static final String PARAM_LEMMA_MODE = "lemmaMode"; + @ConfigurationParameter(name = PARAM_LEMMA_MODE, mandatory = true, defaultValue = "JOIN") + private LemmaMode lemmaMode; + + /** + * Set a new POS value for the new merged token. This is the actual tag set value and is subject + * to tagset mapping. For example when merging tokens for named entities, the new POS value may + * be set to "NNP" (English/Penn Treebank Tagset). + */ + public static final String PARAM_POS_VALUE = "posValue"; + @ConfigurationParameter(name = PARAM_POS_VALUE, mandatory = false) + private String posValue; + + /** + * Set a new coarse POS value for the new merged token. This is the actual tag set value and is + * subject to tagset mapping. For example when merging tokens for named entities, the new POS + * value may be set to "NNP" (English/Penn Treebank Tagset). + */ + public static final String PARAM_CPOS_VALUE = "cposValue"; + @ConfigurationParameter(name = PARAM_CPOS_VALUE, mandatory = false) + private String cposValue; + + /** + * Set a new POS tag for the new merged token. This is the mapped type. If this is specified, + * tag set mapping will not be performed. This parameter has no effect unless PARAM_POS_VALUE is + * also set. + */ + public static final String PARAM_POS_TYPE = "posType"; + @ConfigurationParameter(name = PARAM_POS_TYPE, mandatory = false) + private String posType; + + /** + * Use this language instead of the document language to resolve the model and tag set mapping. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the tagset mapping. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException + { + super.initialize(aContext); + + mappingProvider = new MappingProvider(); + mappingProvider.setDefault(LOCATION, + "classpath:/org/dkpro/core/api/lexmorph/tagset/${language}-${pos.tagset}-pos.map"); + mappingProvider.setDefault(BASE_TYPE, POS.class.getName()); + mappingProvider.setDefault("pos.tagset", "default"); + mappingProvider.setOverride(LOCATION, posMappingLocation); + mappingProvider.setOverride(LANGUAGE, language); + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + CAS cas = aJCas.getCas(); + + if (posValue != null) { + mappingProvider.configure(cas); + } + + List<AnnotationFS> covers = new ArrayList<>( + CasUtil.select(cas, CasUtil.getAnnotationType(cas, annotationType))); + Collection<Annotation> toRemove = new ArrayList<Annotation>(); + for (AnnotationFS cover : covers) { + List<Token> covered = selectCovered(Token.class, cover); + if (covered.size() < 2) { + continue; + } + + if (constraint != null) { + JXPathContext ctx = JXPathContext.newContext(cover); + boolean match = ctx.iterate(constraint).hasNext(); + if (!match) { + continue; + } + } + + Iterator<Token> i = covered.iterator(); + + // Extend first token + Token token = i.next(); + token.removeFromIndexes(); + token.setEnd(covered.get(covered.size() - 1).getEnd()); + token.addToIndexes(); + + // Optionally update the POS value + if (posValue != null) { + updatePos(token, toRemove); + } + + // Record lemma - may be needed for join later + List<String> lemmata = new ArrayList<String>(); + if (token.getLemma() != null) { + lemmata.add(token.getLemma().getValue()); + } + + // Mark the rest for deletion - record lemmata if desired for later join + while (i.hasNext()) { + Token t = i.next(); + + Lemma lemma = t.getLemma(); + if (lemma != null) { + lemmata.add(lemma.getValue()); + toRemove.add(lemma); + } + + POS pos = t.getPos(); + if (pos != null) { + toRemove.add(pos); + } + + toRemove.add(t); + } + + // Join lemmata if desired + if (lemmaMode == LemmaMode.JOIN) { + Lemma lemma = token.getLemma(); + if (!lemmata.isEmpty()) { + if (lemma == null) { + lemma = new Lemma(aJCas); + } + lemma.setValue(StringUtils.join(lemmata, " ")); + } + // Remove if there was nothing to join... I don't really ever expect to get here + else if (lemma != null) { + token.setLemma(null); + toRemove.add(lemma); + } + } + // Remove the lemma - if desired + else if (lemmaMode == LemmaMode.REMOVE) { + Lemma lemma = token.getLemma(); + if (lemma != null) { + token.setLemma(null); + toRemove.add(lemma); + } + } + + // Update offsets for lemma + if (token.getLemma() != null) { + Lemma lemma = token.getLemma(); + lemma.removeFromIndexes(); + lemma.setBegin(token.getBegin()); + lemma.setEnd(token.getEnd()); + lemma.addToIndexes(); + } + } + + // Remove tokens no longer needed + for (Annotation t : toRemove) { + t.removeFromIndexes(); + } + } + + private void updatePos(Token aToken, Collection<Annotation> aToRemove) + { + // Determine the mapped type + Type type; + if (posType != null) { + type = CasUtil.getType(aToken.getCAS(), posType); + } + else { + type = mappingProvider.getTagType(posValue); + } + + POS pos = aToken.getPos(); + if (pos != null && !pos.getType().equals(type)) { + // Remove wrong existing POS annotation + aToRemove.add(pos); + pos = null; + } + + if (pos == null) { + // Create correct annotation + pos = (POS) aToken.getCAS().createAnnotation(type, aToken.getBegin(), aToken.getEnd()); + pos.addToIndexes(); + } + else { + // Update offsets - no need to add to indexes, was in CAS already + pos.setBegin(aToken.getBegin()); + pos.setEnd(aToken.getEnd()); + } + + // Update the POS value + pos.setPosValue(posValue); + pos.setCoarseValue(cposValue); + aToken.setPos(pos); + } +} diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/TokenTrimmer.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/TokenTrimmer.java new file mode 100644 index 0000000000..1bceef5fe3 --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/TokenTrimmer.java @@ -0,0 +1,91 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dkpro.core.tokit; + +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Remove prefixes and suffixes from tokens. + */ +@Component(OperationType.NORMALIZER) +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) +public class TokenTrimmer + extends JCasAnnotator_ImplBase +{ + /** + * List of prefixes to remove. + */ + public static final String PARAM_PREFIXES = "prefixes"; + @ConfigurationParameter(name = PARAM_PREFIXES, mandatory = true) + private String[] prefixes; + + /** + * List of suffixes to remove. + */ + public static final String PARAM_SUFFIXES = "suffixes"; + @ConfigurationParameter(name = PARAM_SUFFIXES, mandatory = true) + private String[] suffixes; + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException + { + Collection<Token> toRemove = new ArrayList<Token>(); + for (Token t : select(aJCas, Token.class)) { + String text = t.getCoveredText(); + for (String prefix : prefixes) { + if (text.startsWith(prefix)) { + t.setBegin(t.getBegin() + prefix.length()); + break; + } + } + + text = t.getCoveredText(); + for (String suffix : suffixes) { + if (text.endsWith(suffix)) { + t.setEnd(t.getEnd() - suffix.length()); + break; + } + } + + if (t.getCoveredText().length() == 0) { + toRemove.add(t); + } + } + for (Token t : toRemove) { + t.removeFromIndexes(); + } + } +} diff --git a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/WhitespaceSegmenter.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/WhitespaceSegmenter.java similarity index 91% rename from dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/WhitespaceSegmenter.java rename to dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/WhitespaceSegmenter.java index a361bf2f98..5b6631b4a2 100644 --- a/dkpro-core-tokit-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/tokit/WhitespaceSegmenter.java +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/WhitespaceSegmenter.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.tokit; +package org.dkpro.core.tokit; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -25,10 +25,11 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.segmentation.SegmenterBase; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.DocumentationResource; /** * A strict whitespace tokenizer, i.e. tokenizes according to whitespaces and linebreaks only. @@ -38,9 +39,10 @@ * * @deprecated Use {@link RegexSegmenter} */ -@ResourceMetaData(name="Whitespace Segmenter") +@ResourceMetaData(name = "Whitespace Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( - outputs={ + outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) @Deprecated diff --git a/dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordSet.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/internal/StopWordSet.java similarity index 98% rename from dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordSet.java rename to dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/internal/StopWordSet.java index 29a7bcf7f7..5b60f6712c 100644 --- a/dkpro-core-stopwordremover-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordSet.java +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/internal/StopWordSet.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.stopwordremover; +package org.dkpro.core.tokit.internal; import java.io.BufferedReader; import java.io.FileReader; diff --git a/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/package-info.java b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/package-info.java new file mode 100644 index 0000000000..3d370e289a --- /dev/null +++ b/dkpro-core-tokit-asl/src/main/java/org/dkpro/core/tokit/package-info.java @@ -0,0 +1,24 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Collection of tokenization and segmentation components. + * + * @since 1.1.0 + */ +package org.dkpro.core.tokit; diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/BreakIteratorSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/BreakIteratorSegmenterTest.java deleted file mode 100644 index 91922e5397..0000000000 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/BreakIteratorSegmenterTest.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.text.BreakIterator; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.harness.SegmenterHarness; - -public -class BreakIteratorSegmenterTest -{ - @Ignore("Only needed to get the list of the supported languages for the @LanguageCapability") - @Test - public void listLocales() throws Exception - { - List<String> supportedLanguages = Arrays.stream(BreakIterator.getAvailableLocales()) - .map(l -> l.getLanguage()) - .distinct() - .sorted() - .filter(lang -> lang.length() == 2) - .collect(Collectors.toList()); - - System.out.printf("["); - for (String l : supportedLanguages) { - System.out.printf("\"%s\", ", l); - } - System.out.printf("]"); - } - - - @Test - public void run() throws Throwable - { - AnalysisEngineDescription aed = createEngineDescription(BreakIteratorSegmenter.class); - - SegmenterHarness.run(aed, "de.1", "de.4", "en.1", "en.2", "en.3", "en.6", "en.7", "en.9", - "ar.1", "zh.1", "zh.2"); - } - - @Test - public void testJapanese() throws Exception - { - JCas jcas = JCasFactory.createText("滧の べ滦榥榜ぶ 廤ま楺獣お 䨣みゅ騪", "ja"); - - AnalysisEngine aed = createEngine(BreakIteratorSegmenter.class); - aed.process(jcas); - - String[] tokens = { "滧", "の", "べ", "滦榥榜", "ぶ", "廤", "ま", "楺獣", "お", "䨣", "みゅ", "騪" }; - - AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); - } - - @Test - public void testZoning() throws Exception - { - SegmenterHarness.testZoning(BreakIteratorSegmenter.class); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/CamelCaseSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/CamelCaseSegmenterTest.java deleted file mode 100644 index 1c7ef19697..0000000000 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/CamelCaseSegmenterTest.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import java.util.Collection; -import java.util.List; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.toText; -import static org.junit.Assert.assertEquals; - -public class CamelCaseSegmenterTest -{ - @Test - public void testProcess() throws Exception - { - AnalysisEngine seg = createEngine(CamelCaseTokenSegmenter.class); - - // 0123456789012345678901 - String content = "ThisIsACamel CaseText"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 12).addToIndexes(); - new Token(cas, 13, 21).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("This", "Is", "A", "Camel", "Case", "Text"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } - - @Test - public void testProcess2() throws Exception - { - AnalysisEngine seg = createEngine(CamelCaseTokenSegmenter.class); - - // 01234567890123456789012 - String content = "GetFileUploadURLRequest"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 23).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("Get", "File", "Upload", "URL", "Request"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } - - @Test - public void testProcess3() throws Exception - { - AnalysisEngine seg = createEngine(CamelCaseTokenSegmenter.class); - - // 01234567890123 - String content = "_ORGANIZATION"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 1).addToIndexes(); - new Token(cas, 1, 13).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("_", "ORGANIZATION"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } - - @Test - public void testProcess4() throws Exception - { - // Verifying that the camel case token is marked up correctly when the optional markup type is specified - AnalysisEngine seg = createEngine( - CamelCaseTokenSegmenter.class, - CamelCaseTokenSegmenter.PARAM_MARKUP_TYPE, - Split.class // Just reusing the Split annotation type for this test - ); - - // 01234567890123456789012 - String content = "Try getFileUploadURLRequest Now"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 3).addToIndexes(); - new Token(cas, 4, 27).addToIndexes(); - new Token(cas, 28, 31).addToIndexes(); - - seg.process(cas); - - Collection<Split> markups = select(cas, Split.class); - assertEquals(1, markups.size()); - Split markup = markups.stream().findFirst().get(); - assertEquals(4, markup.getBegin()); - assertEquals(27, markup.getEnd()); - assertEquals("getFileUploadURLRequest", markup.getCoveredText()); - List<String> ref = asList("Try", "get", "File", "Upload", "URL", "Request", "Now"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - - } -} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/GermanSeparatedParticleAnnotatorTest.java b/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/GermanSeparatedParticleAnnotatorTest.java deleted file mode 100644 index 045c541d12..0000000000 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/GermanSeparatedParticleAnnotatorTest.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.testing.factory.TokenBuilder; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; - -public class GermanSeparatedParticleAnnotatorTest -{ - @Test - public void testGermanSeparatedParticles() - throws Exception - { - runTest("de", "Wir schlagen ein Treffen vor .", - new String[] { "wir", "schlagen", "eine", "Treffen", "vor", "." }, - new String[] { "PPER", "VVFIN", "ART", "NN", "PTKVZ", "$." }, - new String[] { "PPER", "VVFIN", "ART", "NN", "PTKVZ", "$." }, - new String[] { "wir", "vorschlagen", "eine", "Treffen", "vor", "." }); - - - runTest("de", "Fangen wir jetzt an ?", - new String[] { "fangen", "wir", "jetzt", "an", "?" }, - new String[] { "VVFIN", "PPER", "ADV", "PTKVZ", "$." }, - new String[] { "VVFIN", "PPER", "ADV", "PTKVZ", "$." }, - new String[] { "anfangen", "wir", "jetzt", "an", "?" }); - } - - private void runTest(String language, String testDocument, String[] documentTreeTaggerLemmas, - String[] documentCPosTags, String[] documentPosTags, String[] lemmatizedDocument) - throws UIMAException - { - - AnalysisEngineDescription processor = createEngineDescription( - - createEngineDescription(GermanSeparatedParticleAnnotator.class) - ); - - AnalysisEngine engine = createEngine(processor); - JCas aJCas = engine.newJCas(); - aJCas.setDocumentLanguage(language); - - TokenBuilder<Token, Sentence> tb = new TokenBuilder<Token, Sentence>(Token.class, - Sentence.class); - tb.buildTokens(aJCas, testDocument); - - int offset = 0; - for (Token token : JCasUtil.select(aJCas, Token.class)) { - POS pos = new POS(aJCas, token.getBegin(), token.getEnd()); - pos.setPosValue(documentPosTags[offset]); - pos.setCoarseValue(documentCPosTags[offset]); - pos.addToIndexes(); - - token.setPos(pos); - - Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(documentTreeTaggerLemmas[offset]); - lemma.addToIndexes(); - - token.setLemma(lemma); - - offset++; - } - engine.process(aJCas); - - AssertAnnotations.assertLemma(lemmatizedDocument, select(aJCas, Lemma.class)); - } - - @Rule - public TestName name = new TestName(); - - @Before - public void printSeparator() - { - System.out.println("\n=== " + name.getMethodName() + " ====================="); - } -} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/ParagraphSplitterTest.java b/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/ParagraphSplitterTest.java deleted file mode 100644 index 4839f4ca82..0000000000 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/ParagraphSplitterTest.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; - -public class ParagraphSplitterTest -{ - @Test - public void paragraphSplitterTest_SingleLineBreaks() - throws Exception - { - StringBuilder sb = new StringBuilder(); - sb.append("paragraph1"); - sb.append(System.getProperty("line.separator")); - sb.append(System.getProperty("line.separator")); - sb.append("paragraph2"); - sb.append(System.getProperty("line.separator")); - sb.append("paragraph3"); - - AnalysisEngine ae = createEngine(ParagraphSplitter.class, - ParagraphSplitter.PARAM_SPLIT_PATTERN, ParagraphSplitter.SINGLE_LINE_BREAKS_PATTERN); - - JCas jcas = ae.newJCas(); - jcas.setDocumentLanguage("en"); - jcas.setDocumentText(sb.toString()); - ae.process(jcas); - - int i = 0; - for (Paragraph paragraph : select(jcas, Paragraph.class)) { - if (i == 0) { - assertEquals("paragraph1", paragraph.getCoveredText()); - } - else if (i == 1) { - assertEquals("paragraph2", paragraph.getCoveredText()); - } - else if (i == 2) { - assertEquals("paragraph3", paragraph.getCoveredText()); - } - else { - fail("too many paragraphs"); - } - i++; - } - } - - @Test - public void paragraphSplitterTest_DoubleLineBreaks() - throws Exception - { - StringBuilder sb = new StringBuilder(); - sb.append("paragraph1"); - sb.append(System.getProperty("line.separator")); - sb.append(System.getProperty("line.separator")); - sb.append("paragraph2"); - - AnalysisEngine ae = createEngine(ParagraphSplitter.class); - - JCas jcas = ae.newJCas(); - jcas.setDocumentLanguage("en"); - jcas.setDocumentText(sb.toString()); - ae.process(jcas); - - int i = 0; - for (Paragraph paragraph : select(jcas, Paragraph.class)) { - if (i == 0) { - assertEquals("paragraph1", paragraph.getCoveredText()); - } - else if (i == 1) { - assertEquals("paragraph2", paragraph.getCoveredText()); - } - else { - fail("too many paragraphs"); - } - i++; - } - } -} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/PatternBasedTokenSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/PatternBasedTokenSegmenterTest.java deleted file mode 100644 index 2dde0bdcbf..0000000000 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/PatternBasedTokenSegmenterTest.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.toText; -import static org.junit.Assert.assertEquals; - -import java.util.List; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -public class PatternBasedTokenSegmenterTest -{ - private static final String[] PATTERNS = new String[] { - PatternBasedTokenSegmenter.INCLUDE_PREFIX + "[0-9]+", - PatternBasedTokenSegmenter.EXCLUDE_PREFIX + "[\\/?!&%\"'#<>_=.:;]+" }; - - @Test - public void testProcess() - throws Exception - { - AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, - PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); - - // 01234567890123456789012345 - String content = "This_Is_A_Camel Case_Text"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 15).addToIndexes(); - new Token(cas, 16, 25).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("This", "Is", "A", "Camel", "Case", "Text"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } - - @Test - public void testProcess2() - throws Exception - { - AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, - PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); - - // 0123456789012345678901234567 - String content = "This_Is.A_Camel_ _Case_Text"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 16).addToIndexes(); - new Token(cas, 17, 27).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("This", "Is", "A", "Camel", "Case", "Text"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } - - @Test - public void testProcess3() - throws Exception - { - AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, - PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); - - // 012345 - String content = "This_"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 5).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("This"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } - - @Test - public void testProcess4() - throws Exception - { - AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, - PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); - - // 0123456789012345 - String content = "rent25to29point9"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 16).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("rent", "25", "to", "29", "point", "9"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } - - @Test - public void testProcess5() - throws Exception - { - AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, - PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); - - // 012345 - String content = "_This"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 5).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("This"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } - - @Test - public void testProcess6() - throws Exception - { - AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, - PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); - - // 012345 - String content = "_This"; - JCas cas = seg.newJCas(); - cas.setDocumentText(content); - new Token(cas, 0, 1).addToIndexes(); - new Token(cas, 1, 5).addToIndexes(); - - seg.process(cas); - - List<String> ref = asList("This"); - List<String> tokens = toText(select(cas, Token.class)); - System.out.println(tokens); - assertEquals(ref, tokens); - } -} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenMergerTest.java b/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenMergerTest.java deleted file mode 100644 index 5752f311a9..0000000000 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/TokenMergerTest.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.tokit; - -import static java.util.Arrays.asList; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.toText; -import static org.junit.Assert.assertEquals; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -import org.apache.commons.jxpath.ClassFunctions; -import org.apache.commons.jxpath.DynamicPropertyHandler; -import org.apache.commons.jxpath.ExpressionContext; -import org.apache.commons.jxpath.JXPathContext; -import org.apache.commons.jxpath.JXPathIntrospector; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.fit.util.CasUtil; -import org.apache.uima.jcas.JCas; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PRON; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PUNCT; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_VERB; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.tokit.TokenMerger.LemmaMode; - -public class TokenMergerTest -{ - @Test - public void testSimpleMerge() - throws Exception - { - AnalysisEngine filter = createEngine(TokenMerger.class, - TokenMerger.PARAM_ANNOTATION_TYPE, NamedEntity.class); - - JCas jcas = initCas(); - filter.process(jcas); - - assertEquals(asList("I", "love", "New York", "."), pick(select(jcas, Token.class), "cas:text()")); - } - - @Test - public void testWithConstraintMatch() - throws Exception - { - AnalysisEngine filter = createEngine(TokenMerger.class, - TokenMerger.PARAM_ANNOTATION_TYPE, NamedEntity.class, - TokenMerger.PARAM_CONSTRAINT, ".[value = 'LOCATION']"); - - JCas jcas = initCas(); - filter.process(jcas); - - assertEquals(asList("I", "love", "New York", "."), toText(select(jcas, Token.class))); - } - - @Test - public void testWithConstraintNoMatch() - throws Exception - { - AnalysisEngine filter = createEngine(TokenMerger.class, - TokenMerger.PARAM_ANNOTATION_TYPE, NamedEntity.class, - TokenMerger.PARAM_CONSTRAINT, ".[value = 'PERSON']"); - - JCas jcas = initCas(); - filter.process(jcas); - - assertEquals(asList("I", "love", "New", "York", "."), toText(select(jcas, Token.class))); - } - - @Test - public void testSimpleMergeLemmaJoin() - throws Exception - { - AnalysisEngine filter = createEngine(TokenMerger.class, - TokenMerger.PARAM_ANNOTATION_TYPE, NamedEntity.class, - TokenMerger.PARAM_LEMMA_MODE, LemmaMode.JOIN); - - JCas jcas = initCas(); - filter.process(jcas); - - assertEquals(asList("I", "love", "new york", "."), pick(select(jcas, Token.class), "./lemma/value")); - } - - private JCas initCas() throws UIMAException - { - JCas jcas = JCasFactory.createJCas(); - - JCasBuilder builder = new JCasBuilder(jcas); - setLemmaPos(builder.add("I", Token.class), POS_PRON.class, "PRON", "I"); - builder.add(" "); - setLemmaPos(builder.add("love", Token.class), POS_VERB.class, "VERB", "love"); - builder.add(" "); - int m = setLemmaPos(builder.add("New", Token.class), POS_NOUN.class, "NOUN", "new").getBegin(); - builder.add(" "); - setLemmaPos(builder.add("York", Token.class), POS_NOUN.class, "NOUN", "york"); - NamedEntity city = builder.add(m, NamedEntity.class); - city.setValue("LOCATION"); - setLemmaPos(builder.add(".", Token.class), POS_PUNCT.class, "PUNCT", "."); - builder.close(); - - return builder.getJCas(); - } - - private Token setLemmaPos(Token aToken, Class<? extends POS> aPosType, String aPosValue, - String aLemma) - throws CASException - { - CAS cas = aToken.getCAS(); - - POS pos = (POS) cas.createAnnotation(CasUtil.getType(cas, aPosType), aToken.getBegin(), - aToken.getEnd()); - pos.setPosValue(aPosValue); - POSUtils.assignCoarseValue(pos); - aToken.setPos(pos); - - Lemma lemma = new Lemma(aToken.getCAS().getJCas(), aToken.getBegin(), aToken.getEnd()); - lemma.setValue(aLemma); - aToken.setLemma(lemma); - - return aToken; - } - - // ============================================================================================= - // == JXPath helper methods - // ============================================================================================= - - { - JXPathIntrospector.registerDynamicClass(FeatureStructure.class, FeatureStructureHandler.class); - } - - public static class FeatureStructureHandler implements DynamicPropertyHandler - { - @Override - public String[] getPropertyNames(Object aObject) - { - FeatureStructure fs = (FeatureStructure) aObject; - Type t = fs.getType(); - List<Feature> features = t.getFeatures(); - String[] featureNames = new String[features.size()]; - - int i = 0; - for (Feature f : features) { - featureNames[i] = f.getShortName(); - i++; - } - return featureNames; - } - - @Override - public Object getProperty(Object aObject, String aPropertyName) - { - FeatureStructure fs = (FeatureStructure) aObject; - Feature f = fs.getType().getFeatureByBaseName(aPropertyName); - if (CAS.TYPE_NAME_BOOLEAN.equals(f.getRange().getName())) { - return fs.getBooleanValue(f); - } - else if (CAS.TYPE_NAME_BYTE.equals(f.getRange().getName())) { - return fs.getByteValue(f); - } - else if (CAS.TYPE_NAME_DOUBLE.equals(f.getRange().getName())) { - return fs.getDoubleValue(f); - } - else if (CAS.TYPE_NAME_FLOAT.equals(f.getRange().getName())) { - return fs.getFloatValue(f); - } - else if (CAS.TYPE_NAME_INTEGER.equals(f.getRange().getName())) { - return fs.getIntValue(f); - } - else if (CAS.TYPE_NAME_LONG.equals(f.getRange().getName())) { - return fs.getLongValue(f); - } - else if (CAS.TYPE_NAME_SHORT.equals(f.getRange().getName())) { - return fs.getShortValue(f); - } - else if (CAS.TYPE_NAME_STRING.equals(f.getRange().getName())) { - return fs.getStringValue(f); - } - else { - return fs.getFeatureValue(f); - } - } - - @Override - public void setProperty(Object aObject, String aPropertyName, Object aValue) - { - throw new UnsupportedOperationException(); - } - } - - @SuppressWarnings("unchecked") - public static List<Object> pick(Collection<?> aContext, String aPath) - { - List<Object> result = new ArrayList<Object>(); - for (Object a : aContext) { - JXPathContext ctx = JXPathContext.newContext(a); - ctx.setFunctions(new ClassFunctions(JXPathCasFunctions.class, "cas")); - result.addAll(ctx.selectNodes(aPath)); - } - return result; - } - - public static class JXPathCasFunctions - { - public static String text(ExpressionContext aCtx) - { - Object value = aCtx.getContextNodePointer().getValue(); - if (value instanceof AnnotationFS) { - return ((AnnotationFS) value).getCoveredText(); - } - else { - return String.valueOf(value); - } - } - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/AnnotationByLengthFilterTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/AnnotationByLengthFilterTest.java similarity index 82% rename from dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/AnnotationByLengthFilterTest.java rename to dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/AnnotationByLengthFilterTest.java index 8537f9f77b..d545c74675 100644 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/AnnotationByLengthFilterTest.java +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/AnnotationByLengthFilterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.tokit; +package org.dkpro.core.tokit; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.util.JCasUtil.select; @@ -49,7 +49,8 @@ public void testMin() JCas jcas = filter.newJCas(); - TokenBuilder<Token, Annotation> tb = new TokenBuilder<Token, Annotation>(Token.class, Annotation.class); + TokenBuilder<Token, Annotation> tb = new TokenBuilder<Token, Annotation>(Token.class, + Annotation.class); tb.buildTokens(jcas, content); filter.process(jcas); @@ -57,22 +58,23 @@ public void testMin() } @Test - public void testMax() - throws Exception - { - AnalysisEngine filter = createEngine( - AnnotationByLengthFilter.class, - AnnotationByLengthFilter.PARAM_FILTER_ANNOTATION_TYPES, new String[] {Token.class.getName()}, - AnnotationByLengthFilter.PARAM_MAX_LENGTH, 5); + public void testMax() + throws Exception + { + AnalysisEngine filter = createEngine( + AnnotationByLengthFilter.class, + AnnotationByLengthFilter.PARAM_FILTER_ANNOTATION_TYPES, Token.class, + AnnotationByLengthFilter.PARAM_MAX_LENGTH, 5); - JCas jcas = filter.newJCas(); + JCas jcas = filter.newJCas(); - TokenBuilder<Token, Annotation> tb = new TokenBuilder<Token, Annotation>(Token.class, Annotation.class); + TokenBuilder<Token, Annotation> tb = new TokenBuilder<Token, Annotation>(Token.class, + Annotation.class); tb.buildTokens(jcas, content); - filter.process(jcas); + filter.process(jcas); - assertEquals("1 22 333 4444 55555", StringUtils.join(toText(select(jcas, Token.class)), " ")); - } + assertEquals("1 22 333 4444 55555", StringUtils.join(toText(select(jcas, Token.class)), " ")); + } @Test public void testMinMax() @@ -80,13 +82,14 @@ public void testMinMax() { AnalysisEngine filter = createEngine( AnnotationByLengthFilter.class, - AnnotationByLengthFilter.PARAM_FILTER_ANNOTATION_TYPES, new String[] {Token.class.getName()}, + AnnotationByLengthFilter.PARAM_FILTER_ANNOTATION_TYPES, Token.class, AnnotationByLengthFilter.PARAM_MIN_LENGTH, 3, AnnotationByLengthFilter.PARAM_MAX_LENGTH, 5); JCas jcas = filter.newJCas(); - TokenBuilder<Token, Annotation> tb = new TokenBuilder<Token, Annotation>(Token.class, Annotation.class); + TokenBuilder<Token, Annotation> tb = new TokenBuilder<Token, Annotation>(Token.class, + Annotation.class); tb.buildTokens(jcas, content); filter.process(jcas); @@ -99,13 +102,15 @@ public void testMinMaxTokenStem() { AnalysisEngine filter = createEngine( AnnotationByLengthFilter.class, - AnnotationByLengthFilter.PARAM_FILTER_ANNOTATION_TYPES, new String[] {Token.class.getName(), Stem.class.getName()}, + AnnotationByLengthFilter.PARAM_FILTER_ANNOTATION_TYPES, new String[] { + Token.class.getName(), Stem.class.getName()}, AnnotationByLengthFilter.PARAM_MIN_LENGTH, 3, AnnotationByLengthFilter.PARAM_MAX_LENGTH, 5); JCas jcas = filter.newJCas(); - TokenBuilder<Token, Annotation> tb = new TokenBuilder<Token, Annotation>(Token.class, Annotation.class); + TokenBuilder<Token, Annotation> tb = new TokenBuilder<Token, Annotation>(Token.class, + Annotation.class); tb.buildTokens(jcas, content); for (Token token : JCasUtil.select(jcas, Token.class)) { diff --git a/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/BreakIteratorSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/BreakIteratorSegmenterTest.java new file mode 100644 index 0000000000..00bd526be1 --- /dev/null +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/BreakIteratorSegmenterTest.java @@ -0,0 +1,100 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import java.text.BreakIterator; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.harness.SegmenterHarness; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class BreakIteratorSegmenterTest +{ + @Ignore("Only needed to get the list of the supported languages for the @LanguageCapability") + @Test + public void listLocales() throws Exception + { + List<String> supportedLanguages = Arrays.stream(BreakIterator.getAvailableLocales()) + .map(l -> l.getLanguage()) + .distinct() + .sorted() + .filter(lang -> lang.length() == 2) + // These language codes do not comply with ISO 639 / OMTD-SHARE + // "in" (Indonesian, should be "id") + // "iw" (Hebrew, should be "he") + // "ji" (Yiddish, should be "yi") + // Cf.: https://bugs.java.com/view_bug.do?bug_id=6457127 + // Cf.: https://bugs.java.com/bugdatabase/view_bug.do?bug_id=4140555 + .filter(lang -> !asList("in", "iw", "ji").contains(lang)) + .collect(Collectors.toList()); + + System.out.printf("["); + for (String l : supportedLanguages) { + System.out.printf("\"%s\", ", l); + } + System.out.printf("]"); + } + + @Test + public void run() throws Throwable + { + AnalysisEngineDescription aed = createEngineDescription(BreakIteratorSegmenter.class); + + SegmenterHarness.run(aed, "de.1", "de.4", "en.1", "en.2", "en.3", "en.6", "en.7", "en.9", + "ar.1", "zh.1", "zh.2"); + } + + @Test + public void testJapanese() throws Exception + { + JCas jcas = JCasFactory.createText("滧の べ滦榥榜ぶ 廤ま楺獣お 䨣みゅ騪", "ja"); + + AnalysisEngine aed = createEngine(BreakIteratorSegmenter.class); + aed.process(jcas); + + String[] tokens = { "滧", "の", "べ", "滦榥榜", "ぶ", "廤", "ま", "楺獣", "お", "䨣", "みゅ", "騪" }; + + AssertAnnotations.assertToken(tokens, select(jcas, Token.class)); + } + + @Test + public void testZoning() throws Exception + { + SegmenterHarness.testZoning(BreakIteratorSegmenter.class); + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/CamelCaseSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/CamelCaseSegmenterTest.java new file mode 100644 index 0000000000..bbc6242428 --- /dev/null +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/CamelCaseSegmenterTest.java @@ -0,0 +1,129 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.junit.Assert.assertEquals; + +import java.util.Collection; +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class CamelCaseSegmenterTest +{ + @Test + public void testProcess() throws Exception + { + AnalysisEngine seg = createEngine(CamelCaseTokenSegmenter.class); + + // 0123456789012345678901 + String content = "ThisIsACamel CaseText"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 12).addToIndexes(); + new Token(cas, 13, 21).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("This", "Is", "A", "Camel", "Case", "Text"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } + + @Test + public void testProcess2() throws Exception + { + AnalysisEngine seg = createEngine(CamelCaseTokenSegmenter.class); + + // 01234567890123456789012 + String content = "GetFileUploadURLRequest"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 23).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("Get", "File", "Upload", "URL", "Request"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } + + @Test + public void testProcess3() throws Exception + { + AnalysisEngine seg = createEngine(CamelCaseTokenSegmenter.class); + + // 01234567890123 + String content = "_ORGANIZATION"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 1).addToIndexes(); + new Token(cas, 1, 13).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("_", "ORGANIZATION"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } + + @Test + public void testProcess4() throws Exception + { + // Verifying that the camel case token is marked up correctly when the optional markup type + // is specified + AnalysisEngine seg = createEngine( + CamelCaseTokenSegmenter.class, + CamelCaseTokenSegmenter.PARAM_MARKUP_TYPE, + Split.class // Just reusing the Split annotation type for this test + ); + + // 01234567890123456789012 + String content = "Try getFileUploadURLRequest Now"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 3).addToIndexes(); + new Token(cas, 4, 27).addToIndexes(); + new Token(cas, 28, 31).addToIndexes(); + + seg.process(cas); + + Collection<Split> markups = select(cas, Split.class); + assertEquals(1, markups.size()); + Split markup = markups.stream().findFirst().get(); + assertEquals(4, markup.getBegin()); + assertEquals(27, markup.getEnd()); + assertEquals("getFileUploadURLRequest", markup.getCoveredText()); + List<String> ref = asList("Try", "get", "File", "Upload", "URL", "Request", "Now"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } +} diff --git a/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/GermanSeparatedParticleAnnotatorTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/GermanSeparatedParticleAnnotatorTest.java new file mode 100644 index 0000000000..bac85591fd --- /dev/null +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/GermanSeparatedParticleAnnotatorTest.java @@ -0,0 +1,106 @@ +/* + * Copyright 2012 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class GermanSeparatedParticleAnnotatorTest +{ + @Test + public void testGermanSeparatedParticles() throws Exception + { + runTest("de", "Wir schlagen ein Treffen vor .", + new String[] { "wir", "schlagen", "eine", "Treffen", "vor", "." }, + new String[] { "PPER", "VVFIN", "ART", "NN", "PTKVZ", "$." }, + new String[] { "PPER", "VVFIN", "ART", "NN", "PTKVZ", "$." }, + new String[] { "wir", "vorschlagen", "eine", "Treffen", "vor", "." }); + + runTest("de", "Fangen wir jetzt an ?", + new String[] { "fangen", "wir", "jetzt", "an", "?" }, + new String[] { "VVFIN", "PPER", "ADV", "PTKVZ", "$." }, + new String[] { "VVFIN", "PPER", "ADV", "PTKVZ", "$." }, + new String[] { "anfangen", "wir", "jetzt", "an", "?" }); + } + + private void runTest(String language, String testDocument, String[] documentTreeTaggerLemmas, + String[] documentCPosTags, String[] documentPosTags, String[] lemmatizedDocument) + throws UIMAException + { + + AnalysisEngineDescription processor = createEngineDescription( + + createEngineDescription(GermanSeparatedParticleAnnotator.class)); + + AnalysisEngine engine = createEngine(processor); + JCas aJCas = engine.newJCas(); + aJCas.setDocumentLanguage(language); + + TokenBuilder<Token, Sentence> tb = new TokenBuilder<Token, Sentence>(Token.class, + Sentence.class); + tb.buildTokens(aJCas, testDocument); + + int offset = 0; + for (Token token : JCasUtil.select(aJCas, Token.class)) { + POS pos = new POS(aJCas, token.getBegin(), token.getEnd()); + pos.setPosValue(documentPosTags[offset]); + pos.setCoarseValue(documentCPosTags[offset]); + pos.addToIndexes(); + + token.setPos(pos); + + Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); + lemma.setValue(documentTreeTaggerLemmas[offset]); + lemma.addToIndexes(); + + token.setLemma(lemma); + + offset++; + } + engine.process(aJCas); + + AssertAnnotations.assertLemma(lemmatizedDocument, select(aJCas, Lemma.class)); + } + + @Rule + public TestName name = new TestName(); + + @Before + public void printSeparator() + { + System.out.println("\n=== " + name.getMethodName() + " ====================="); + } +} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/LineBasedSentenceSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/LineBasedSentenceSegmenterTest.java similarity index 95% rename from dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/LineBasedSentenceSegmenterTest.java rename to dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/LineBasedSentenceSegmenterTest.java index 85a777058d..7a81d57aa8 100644 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/LineBasedSentenceSegmenterTest.java +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/LineBasedSentenceSegmenterTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.tokit; +package org.dkpro.core.tokit; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; @@ -23,13 +23,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; public class LineBasedSentenceSegmenterTest { diff --git a/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/ParagraphSplitterTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/ParagraphSplitterTest.java new file mode 100644 index 0000000000..ffb24fe60e --- /dev/null +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/ParagraphSplitterTest.java @@ -0,0 +1,101 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; + +public class ParagraphSplitterTest +{ + @Test + public void paragraphSplitterTest_SingleLineBreaks() throws Exception + { + StringBuilder sb = new StringBuilder(); + sb.append("paragraph1"); + sb.append(System.getProperty("line.separator")); + sb.append(System.getProperty("line.separator")); + sb.append("paragraph2"); + sb.append(System.getProperty("line.separator")); + sb.append("paragraph3"); + + AnalysisEngine ae = createEngine(ParagraphSplitter.class, + ParagraphSplitter.PARAM_SPLIT_PATTERN, + ParagraphSplitter.SINGLE_LINE_BREAKS_PATTERN); + + JCas jcas = ae.newJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText(sb.toString()); + ae.process(jcas); + + int i = 0; + for (Paragraph paragraph : select(jcas, Paragraph.class)) { + if (i == 0) { + assertEquals("paragraph1", paragraph.getCoveredText()); + } + else if (i == 1) { + assertEquals("paragraph2", paragraph.getCoveredText()); + } + else if (i == 2) { + assertEquals("paragraph3", paragraph.getCoveredText()); + } + else { + fail("too many paragraphs"); + } + i++; + } + } + + @Test + public void paragraphSplitterTest_DoubleLineBreaks() throws Exception + { + StringBuilder sb = new StringBuilder(); + sb.append("paragraph1"); + sb.append(System.getProperty("line.separator")); + sb.append(System.getProperty("line.separator")); + sb.append("paragraph2"); + + AnalysisEngine ae = createEngine(ParagraphSplitter.class); + + JCas jcas = ae.newJCas(); + jcas.setDocumentLanguage("en"); + jcas.setDocumentText(sb.toString()); + ae.process(jcas); + + int i = 0; + for (Paragraph paragraph : select(jcas, Paragraph.class)) { + if (i == 0) { + assertEquals("paragraph1", paragraph.getCoveredText()); + } + else if (i == 1) { + assertEquals("paragraph2", paragraph.getCoveredText()); + } + else { + fail("too many paragraphs"); + } + i++; + } + } +} diff --git a/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/PatternBasedTokenSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/PatternBasedTokenSegmenterTest.java new file mode 100644 index 0000000000..979d325ac8 --- /dev/null +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/PatternBasedTokenSegmenterTest.java @@ -0,0 +1,162 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.junit.Assert.assertEquals; + +import java.util.List; + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class PatternBasedTokenSegmenterTest +{ + private static final String[] PATTERNS = new String[] { + PatternBasedTokenSegmenter.INCLUDE_PREFIX + "[0-9]+", + PatternBasedTokenSegmenter.EXCLUDE_PREFIX + "[\\/?!&%\"'#<>_=.:;]+" }; + + @Test + public void testProcess() throws Exception + { + AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, + PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); + + // 01234567890123456789012345 + String content = "This_Is_A_Camel Case_Text"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 15).addToIndexes(); + new Token(cas, 16, 25).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("This", "Is", "A", "Camel", "Case", "Text"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } + + @Test + public void testProcess2() throws Exception + { + AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, + PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); + + // 0123456789012345678901234567 + String content = "This_Is.A_Camel_ _Case_Text"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 16).addToIndexes(); + new Token(cas, 17, 27).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("This", "Is", "A", "Camel", "Case", "Text"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } + + @Test + public void testProcess3() throws Exception + { + AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, + PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); + + // 012345 + String content = "This_"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 5).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("This"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } + + @Test + public void testProcess4() throws Exception + { + AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, + PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); + + // 0123456789012345 + String content = "rent25to29point9"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 16).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("rent", "25", "to", "29", "point", "9"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } + + @Test + public void testProcess5() throws Exception + { + AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, + PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); + + // 012345 + String content = "_This"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 5).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("This"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } + + @Test + public void testProcess6() throws Exception + { + AnalysisEngine seg = createEngine(PatternBasedTokenSegmenter.class, + PatternBasedTokenSegmenter.PARAM_PATTERNS, PATTERNS); + + // 012345 + String content = "_This"; + JCas cas = seg.newJCas(); + cas.setDocumentText(content); + new Token(cas, 0, 1).addToIndexes(); + new Token(cas, 1, 5).addToIndexes(); + + seg.process(cas); + + List<String> ref = asList("This"); + List<String> tokens = toText(select(cas, Token.class)); + System.out.println(tokens); + assertEquals(ref, tokens); + } +} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/RegexSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/RegexSegmenterTest.java similarity index 96% rename from dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/RegexSegmenterTest.java rename to dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/RegexSegmenterTest.java index bfbad35ae2..692afc2d80 100644 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/RegexSegmenterTest.java +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/RegexSegmenterTest.java @@ -16,28 +16,28 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.tokit; +package org.dkpro.core.tokit; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.text.StringReader; import org.junit.Test; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; public class RegexSegmenterTest { @@ -66,7 +66,8 @@ public void simpleExample() // end::example[] assertToken( - new String[] { "This", "is", "sentence", "1", ".", "This", "is", "number", "2", "." }, + new String[] { "This", "is", "sentence", "1", ".", "This", "is", "number", "2", + "." }, select(jcas, Token.class)); assertSentence( new String[] { @@ -217,7 +218,8 @@ public void simpleExampleWithDivs() } assertToken( - new String[] { "This", "is", "sentence", "1", ".", "This", "is", "number", "2", "." }, + new String[] { "This", "is", "sentence", "1", ".", "This", "is", "number", "2", + "." }, select(jcas, Token.class)); assertSentence( new String[] { diff --git a/dkpro-core-stopwordremover-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordRemoverTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/StopWordRemoverTest.java similarity index 93% rename from dkpro-core-stopwordremover-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordRemoverTest.java rename to dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/StopWordRemoverTest.java index 3c61984333..430d8486e4 100644 --- a/dkpro-core-stopwordremover-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/stopwordremover/StopWordRemoverTest.java +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/StopWordRemoverTest.java @@ -15,21 +15,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.stopwordremover; +package org.dkpro.core.tokit; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.dkpro.core.tokit.StopWordRemover; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; /** * Test cases for StopwordRemover. diff --git a/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/TokenMergerTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/TokenMergerTest.java new file mode 100644 index 0000000000..4d12fa422d --- /dev/null +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/TokenMergerTest.java @@ -0,0 +1,253 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.tokit; + +import static java.util.Arrays.asList; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.toText; +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.apache.commons.jxpath.ClassFunctions; +import org.apache.commons.jxpath.DynamicPropertyHandler; +import org.apache.commons.jxpath.ExpressionContext; +import org.apache.commons.jxpath.JXPathContext; +import org.apache.commons.jxpath.JXPathIntrospector; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.tokit.TokenMerger.LemmaMode; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PRON; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PUNCT; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_VERB; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class TokenMergerTest +{ + @Test + public void testSimpleMerge() throws Exception + { + AnalysisEngine filter = createEngine(TokenMerger.class, + TokenMerger.PARAM_ANNOTATION_TYPE, NamedEntity.class); + + JCas jcas = initCas(); + filter.process(jcas); + + assertEquals(asList("I", "love", "New York", "."), + pick(select(jcas, Token.class), "cas:text()")); + } + + @Test + public void testWithConstraintMatch() throws Exception + { + AnalysisEngine filter = createEngine(TokenMerger.class, + TokenMerger.PARAM_ANNOTATION_TYPE, NamedEntity.class, + TokenMerger.PARAM_CONSTRAINT, ".[value = 'LOCATION']"); + + JCas jcas = initCas(); + filter.process(jcas); + + assertEquals(asList("I", "love", "New York", "."), toText(select(jcas, Token.class))); + } + + @Test + public void testWithConstraintNoMatch() throws Exception + { + AnalysisEngine filter = createEngine(TokenMerger.class, + TokenMerger.PARAM_ANNOTATION_TYPE, NamedEntity.class, + TokenMerger.PARAM_CONSTRAINT, ".[value = 'PERSON']"); + + JCas jcas = initCas(); + filter.process(jcas); + + assertEquals(asList("I", "love", "New", "York", "."), toText(select(jcas, Token.class))); + } + + @Test + public void testSimpleMergeLemmaJoin() throws Exception + { + AnalysisEngine filter = createEngine(TokenMerger.class, TokenMerger.PARAM_ANNOTATION_TYPE, + NamedEntity.class, TokenMerger.PARAM_LEMMA_MODE, LemmaMode.JOIN); + + JCas jcas = initCas(); + filter.process(jcas); + + assertEquals(asList("I", "love", "new york", "."), + pick(select(jcas, Token.class), "./lemma/value")); + } + + private JCas initCas() throws UIMAException + { + JCas jcas = JCasFactory.createJCas(); + + JCasBuilder builder = new JCasBuilder(jcas); + setLemmaPos(builder.add("I", Token.class), POS_PRON.class, "PRON", "I"); + builder.add(" "); + setLemmaPos(builder.add("love", Token.class), POS_VERB.class, "VERB", "love"); + builder.add(" "); + int m = setLemmaPos(builder.add("New", Token.class), POS_NOUN.class, "NOUN", "new") + .getBegin(); + builder.add(" "); + setLemmaPos(builder.add("York", Token.class), POS_NOUN.class, "NOUN", "york"); + NamedEntity city = builder.add(m, NamedEntity.class); + city.setValue("LOCATION"); + setLemmaPos(builder.add(".", Token.class), POS_PUNCT.class, "PUNCT", "."); + builder.close(); + + return builder.getJCas(); + } + + private Token setLemmaPos(Token aToken, Class<? extends POS> aPosType, String aPosValue, + String aLemma) + throws CASException + { + CAS cas = aToken.getCAS(); + + POS pos = (POS) cas.createAnnotation(CasUtil.getType(cas, aPosType), aToken.getBegin(), + aToken.getEnd()); + pos.setPosValue(aPosValue); + POSUtils.assignCoarseValue(pos); + aToken.setPos(pos); + + Lemma lemma = new Lemma(aToken.getCAS().getJCas(), aToken.getBegin(), aToken.getEnd()); + lemma.setValue(aLemma); + aToken.setLemma(lemma); + + return aToken; + } + + // ============================================================================================= + // == JXPath helper methods + // ============================================================================================= + + { + JXPathIntrospector.registerDynamicClass(FeatureStructure.class, + FeatureStructureHandler.class); + } + + public static class FeatureStructureHandler + implements DynamicPropertyHandler + { + @Override + public String[] getPropertyNames(Object aObject) + { + FeatureStructure fs = (FeatureStructure) aObject; + Type t = fs.getType(); + List<Feature> features = t.getFeatures(); + String[] featureNames = new String[features.size()]; + + int i = 0; + for (Feature f : features) { + featureNames[i] = f.getShortName(); + i++; + } + return featureNames; + } + + @Override + public Object getProperty(Object aObject, String aPropertyName) + { + FeatureStructure fs = (FeatureStructure) aObject; + Feature f = fs.getType().getFeatureByBaseName(aPropertyName); + if (CAS.TYPE_NAME_BOOLEAN.equals(f.getRange().getName())) { + return fs.getBooleanValue(f); + } + else if (CAS.TYPE_NAME_BYTE.equals(f.getRange().getName())) { + return fs.getByteValue(f); + } + else if (CAS.TYPE_NAME_DOUBLE.equals(f.getRange().getName())) { + return fs.getDoubleValue(f); + } + else if (CAS.TYPE_NAME_FLOAT.equals(f.getRange().getName())) { + return fs.getFloatValue(f); + } + else if (CAS.TYPE_NAME_INTEGER.equals(f.getRange().getName())) { + return fs.getIntValue(f); + } + else if (CAS.TYPE_NAME_LONG.equals(f.getRange().getName())) { + return fs.getLongValue(f); + } + else if (CAS.TYPE_NAME_SHORT.equals(f.getRange().getName())) { + return fs.getShortValue(f); + } + else if (CAS.TYPE_NAME_STRING.equals(f.getRange().getName())) { + return fs.getStringValue(f); + } + else { + return fs.getFeatureValue(f); + } + } + + @Override + public void setProperty(Object aObject, String aPropertyName, Object aValue) + { + throw new UnsupportedOperationException(); + } + } + + @SuppressWarnings("unchecked") + public static List<Object> pick(Collection<?> aContext, String aPath) + { + List<Object> result = new ArrayList<Object>(); + for (Object a : aContext) { + JXPathContext ctx = JXPathContext.newContext(a); + ctx.setFunctions(new ClassFunctions(JXPathCasFunctions.class, "cas")); + result.addAll(ctx.selectNodes(aPath)); + } + return result; + } + + public static class JXPathCasFunctions + { + public static String text(ExpressionContext aCtx) + { + Object value = aCtx.getContextNodePointer().getValue(); + if (value instanceof AnnotationFS) { + return ((AnnotationFS) value).getCoveredText(); + } + else { + return String.valueOf(value); + } + } + } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/WhitespaceSegmenterTest.java b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/WhitespaceSegmenterTest.java similarity index 96% rename from dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/WhitespaceSegmenterTest.java rename to dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/WhitespaceSegmenterTest.java index 354ff0e279..aa1649aac9 100644 --- a/dkpro-core-tokit-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/tokit/WhitespaceSegmenterTest.java +++ b/dkpro-core-tokit-asl/src/test/java/org/dkpro/core/tokit/WhitespaceSegmenterTest.java @@ -16,15 +16,15 @@ * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.tokit; +package org.dkpro.core.tokit; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertSentence; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertToken; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.dkpro.core.testing.AssertAnnotations.assertSentence; +import static org.dkpro.core.testing.AssertAnnotations.assertToken; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReaderDescription; @@ -32,11 +32,11 @@ import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.text.StringReader; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader; public class WhitespaceSegmenterTest { @@ -61,7 +61,8 @@ public void simpleExample() // end::example[] assertToken( - new String[] { "This", "is", "sentence", "1", ".", "This", "is", "number", "2", "." }, + new String[] { "This", "is", "sentence", "1", ".", "This", "is", "number", "2", + "." }, select(jcas, Token.class)); assertSentence( new String[] { diff --git a/dkpro-core-tokit-asl/src/test/resources/log4j.properties b/dkpro-core-tokit-asl/src/test/resources/log4j.properties deleted file mode 100644 index 43a1c1118f..0000000000 --- a/dkpro-core-tokit-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,8 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG -log4j.logger.de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceObjectProviderBase = INFO diff --git a/dkpro-core-tokit-asl/src/test/resources/log4j2.xml b/dkpro-core-tokit-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..31c71b9dc4 --- /dev/null +++ b/dkpro-core-tokit-asl/src/test/resources/log4j2.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Logger name="org.dkpro.core.api.resources.ResourceObjectProviderBase" level="INFO"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-stopwordremover-asl/src/test/resources/stopwords1.txt b/dkpro-core-tokit-asl/src/test/resources/stopwords1.txt similarity index 100% rename from dkpro-core-stopwordremover-asl/src/test/resources/stopwords1.txt rename to dkpro-core-tokit-asl/src/test/resources/stopwords1.txt diff --git a/dkpro-core-stopwordremover-asl/src/test/resources/stopwords2.txt b/dkpro-core-tokit-asl/src/test/resources/stopwords2.txt similarity index 100% rename from dkpro-core-stopwordremover-asl/src/test/resources/stopwords2.txt rename to dkpro-core-tokit-asl/src/test/resources/stopwords2.txt diff --git a/dkpro-core-treetagger-asl/pom.xml b/dkpro-core-treetagger-asl/pom.xml index 40d05af312..062579257a 100644 --- a/dkpro-core-treetagger-asl/pom.xml +++ b/dkpro-core-treetagger-asl/pom.xml @@ -18,14 +18,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <version>1.10.0-SNAPSHOT</version> + <artifactId>dkpro-core-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <artifactId>de.tudarmstadt.ukp.dkpro.core.treetagger-asl</artifactId> + <artifactId>dkpro-core-treetagger-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - TreeTagger (free for research)</name> + <url>https://dkpro.github.io/dkpro-core/</url> <dependencies> <dependency> <groupId>org.apache.uima</groupId> @@ -41,28 +42,32 @@ <version>1.2.1</version> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-metadata-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> </dependency> <dependency> <groupId>junit</groupId> @@ -85,13 +90,13 @@ <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.tokit-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-tokit-asl</artifactId> <scope>test</scope> </dependency> </dependencies> @@ -100,7 +105,7 @@ <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.treetagger-bin</artifactId> - <version>20160430.0</version> + <version>20190408.0</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> @@ -125,12 +130,12 @@ <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.treetagger-model-tagger-de-le</artifactId> - <version>20170316.1</version> + <version>20190409.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.treetagger-model-tagger-en-le</artifactId> - <version>20170220.1</version> + <version>20190304.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> @@ -150,12 +155,12 @@ <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.treetagger-model-tagger-fr-le</artifactId> - <version>20100111.1</version> + <version>20190404.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> <artifactId>de.tudarmstadt.ukp.dkpro.core.treetagger-model-tagger-gl-le</artifactId> - <version>20130516.1</version> + <version>20190413.1</version> </dependency> <dependency> <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> @@ -214,6 +219,23 @@ </dependency> </dependencies> </dependencyManagement> + <build> + <plugins> + <plugin> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-maven-plugin</artifactId> + <configuration> + <!-- + Since TreeTagger and its models are not redistributable, it deos not make any sense + to export them to OpenMinTeD. The binaries/models could not be used on the platform. + --> + <uimaDescriptorExcludes> + <exclude>**/*.xml</exclude> + </uimaDescriptorExcludes> + </configuration> + </plugin> + </plugins> + </build> <profiles> <profile> <id>use-proprietary-resources</id> diff --git a/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerChunker.java b/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerChunker.java deleted file mode 100644 index 5121b0f9e3..0000000000 --- a/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerChunker.java +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.treetagger; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; - -import org.annolab.tt4j.DefaultModel; -import org.annolab.tt4j.TokenAdapter; -import org.annolab.tt4j.TokenHandler; -import org.annolab.tt4j.TreeTaggerException; -import org.annolab.tt4j.TreeTaggerModelUtil; -import org.annolab.tt4j.TreeTaggerWrapper; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; -import de.tudarmstadt.ukp.dkpro.core.treetagger.internal.DKProExecutableResolver; - -/** - * Chunk annotator using TreeTagger. - */ -@ResourceMetaData(name="TreeTagger Chunker") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk" }) -public class TreeTaggerChunker - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Use this TreeTagger executable instead of trying to locate the executable automatically. - */ - public static final String PARAM_EXECUTABLE_PATH = "executablePath"; - @ConfigurationParameter(name = PARAM_EXECUTABLE_PATH, mandatory = false) - private File executablePath; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Location of the mapping file for chunk tags to UIMA types. - */ - public static final String PARAM_CHUNK_MAPPING_LOCATION = ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false) - protected String chunkMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - /** - * TT4J setting: Disable some sanity checks, e.g. whether tokens contain line breaks (which is - * not allowed). Turning this on will increase your performance, but the wrapper may throw - * exceptions if illegal data is provided. - */ - public static final String PARAM_PERFORMANCE_MODE = "performanceMode"; - @ConfigurationParameter(name = PARAM_PERFORMANCE_MODE, mandatory = true, defaultValue = "false") - private boolean performanceMode; - - /** - * A sequence to flush the internal TreeTagger buffer and to force it to output the rest of the - * completed analysis. This is typically just a sequence of like 5-10 full stops (".") separated - * by new line characters. However, some models may require a different flush sequence, e.g. a - * short sentence in the respective language. For chunker models, mind that the sentence must - * also be POS tagged, e.g. {@code Nous-PRO:PER\n...}. - */ - public static final String PARAM_FLUSH_SEQUENCE = "flushSequence"; - @ConfigurationParameter(name = PARAM_FLUSH_SEQUENCE, mandatory = false) - private String flushSequence; - - private CasConfigurableProviderBase<TreeTaggerWrapper<Token>> modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<TreeTaggerWrapper<Token>>() { - private TreeTaggerWrapper<Token> treetagger; - - { - setContextObject(TreeTaggerChunker.this); - - setDefault(ARTIFACT_ID, "${groupId}.treetagger-model-chunker-${language}-${variant}"); - setDefault(LOCATION, "classpath:/${package}/lib/chunker-${language}-${variant}.properties"); - //setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/treetagger/lib/chunker-default-variants.map"); - setDefault(VARIANT, "le"); // le = little-endian - - setOverride(LOCATION, modelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - - treetagger = new TreeTaggerWrapper<Token>(); - treetagger.setPerformanceMode(performanceMode); - treetagger.setEpsilon(0.00000001); - treetagger.setHyphenHeuristics(true); - DKProExecutableResolver executableProvider = new DKProExecutableResolver(treetagger); - executableProvider.setExecutablePath(executablePath); - treetagger.setExecutableProvider(executableProvider); - } - - @Override - protected TreeTaggerWrapper<Token> produceResource(URL aUrl) - throws IOException - { - Properties meta = getResourceMetaData(); - String encoding = meta.getProperty("encoding"); - String tagset = meta.getProperty("chunk.tagset"); - String flush = meta.getProperty("flushSequence", - DefaultModel.DEFAULT_FLUSH_SEQUENCE); - if (flushSequence != null) { - flush = flushSequence; - } - - File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); - - DefaultModel model = new DefaultModel(modelFile.getPath() + ":" + encoding, - modelFile, encoding, flush); - - // Reconfigure tagger - treetagger.setModel(model); - treetagger.setAdapter(new MappingTokenAdapter(meta)); - - // Get tagset - List<String> tags = TreeTaggerModelUtil.getTagset(modelFile, encoding); - SingletonTagset chunkTags = new SingletonTagset(Chunk.class, tagset); - for (String tag : tags) { - String fields1[] = tag.split("/"); - String fields2[] = fields1[1].split("-"); - String chunkTag = fields2.length == 2 ? fields2[1] : fields2[0]; - chunkTags.add(chunkTag); - } - addTagset(chunkTags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return treetagger; - } - }; - - mappingProvider = MappingProviderFactory.createChunkMappingProvider(chunkMappingLocation, - language, modelProvider); - } - - @Override - public void process(final JCas aJCas) - throws AnalysisEngineProcessException - { - final CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - mappingProvider.configure(cas); - - // Set the handler creating new UIMA annotations from the analyzed tokens - final TokenHandler<Token> handler = new TokenHandler<Token>() - { - private String openChunk; - private int start; - private int end; - - @Override - public void token(Token aToken, String aChunk, String aDummy) - { - synchronized (cas) { - if (aChunk == null) { - // End of processing signal - chunkComplete(); - return; - } - - String fields1[] = aChunk.split("/"); - String fields2[] = fields1[1].split("-"); - //String tag = fields1[0]; - String flag = fields2.length == 2 ? fields2[0] : "NONE"; - String chunk = fields2.length == 2 ? fields2[1] : fields2[0]; - - // Start of a new chunk - if (!chunk.equals(openChunk) || "B".equals(flag)) { - if (openChunk != null) { - // End of previous chunk - chunkComplete(); - } - - openChunk = chunk; - start = aToken.getBegin(); - } - - // Record how much of the chunk we have seen so far - end = aToken.getEnd(); - } - } - - private void chunkComplete() - { - if (openChunk != null) { - Type chunkType = mappingProvider.getTagType(openChunk); - Chunk chunk = (Chunk) cas.createAnnotation(chunkType, start, end); - chunk.setChunkValue(internTags ? openChunk.intern() : openChunk); - cas.addFsToIndexes(chunk); - openChunk = null; - } - } - }; - - try { - TreeTaggerWrapper<Token> treetagger = modelProvider.getResource(); - treetagger.setHandler(handler); - - // Issue #636 - process each sentence individually to ensure that sentence boundaries - // are respected - for (Sentence sentence : select(aJCas, Sentence.class)) { - List<Token> posTags = new ArrayList<Token>(selectCovered(Token.class, sentence)); - treetagger.process(posTags); - - // Commit the final chunk - handler.token(null, null, null); - } - } - catch (TreeTaggerException e) { - throw new AnalysisEngineProcessException(e); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - - private static class MappingTokenAdapter implements TokenAdapter<Token> - { - private Map<String, String> mapping; - - public MappingTokenAdapter(Properties aMetadata) - { - mapping = new HashMap<String, String>(); - - for (Entry<Object, Object> e : aMetadata.entrySet()) { - String key = String.valueOf(e.getKey()); - if (key.startsWith("pos.tag.map.")) { - String old = key.substring("pos.tag.map.".length()); - String rep = String.valueOf(e.getValue()); - mapping.put(old, rep); - } - } - } - - @Override - public String getText(Token aToken) - { - synchronized (aToken.getCAS()) { - String pos = mapping.get(aToken.getPosValue()); - if (pos == null) { - pos = aToken.getPosValue(); - } - - return aToken.getText() + "-" + pos; - } - } - } -} diff --git a/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerPosTagger.java b/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerPosTagger.java deleted file mode 100644 index 8ade782ec8..0000000000 --- a/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerPosTagger.java +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.treetagger; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.util.Level.INFO; - -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; -import java.util.concurrent.atomic.AtomicInteger; - -import org.annolab.tt4j.TokenAdapter; -import org.annolab.tt4j.TokenHandler; -import org.annolab.tt4j.TreeTaggerException; -import org.annolab.tt4j.TreeTaggerModelUtil; -import org.annolab.tt4j.TreeTaggerWrapper; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Type; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.treetagger.internal.DKProExecutableResolver; - -/** - * Part-of-Speech and lemmatizer annotator using TreeTagger. - */ -@ResourceMetaData(name="TreeTagger POS-Tagger") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) -public class TreeTaggerPosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * Use this TreeTagger executable instead of trying to locate the executable automatically. - */ - public static final String PARAM_EXECUTABLE_PATH = "executablePath"; - @ConfigurationParameter(name = PARAM_EXECUTABLE_PATH, mandatory = false) - private File executablePath; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * The character encoding used by the model. - */ - public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; - @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) - protected String modelEncoding; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - - /** - * Log the tag set(s) when a model is loaded. - * - * Default: {@code false} - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") - protected boolean printTagSet; - - /** - * TT4J setting: Disable some sanity checks, e.g. whether tokens contain line breaks (which is - * not allowed). Turning this on will increase your performance, but the wrapper may throw - * exceptions if illegal data is provided. - */ - public static final String PARAM_PERFORMANCE_MODE = "performanceMode"; - @ConfigurationParameter(name = PARAM_PERFORMANCE_MODE, mandatory = true, defaultValue = "false") - private boolean performanceMode; - - /** - * Write part-of-speech information. - * - * Default: {@code true} - */ - public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; - @ConfigurationParameter(name=PARAM_WRITE_POS, mandatory=true, defaultValue="true") - private boolean writePos; - - /** - * Write lemma information. - * - * Default: {@code true} - */ - public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; - @ConfigurationParameter(name=PARAM_WRITE_LEMMA, mandatory=true, defaultValue="true") - private boolean writeLemma; - - private CasConfigurableProviderBase<TreeTaggerWrapper<Token>> modelProvider; - private MappingProvider posMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase<TreeTaggerWrapper<Token>>() { - private TreeTaggerWrapper<Token> treetagger; - - { - setContextObject(TreeTaggerPosTagger.this); - - setDefault(ARTIFACT_ID, "${groupId}.treetagger-model-tagger-${language}-${variant}"); - setDefault(LOCATION, "classpath:/${package}/lib/tagger-${language}-${variant}.properties"); - //setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/treetagger/lib/tagger-default-variants.map"); - setDefault(VARIANT, "le"); // le = little-endian - - setOverride(LOCATION, modelLocation); - setOverride(LANGUAGE, language); - setOverride(VARIANT, variant); - - treetagger = new TreeTaggerWrapper<Token>(); - treetagger.setPerformanceMode(performanceMode); - DKProExecutableResolver executableProvider = new DKProExecutableResolver(treetagger); - executableProvider.setExecutablePath(executablePath); - treetagger.setExecutableProvider(executableProvider); - treetagger.setAdapter(new TokenAdapter<Token>() - { - @Override - public String getText(Token aObject) - { - synchronized (aObject.getCAS()) { - return aObject.getText(); - } - } - }); - } - - @Override - protected TreeTaggerWrapper<Token> produceResource(URL aUrl) - throws IOException - { - Properties meta = getResourceMetaData(); - String encoding = modelEncoding != null ? modelEncoding : meta - .getProperty("encoding"); - String tagset = meta.getProperty("pos.tagset"); - - File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); - - // Reconfigure tagger - treetagger.setModel(modelFile.getPath() + ":" + encoding); - - // Get tagset - List<String> tags = TreeTaggerModelUtil.getTagset(modelFile, encoding); - SingletonTagset posTags = new SingletonTagset(POS.class, tagset); - posTags.addAll(tags); - addTagset(posTags); - - if (printTagSet) { - getContext().getLogger().log(INFO, getTagset().toString()); - } - - return treetagger; - } - }; - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, - language, modelProvider); - } - - @Override - public void process(final JCas aJCas) - throws AnalysisEngineProcessException - { - final CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - posMappingProvider.configure(cas); - - TreeTaggerWrapper<Token> treetagger = modelProvider.getResource(); - - try { - List<Token> tokens = new ArrayList<Token>(select(aJCas, Token.class)); - final POS pos[] = new POS[tokens.size()]; - final Lemma lemma[] = new Lemma[tokens.size()]; - - // Set the handler creating new UIMA annotations from the analyzed - // tokens - final AtomicInteger count = new AtomicInteger(0); - treetagger.setHandler(new TokenHandler<Token>() { - @Override - public void token(Token aToken, String aPos, String aLemma) - { - synchronized (cas) { - // Add the Part of Speech - if (writePos && aPos != null) { - Type posTag = posMappingProvider.getTagType(aPos); - POS posAnno = (POS) cas.createAnnotation(posTag, aToken.getBegin(), - aToken.getEnd()); - posAnno.setPosValue(internTags ? aPos.intern() : aPos); - POSUtils.assignCoarseValue(posAnno); - aToken.setPos(posAnno); - pos[count.get()] = posAnno; - } - - // Add the lemma - if (writeLemma && aLemma != null) { - Lemma lemmaAnno = new Lemma(aJCas, aToken.getBegin(), aToken.getEnd()); - lemmaAnno.setValue(internTags ? aLemma.intern() : aLemma); - aToken.setLemma(lemmaAnno); - lemma[count.get()] = lemmaAnno; - } - - count.getAndIncrement(); - } - } - }); - - treetagger.process(tokens); - - // Add the annotations to the indexes - for (int i = 0; i < count.get(); i++) { - if (pos[i] != null) { - pos[i].addToIndexes(); - } - if (lemma[i] != null) { - lemma[i].addToIndexes(); - } - } - } - catch (TreeTaggerException e) { - throw new AnalysisEngineProcessException(e); - } - catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } -} diff --git a/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/package-info.java b/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/package-info.java deleted file mode 100644 index 5f3bdcda03..0000000000 --- a/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/package-info.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Integration of the <a href="http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html"> - * TreeTagger</a> part-of-speech tagger, lemmatizer and chunker via <a href="http://code.google.com/p/tt4j/"> - * TT4J</a>. - * - * @since 1.1.0 - */ -package de.tudarmstadt.ukp.dkpro.core.treetagger; diff --git a/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/TreeTaggerChunker.java b/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/TreeTaggerChunker.java new file mode 100644 index 0000000000..bf44a1e951 --- /dev/null +++ b/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/TreeTaggerChunker.java @@ -0,0 +1,365 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.treetagger; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.fit.util.JCasUtil.selectCovered; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createChunkMappingProvider; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Properties; + +import org.annolab.tt4j.DefaultModel; +import org.annolab.tt4j.TokenAdapter; +import org.annolab.tt4j.TokenHandler; +import org.annolab.tt4j.TreeTaggerException; +import org.annolab.tt4j.TreeTaggerModelUtil; +import org.annolab.tt4j.TreeTaggerWrapper; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.treetagger.internal.DKProExecutableResolver; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Chunk annotator using TreeTagger. + */ +@Component(OperationType.CHUNKER) +@ResourceMetaData(name = "TreeTagger Chunker") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk" }) +public class TreeTaggerChunker + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * Use this TreeTagger executable instead of trying to locate the executable automatically. + */ + public static final String PARAM_EXECUTABLE_PATH = "executablePath"; + @ConfigurationParameter(name = PARAM_EXECUTABLE_PATH, mandatory = false) + private File executablePath; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) + protected String modelEncoding; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Location of the mapping file for chunk tags to UIMA types. + */ + public static final String PARAM_CHUNK_MAPPING_LOCATION = + ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false) + protected String chunkMappingLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * TT4J setting: Disable some sanity checks, e.g. whether tokens contain line breaks (which is + * not allowed). Turning this on will increase your performance, but the wrapper may throw + * exceptions if illegal data is provided. + */ + public static final String PARAM_PERFORMANCE_MODE = "performanceMode"; + @ConfigurationParameter(name = PARAM_PERFORMANCE_MODE, mandatory = true, defaultValue = "false") + private boolean performanceMode; + + /** + * A sequence to flush the internal TreeTagger buffer and to force it to output the rest of the + * completed analysis. This is typically just a sequence of like 5-10 full stops (".") separated + * by new line characters. However, some models may require a different flush sequence, e.g. a + * short sentence in the respective language. For chunker models, mind that the sentence must + * also be POS tagged, e.g. {@code Nous-PRO:PER\n...}. + */ + public static final String PARAM_FLUSH_SEQUENCE = "flushSequence"; + @ConfigurationParameter(name = PARAM_FLUSH_SEQUENCE, mandatory = false) + private String flushSequence; + + private CasConfigurableProviderBase<TreeTaggerWrapper<Token>> modelProvider; + private MappingProvider mappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<TreeTaggerWrapper<Token>>() { + private TreeTaggerWrapper<Token> treetagger; + + { + setContextObject(TreeTaggerChunker.this); + + setDefault(ARTIFACT_ID, "${groupId}.treetagger-model-chunker-${language}-${variant}"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/chunker-${language}-${variant}.properties"); + setDefault(VARIANT, "le"); // le = little-endian + + setOverride(LOCATION, modelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + + treetagger = new TreeTaggerWrapper<Token>(); + treetagger.setPerformanceMode(performanceMode); + treetagger.setEpsilon(0.00000001); + treetagger.setHyphenHeuristics(true); + DKProExecutableResolver executableProvider = new DKProExecutableResolver( + treetagger); + executableProvider.setExecutablePath(executablePath); + treetagger.setExecutableProvider(executableProvider); + } + + @Override + protected TreeTaggerWrapper<Token> produceResource(URL aUrl) + throws IOException + { + Properties meta = getResourceMetaData(); + String encoding = modelEncoding != null ? modelEncoding : meta + .getProperty("encoding"); + String tagset = meta.getProperty("chunk.tagset"); + String flush = meta.getProperty("flushSequence", + DefaultModel.DEFAULT_FLUSH_SEQUENCE); + if (flushSequence != null) { + flush = flushSequence; + } + + File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); + + DefaultModel model = new DefaultModel(modelFile.getPath() + ":" + encoding, + modelFile, encoding, flush); + + // Reconfigure tagger + treetagger.setModel(model); + treetagger.setAdapter(new MappingTokenAdapter(meta)); + + // Get tagset + List<String> tags = TreeTaggerModelUtil.getTagset(modelFile, encoding); + SingletonTagset chunkTags = new SingletonTagset(Chunk.class, tagset); + for (String tag : tags) { + String[] fields1 = tag.split("/"); + String[] fields2 = fields1[1].split("-"); + String chunkTag = fields2.length == 2 ? fields2[1] : fields2[0]; + chunkTags.add(chunkTag); + } + addTagset(chunkTags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return treetagger; + } + }; + + mappingProvider = createChunkMappingProvider(this, chunkMappingLocation, language, + modelProvider); + } + + @Override + public void process(final JCas aJCas) + throws AnalysisEngineProcessException + { + final CAS cas = aJCas.getCas(); + + modelProvider.configure(cas); + mappingProvider.configure(cas); + + // Set the handler creating new UIMA annotations from the analyzed tokens + final TokenHandler<Token> handler = new TokenHandler<Token>() + { + private String openChunk; + private int start; + private int end; + + @Override + public void token(Token aToken, String aChunk, String aDummy) + { + synchronized (cas) { + if (aChunk == null) { + // End of processing signal + chunkComplete(); + return; + } + + String[] fields1 = aChunk.split("/"); + String[] fields2 = fields1[1].split("-"); + //String tag = fields1[0]; + String flag = fields2.length == 2 ? fields2[0] : "NONE"; + String chunk = fields2.length == 2 ? fields2[1] : fields2[0]; + + // Start of a new chunk + if (!chunk.equals(openChunk) || "B".equals(flag)) { + if (openChunk != null) { + // End of previous chunk + chunkComplete(); + } + + openChunk = chunk; + start = aToken.getBegin(); + } + + // Record how much of the chunk we have seen so far + end = aToken.getEnd(); + } + } + + private void chunkComplete() + { + if (openChunk != null) { + Type chunkType = mappingProvider.getTagType(openChunk); + Chunk chunk = (Chunk) cas.createAnnotation(chunkType, start, end); + chunk.setChunkValue(openChunk.intern()); + cas.addFsToIndexes(chunk); + openChunk = null; + } + } + }; + + try { + TreeTaggerWrapper<Token> treetagger = modelProvider.getResource(); + treetagger.setHandler(handler); + + // Issue #636 - process each sentence individually to ensure that sentence boundaries + // are respected + for (Sentence sentence : select(aJCas, Sentence.class)) { + List<Token> posTags = new ArrayList<Token>(selectCovered(Token.class, sentence)); + treetagger.process(posTags); + + // Commit the final chunk + handler.token(null, null, null); + } + } + catch (TreeTaggerException e) { + throw new AnalysisEngineProcessException(e); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + private static class MappingTokenAdapter implements TokenAdapter<Token> + { + private Map<String, String> mapping; + + public MappingTokenAdapter(Properties aMetadata) + { + mapping = new HashMap<String, String>(); + + for (Entry<Object, Object> e : aMetadata.entrySet()) { + String key = String.valueOf(e.getKey()); + if (key.startsWith("pos.tag.map.")) { + String old = key.substring("pos.tag.map.".length()); + String rep = String.valueOf(e.getValue()); + mapping.put(old, rep); + } + } + } + + @Override + public String getText(Token aToken) + { + synchronized (aToken.getCAS()) { + String pos = mapping.get(aToken.getPosValue()); + if (pos == null) { + pos = aToken.getPosValue(); + } + + return aToken.getText() + "-" + pos; + } + } + } +} diff --git a/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/TreeTaggerPosTagger.java b/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/TreeTaggerPosTagger.java new file mode 100644 index 0000000000..ebee6c3d1f --- /dev/null +++ b/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/TreeTaggerPosTagger.java @@ -0,0 +1,315 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.treetagger; + +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.apache.uima.util.Level.INFO; +import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.atomic.AtomicInteger; + +import org.annolab.tt4j.TokenAdapter; +import org.annolab.tt4j.TokenHandler; +import org.annolab.tt4j.TreeTaggerException; +import org.annolab.tt4j.TreeTaggerModelUtil; +import org.annolab.tt4j.TreeTaggerWrapper; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.metadata.SingletonTagset; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.CasConfigurableProviderBase; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.treetagger.internal.DKProExecutableResolver; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; + +/** + * Part-of-Speech and lemmatizer annotator using TreeTagger. + */ +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "TreeTagger POS-Tagger") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") +@TypeCapability( + inputs = { + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, + outputs = { + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) +public class TreeTaggerPosTagger + extends JCasAnnotator_ImplBase +{ + /** + * Use this language instead of the document language to resolve the model. + */ + public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; + @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) + protected String language; + + /** + * Override the default variant used to locate the model. + */ + public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; + @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) + protected String variant; + + /** + * Use this TreeTagger executable instead of trying to locate the executable automatically. + */ + public static final String PARAM_EXECUTABLE_PATH = "executablePath"; + @ConfigurationParameter(name = PARAM_EXECUTABLE_PATH, mandatory = false) + private File executablePath; + + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + + /** + * Load the model from this location instead of locating the model automatically. + */ + public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; + @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) + protected String modelLocation; + + /** + * The character encoding used by the model. + */ + public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; + @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false) + protected String modelEncoding; + + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + + /** + * Load the part-of-speech tag to UIMA type mapping from this location instead of locating + * the mapping automatically. + */ + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; + @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) + protected String posMappingLocation; + + /** + * Log the tag set(s) when a model is loaded. + */ + public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; + @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") + protected boolean printTagSet; + + /** + * TT4J setting: Disable some sanity checks, e.g. whether tokens contain line breaks (which is + * not allowed). Turning this on will increase your performance, but the wrapper may throw + * exceptions if illegal data is provided. + */ + public static final String PARAM_PERFORMANCE_MODE = "performanceMode"; + @ConfigurationParameter(name = PARAM_PERFORMANCE_MODE, mandatory = true, defaultValue = "false") + private boolean performanceMode; + + /** + * Write part-of-speech information. + */ + public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; + @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") + private boolean writePos; + + /** + * Write lemma information. + */ + public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; + @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") + private boolean writeLemma; + + private CasConfigurableProviderBase<TreeTaggerWrapper<Token>> modelProvider; + private MappingProvider posMappingProvider; + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException + { + super.initialize(aContext); + + modelProvider = new ModelProviderBase<TreeTaggerWrapper<Token>>() { + private TreeTaggerWrapper<Token> treetagger; + + { + setContextObject(TreeTaggerPosTagger.this); + + setDefault(ARTIFACT_ID, "${groupId}.treetagger-model-tagger-${language}-${variant}"); + setDefault(LOCATION, + "classpath:/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/tagger-${language}-${variant}.properties"); + setDefault(VARIANT, "le"); // le = little-endian + + setOverride(LOCATION, modelLocation); + setOverride(LANGUAGE, language); + setOverride(VARIANT, variant); + + treetagger = new TreeTaggerWrapper<Token>(); + treetagger.setPerformanceMode(performanceMode); + DKProExecutableResolver executableProvider = new DKProExecutableResolver( + treetagger); + executableProvider.setExecutablePath(executablePath); + treetagger.setExecutableProvider(executableProvider); + treetagger.setAdapter(new TokenAdapter<Token>() + { + @Override + public String getText(Token aObject) + { + synchronized (aObject.getCAS()) { + return aObject.getText(); + } + } + }); + } + + @Override + protected TreeTaggerWrapper<Token> produceResource(URL aUrl) + throws IOException + { + Properties meta = getResourceMetaData(); + String encoding = modelEncoding != null ? modelEncoding : meta + .getProperty("encoding"); + String tagset = meta.getProperty("pos.tagset"); + + File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); + + // Reconfigure tagger + treetagger.setModel(modelFile.getPath() + ":" + encoding); + + // Get tagset + List<String> tags = TreeTaggerModelUtil.getTagset(modelFile, encoding); + SingletonTagset posTags = new SingletonTagset(POS.class, tagset); + posTags.addAll(tags); + addTagset(posTags); + + if (printTagSet) { + getContext().getLogger().log(INFO, getTagset().toString()); + } + + return treetagger; + } + }; + + posMappingProvider = createPosMappingProvider(this, posMappingLocation, language, + modelProvider); + } + + @Override + public void process(final JCas aJCas) + throws AnalysisEngineProcessException + { + final CAS cas = aJCas.getCas(); + + modelProvider.configure(cas); + posMappingProvider.configure(cas); + + TreeTaggerWrapper<Token> treetagger = modelProvider.getResource(); + + try { + List<Token> tokens = new ArrayList<Token>(select(aJCas, Token.class)); + final POS[] pos = new POS[tokens.size()]; + final Lemma[] lemma = new Lemma[tokens.size()]; + + // Set the handler creating new UIMA annotations from the analyzed + // tokens + final AtomicInteger count = new AtomicInteger(0); + treetagger.setHandler(new TokenHandler<Token>() { + @Override + public void token(Token aToken, String aPos, String aLemma) + { + synchronized (cas) { + // Add the Part of Speech + if (writePos && aPos != null) { + Type posTag = posMappingProvider.getTagType(aPos); + POS posAnno = (POS) cas.createAnnotation(posTag, aToken.getBegin(), + aToken.getEnd()); + posAnno.setPosValue(aPos.intern()); + POSUtils.assignCoarseValue(posAnno); + aToken.setPos(posAnno); + pos[count.get()] = posAnno; + } + + // Add the lemma + if (writeLemma && aLemma != null) { + Lemma lemmaAnno = new Lemma(aJCas, aToken.getBegin(), aToken.getEnd()); + lemmaAnno.setValue(aLemma.intern()); + aToken.setLemma(lemmaAnno); + lemma[count.get()] = lemmaAnno; + } + + count.getAndIncrement(); + } + } + }); + + treetagger.process(tokens); + + // Add the annotations to the indexes + for (int i = 0; i < count.get(); i++) { + if (pos[i] != null) { + pos[i].addToIndexes(); + } + if (lemma[i] != null) { + lemma[i].addToIndexes(); + } + } + } + catch (TreeTaggerException e) { + throw new AnalysisEngineProcessException(e); + } + catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } +} diff --git a/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/internal/DKProExecutableResolver.java b/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/internal/DKProExecutableResolver.java similarity index 96% rename from dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/internal/DKProExecutableResolver.java rename to dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/internal/DKProExecutableResolver.java index e4232d0d84..9c8d021e43 100644 --- a/dkpro-core-treetagger-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/treetagger/internal/DKProExecutableResolver.java +++ b/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/internal/DKProExecutableResolver.java @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.treetagger.internal; +package org.dkpro.core.treetagger.internal; -import static de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils.getUrlAsExecutable; import static java.io.File.separator; import static org.annolab.tt4j.Util.getSearchPaths; +import static org.dkpro.core.api.resources.ResourceUtils.getUrlAsExecutable; import java.io.File; import java.io.IOException; diff --git a/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/package-info.java b/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/package-info.java new file mode 100644 index 0000000000..c550e59663 --- /dev/null +++ b/dkpro-core-treetagger-asl/src/main/java/org/dkpro/core/treetagger/package-info.java @@ -0,0 +1,26 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Integration of the <a href="http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html"> + * TreeTagger</a> part-of-speech tagger, lemmatizer and chunker via <a href="http://code.google.com/p/tt4j/"> + * TT4J</a>. + * + * @since 1.1.0 + */ +package org.dkpro.core.treetagger; diff --git a/dkpro-core-treetagger-asl/src/scripts/build.xml b/dkpro-core-treetagger-asl/src/scripts/build.xml index 99f2694c7e..b033256afe 100644 --- a/dkpro-core-treetagger-asl/src/scripts/build.xml +++ b/dkpro-core-treetagger-asl/src/scripts/build.xml @@ -16,1091 +16,1109 @@ limitations under the License. --> <project basedir="../.." default="separate-jars"> - <import> - <url url="https://raw.githubusercontent.com/dkpro/resource-packager/0.8.0/ant-macros.xml"/> - </import> + <import> + <url url="https://raw.githubusercontent.com/dkpro/resource-packager/0.8.0/ant-macros.xml"/> + </import> - <!-- - - Output package configuration - --> - <property name="outputPackage" value="de/tudarmstadt/ukp/dkpro/core/treetagger/lib"/> - <property name="outputDir" value="target/model-staging/de/tudarmstadt/ukp/dkpro/core/treetagger"/> - - <target name="local-maven"> - <property name="install-artifact-mode" value="local"/> - <antcall target="separate-jars"/> - </target> - - <target name="remote-maven"> - <property name="install-artifact-mode" value="remote"/> - <antcall target="separate-jars"/> - </target> - - <target name="newmodels"> + <!-- + - Output package configuration + --> + <property name="outputPackage" value="de/tudarmstadt/ukp/dkpro/core/treetagger/lib"/> + <property name="outputDir" value="target/model-staging/de/tudarmstadt/ukp/dkpro/core/treetagger"/> + + <target name="local-maven"> + <property name="install-artifact-mode" value="local"/> + <antcall target="separate-jars"/> + </target> + + <target name="remote-maven"> + <property name="install-artifact-mode" value="remote"/> + <antcall target="separate-jars"/> + </target> + + <target name="newmodels"> <property name="install-artifact-mode" value="remote"/> - <antcall target="en-tagger-le"/> - <antcall target="es-tagger-le"/> - </target> + <antcall target="en-tagger-le"/> + <antcall target="es-tagger-le"/> + </target> - <target name="separate-jars" depends="check-license,install-executables,bg,de,en,es,et,fi,fr,gl,it,la,mn,nl,pl,pt,ru,sk,sl,sw,zh,jar-notice"/> + <target name="separate-jars" depends="check-license,install-executables,bg,de,en,es,et,fi,fr,gl,it,la,mn,nl,pl,pt,ru,sk,sl,sw,zh,jar-notice"/> - <target name="bg"> - <antcall target="bg-tagger-le"/> - </target> + <target name="bg"> + <antcall target="bg-tagger-le"/> + </target> - <target name="bg-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: bulgarian-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2014-07-02 | now | ee595fb1643760a94591cee91b556911 - - 2016-04-30 | now | 4cae50b7d8adb8269f583052a69e9184 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/bulgarian-par-linux-3.2-utf8.bin.gz" - dest="target/download/bulgarian-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/bulgarian-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/bulgarian-par-linux-3.2-utf8.bin" - md5="4cae50b7d8adb8269f583052a69e9184" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20160430" - metaDataVersion="1" - tool="tagger" - language="bg" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="btb"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="de"> - <antcall target="de-tagger-le"/> - <antcall target="de-chunker-le"/> - </target> + <target name="bg-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: bulgarian-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2014-07-02 | now | ee595fb1643760a94591cee91b556911 + - 2016-04-30 | now | 4cae50b7d8adb8269f583052a69e9184 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/bulgarian.par.gz" + dest="target/download/bulgarian.par.gz" + skipexisting="true"/> + <gunzip src="target/download/bulgarian.par.gz"/> + <install-stub-and-upstream-file + file="target/download/bulgarian.par" + md5="4cae50b7d8adb8269f583052a69e9184" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20160430" + metaDataVersion="1" + tool="tagger" + language="bg" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="btb"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="de"> + <antcall target="de-tagger-le"/> + <antcall target="de-chunker-le"/> + </target> - <target name="de-tagger-le"> - <mkdir dir="target/download"/> - <!-- FILE: german-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - | 2012-02-13 | acbfeaafdcfcc07cbb1e9c396757934c - - 2012-02-13 | 2012-03-15 | 45afdcdabfffc69995523748cd42bf81 - - 2012-03-15 | 2012-12-07 | 3a18444accf080f54c5ebf655fedc781 - - 2012-12-07 | 2017-03-16 | 54f16f045083ae6ebe8b9d9d4a24484e - - 2017-03-16 | now | 36183505f5ef1059aac104fb1fe1534d - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/german-par-linux-3.2-utf8.bin.gz" - dest="target/download/german-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/german-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/german-par-linux-3.2-utf8.bin" - md5="36183505f5ef1059aac104fb1fe1534d" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20170316" - metaDataVersion="1" - tool="tagger" - language="de" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="stts"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="de-tagger-le"> + <mkdir dir="target/download"/> + <!-- FILE: german-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - | 2012-02-13 | acbfeaafdcfcc07cbb1e9c396757934c + - 2012-02-13 | 2012-03-15 | 45afdcdabfffc69995523748cd42bf81 + - 2012-03-15 | 2012-12-07 | 3a18444accf080f54c5ebf655fedc781 + - 2012-12-07 | 2017-03-16 | 54f16f045083ae6ebe8b9d9d4a24484e + - 2017-03-16 | 2018-11-02 | 36183505f5ef1059aac104fb1fe1534d + - 2018-11-02 | 2019-04-09 | 71bb01ee3b908c0b84a69fee7d34f829 + - 2019-04-09 | now | 535cd21cdc137c32c2a376875be2099d + - + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/german.par.gz" + dest="target/download/german.par.gz" + skipexisting="true"/> + <gunzip src="target/download/german.par.gz"/> + <install-stub-and-upstream-file + file="target/download/german.par" + md5="535cd21cdc137c32c2a376875be2099d" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20190409" + metaDataVersion="1" + tool="tagger" + language="de" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="stts"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="de-chunker-le"> - <!-- FILE: german-chunker-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - 2011-04-29 | now | 56ea6ac618bc21d28929a85a6964ff65 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/german-chunker-par-linux-3.2-utf8.bin.gz" - dest="target/download/german-chunker-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/german-chunker-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/german-chunker-par-linux-3.2-utf8.bin" - md5="56ea6ac618bc21d28929a85a6964ff65" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20110429" - metaDataVersion="1" - tool="chunker" - language="de" - variant="le" - extension="bin"> - <metadata> - <entry key="chunk.tagset" value="tt"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="de-chunker-le"> + <!-- FILE: german-chunker-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - + - 2011-04-29 | now | 56ea6ac618bc21d28929a85a6964ff65 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/german-chunker.par.gz" + dest="target/download/german-chunker.par.gz" + skipexisting="true"/> + <gunzip src="target/download/german-chunker.par.gz"/> + <install-stub-and-upstream-file + file="target/download/german-chunker.par" + md5="56ea6ac618bc21d28929a85a6964ff65" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20110429" + metaDataVersion="1" + tool="chunker" + language="de" + variant="le" + extension="bin"> + <metadata> + <entry key="chunk.tagset" value="tt"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="en"> - <antcall target="en-tagger-le"/> - <antcall target="en-chunker-iso8859-le"/> - <antcall target="en-chunker-le"/> - </target> + <target name="en"> + <antcall target="en-tagger-le"/> + <!-- <antcall target="en-chunker-iso8859-le"/> --> + <antcall target="en-chunker-le"/> + </target> - <target name="en-tagger-le"> - <mkdir dir="target/download"/> + <target name="en-tagger-le"> + <mkdir dir="target/download"/> - <!-- FILE: english-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2014-05-20 | 2015-02-09 | db60237f6f42faac4c89e511f33960ec - - 2015-02-09 | 2015-11-19 | 17d93abadfd8786b9a36dfdfd12f8d98 - - 2015-11-19 | 2016-12-21 | 441b51369b6888387a17a26a5dc2d146 - - 2016-12-21 | 2017-02-20 | ccf1bac5ff50d0d7bf3423aa82b2e6c1 - - 2017-02-20 | now | 9a70c2da699c4a57d8cc361039cd0cc7 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/english-par-linux-3.2-utf8.bin.gz" - dest="target/download/english-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/english-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/english-par-linux-3.2-utf8.bin" - md5="9a70c2da699c4a57d8cc361039cd0cc7" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20170220" - metaDataVersion="1" - tool="tagger" - language="en" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="ptb-tt"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <!-- FILE: english-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2014-05-20 | 2015-02-09 | db60237f6f42faac4c89e511f33960ec + - 2015-02-09 | 2015-11-19 | 17d93abadfd8786b9a36dfdfd12f8d98 + - 2015-11-19 | 2016-12-21 | 441b51369b6888387a17a26a5dc2d146 + - 2016-12-21 | 2017-02-20 | ccf1bac5ff50d0d7bf3423aa82b2e6c1 + - 2017-02-20 | 2019-03-04 | 9a70c2da699c4a57d8cc361039cd0cc7 + - 2019-03-04 | now | cb1be6672731a77d4c7974a8392eea0a + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/english.par.gz" + dest="target/download/english.par.gz" + skipexisting="true"/> + <gunzip src="target/download/english.par.gz"/> + <install-stub-and-upstream-file + file="target/download/english.par" + md5="cb1be6672731a77d4c7974a8392eea0a" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20190304" + metaDataVersion="1" + tool="tagger" + language="en" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="ptb-tt"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="en-chunker-iso8859-le"> - <!-- FILE: english-chunker-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - 2009-08-24 | now | 981df5901588a016ade69108d4421531 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/english-chunker-par-linux-3.2.bin.gz" - dest="target/download/english-chunker-par-linux-3.2.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/english-chunker-par-linux-3.2.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/english-chunker-par-linux-3.2.bin" - md5="981df5901588a016ade69108d4421531" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20090824" - metaDataVersion="1" - tool="chunker" - language="en" - variant="iso8859-le" - extension="bin"> - <metadata> - <entry key="chunk.tagset" value="tt"/> - <entry key="encoding" value="ISO-8859-1"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="en-chunker-le"> - <!-- FILE: english-chunker-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - 2014-05-20 | now | c179a5ec0b6ae057425ba3469611e907 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/english-chunker-par-linux-3.2-utf8.bin.gz" - dest="target/download/english-chunker-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/english-chunker-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/english-chunker-par-linux-3.2-utf8.bin" - md5="c179a5ec0b6ae057425ba3469611e907" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20140520" - metaDataVersion="1" - tool="chunker" - language="en" - variant="le" - extension="bin"> - <metadata> - <entry key="chunk.tagset" value="tt"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="en-chunker-iso8859-le"> + <!-- FILE: english-chunker-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - + - 2009-08-24 | now | 981df5901588a016ade69108d4421531 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/english-chunker-par-linux-3.2.bin.gz" + dest="target/download/english-chunker-par-linux-3.2.bin.gz" + skipexisting="true"/> + <gunzip src="target/download/english-chunker-par-linux-3.2.bin.gz"/> + <install-stub-and-upstream-file + file="target/download/english-chunker-par-linux-3.2.bin" + md5="981df5901588a016ade69108d4421531" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20090824" + metaDataVersion="1" + tool="chunker" + language="en" + variant="iso8859-le" + extension="bin"> + <metadata> + <entry key="chunk.tagset" value="tt"/> + <entry key="encoding" value="ISO-8859-1"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="en-chunker-le"> + <!-- FILE: english-chunker-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - + - 2014-05-20 | now | c179a5ec0b6ae057425ba3469611e907 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/english-chunker.par.gz" + dest="target/download/english-chunker.par.gz" + skipexisting="true"/> + <gunzip src="target/download/english-chunker.par.gz"/> + <install-stub-and-upstream-file + file="target/download/english-chunker.par" + md5="c179a5ec0b6ae057425ba3469611e907" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20140520" + metaDataVersion="1" + tool="chunker" + language="en" + variant="le" + extension="bin"> + <metadata> + <entry key="chunk.tagset" value="tt"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="es"> - <antcall target="es-tagger-le"/> - </target> + <target name="es"> + <antcall target="es-tagger-le"/> + </target> - <target name="es-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: spanish-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2011-11-07 | 2013-02-14 | 9770d507bd3cfb30385e1c2d4ce46f89 - - 2013-02-14 | 2013-04-03 | 33359d3deca5f9acab7f3b5b38529e17 - - 2013-04-03 | 2013-10-07 | 30a9c4deed380e7986a4a76753ecde51 - - 2013-10-07 | 2014-05-20 | 8488c4319fac414802637b654a61db37 - - 2014-05-20 | 2015-07-24 | 48f14061833274842d6cf5ed3db7b903 - - 2015-07-24 | 2016-12-22 | ea8142cb5a0efc59226a0f5d41d690ac + <target name="es-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: spanish-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2011-11-07 | 2013-02-14 | 9770d507bd3cfb30385e1c2d4ce46f89 + - 2013-02-14 | 2013-04-03 | 33359d3deca5f9acab7f3b5b38529e17 + - 2013-04-03 | 2013-10-07 | 30a9c4deed380e7986a4a76753ecde51 + - 2013-10-07 | 2014-05-20 | 8488c4319fac414802637b654a61db37 + - 2014-05-20 | 2015-07-24 | 48f14061833274842d6cf5ed3db7b903 + - 2015-07-24 | 2016-12-22 | ea8142cb5a0efc59226a0f5d41d690ac - 2016-12-22 | now | eb4b3ef771e4ba2e21aa715ca5a08d5f - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-par-linux-3.2-utf8.bin.gz" - dest="target/download/spanish-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/spanish-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/spanish-par-linux-3.2-utf8.bin" - md5="eb4b3ef771e4ba2e21aa715ca5a08d5f" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20161222" - metaDataVersion="1" - tool="tagger" - language="es" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="crater"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish.par.gz" + dest="target/download/spanish.par.gz" + skipexisting="true"/> + <gunzip src="target/download/spanish.par.gz"/> + <install-stub-and-upstream-file + file="target/download/spanish.par" + md5="eb4b3ef771e4ba2e21aa715ca5a08d5f" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20161222" + metaDataVersion="1" + tool="tagger" + language="es" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="crater"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="et"> - <antcall target="et-tagger-le"/> - </target> + <target name="et"> + <antcall target="et-tagger-le"/> + </target> - <target name="et-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: estonian-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - 2011-01-24 | now | 4a98b412e36afefbc204fca9e561f5cb - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/estonian-par-linux-3.2-utf8.bin.gz" - dest="target/download/estonian-par-linux-3.2.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/estonian-par-linux-3.2.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/estonian-par-linux-3.2.bin" - md5="4a98b412e36afefbc204fca9e561f5cb" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20110124" - metaDataVersion="1" - tool="tagger" - language="et" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="tartu"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="et-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: estonian-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - + - 2011-01-24 | now | 4a98b412e36afefbc204fca9e561f5cb + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/estonian.par.gz" + dest="target/download/estonian.par.gz" + skipexisting="true"/> + <gunzip src="target/download/estonian.par.gz"/> + <install-stub-and-upstream-file + file="target/download/estonian.par" + md5="4a98b412e36afefbc204fca9e561f5cb" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20110124" + metaDataVersion="1" + tool="tagger" + language="et" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="tartu"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="fi"> - <antcall target="fi-tagger-le"/> - </target> + <target name="fi"> + <antcall target="fi-tagger-le"/> + </target> - <target name="fi-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: finnish-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2013-07-16 | 2014-07-04 | 241dbbae52baad70787a952d3a0e71a3 - - 2014-07-04 | now | c0659e3dbbc5d74f15bb3174c0597155 - - - - Corpus: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/ - - Tagset: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/sources/FinnTreeBankManual.pdf - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/finnish-par-linux-3.2-utf8.bin.gz" - dest="target/download/finnish-par-linux-3.2.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/finnish-par-linux-3.2.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/finnish-par-linux-3.2.bin" - md5="c0659e3dbbc5d74f15bb3174c0597155" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20140704" - metaDataVersion="1" - tool="tagger" - language="fi" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="ftb-tt"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="fi-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: finnish-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2013-07-16 | 2014-07-04 | 241dbbae52baad70787a952d3a0e71a3 + - 2014-07-04 | now | c0659e3dbbc5d74f15bb3174c0597155 + - + - Corpus: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/ + - Tagset: http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/sources/FinnTreeBankManual.pdf + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/finnish.par.gz" + dest="target/download/finnish.par.gz" + skipexisting="true"/> + <gunzip src="target/download/finnish.par.gz"/> + <install-stub-and-upstream-file + file="target/download/finnish.par" + md5="c0659e3dbbc5d74f15bb3174c0597155" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20140704" + metaDataVersion="1" + tool="tagger" + language="fi" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="ftb-tt"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="fr"> - <antcall target="fr-tagger-le"/> - <antcall target="fr-chunker-le"/> - </target> + <target name="fr"> + <antcall target="fr-tagger-le"/> + <antcall target="fr-chunker-le"/> + </target> - <target name="fr-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: french-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2010-01-11 | now | 41d978f247b23d6fee733da32a55a775 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/french-par-linux-3.2-utf8.bin.gz" - dest="target/download/french-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/french-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/french-par-linux-3.2-utf8.bin" - md5="41d978f247b23d6fee733da32a55a775" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20100111" - metaDataVersion="1" - tool="tagger" - language="fr" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="stein"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="fr-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: french-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-01-11 | 2019-04-04 | 41d978f247b23d6fee733da32a55a775 + - 2019-04-04 | now | 5f11bd985160ed85b40fdde848f1b94f + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/french.par.gz" + dest="target/download/french.par.gz" + skipexisting="true"/> + <gunzip src="target/download/french.par.gz"/> + <install-stub-and-upstream-file + file="target/download/french.par" + md5="5f11bd985160ed85b40fdde848f1b94f" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20190404" + metaDataVersion="1" + tool="tagger" + language="fr" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="stein"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="fr-chunker-le"> - <mkdir dir="target/download"/> - - <!-- FILE: french-chunker-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2008-06-11 | 2014-12-18 | 975e3e306198c9f61f14428621ed87ba - - 2014-12-18 | now | 371fbe11e13d169120c9ce04c0d05206 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/french-chunker-par-linux-3.2-utf8.bin.gz" - dest="target/download/french-chunker-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/french-chunker-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/french-chunker-par-linux-3.2-utf8.bin" - md5="371fbe11e13d169120c9ce04c0d05206" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20141218" - metaDataVersion="2" - tool="chunker" - language="fr" - variant="le" - extension="bin"> - <metadata> - <entry key="chunk.tagset" value="ftb"/> - <entry key="encoding" value="UTF-8"/> - <entry key="flushSequence" value="Ce-PRO:DEM est-VER:pres la-DET:ART fin-NOM mon-DET:POS ami-NOM .-PONCT:S"/> - <entry key="pos.tag.map.SENT" value="PONCT:S"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="fr-chunker-le"> + <mkdir dir="target/download"/> + + <!-- FILE: french-chunker-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2008-06-11 | 2014-12-18 | 975e3e306198c9f61f14428621ed87ba + - 2014-12-18 | now | 371fbe11e13d169120c9ce04c0d05206 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/french-chunker.par.gz" + dest="target/download/french-chunker.par.gz" + skipexisting="true"/> + <gunzip src="target/download/french-chunker.par.gz"/> + <install-stub-and-upstream-file + file="target/download/french-chunker.par" + md5="371fbe11e13d169120c9ce04c0d05206" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20141218" + metaDataVersion="2" + tool="chunker" + language="fr" + variant="le" + extension="bin"> + <metadata> + <entry key="chunk.tagset" value="ftb"/> + <entry key="encoding" value="UTF-8"/> + <entry key="flushSequence" value="Ce-PRO:DEM est-VER:pres la-DET:ART fin-NOM mon-DET:POS ami-NOM .-PONCT:S"/> + <entry key="pos.tag.map.SENT" value="PONCT:S"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="gl"> - <antcall target="gl-tagger-le"/> - </target> + <target name="gl"> + <antcall target="gl-tagger-le"/> + </target> - <target name="gl-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: galician-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2013-05-16 | now | 85cc88ae110500aa6ba924c10c2dd8bb - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/galician-par-linux-3.2.bin.gz" - dest="target/download/galician-par-linux-3.2.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/galician-par-linux-3.2.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/galician-par-linux-3.2.bin" - md5="85cc88ae110500aa6ba924c10c2dd8bb" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20130516" - metaDataVersion="1" - tool="tagger" - language="gl" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="xiada"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="gl-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: galician-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2013-05-16 | 2019-04-13 | 85cc88ae110500aa6ba924c10c2dd8bb + - 2019-04-13 | now | 6b791aba6f7e1ef5d3e9689819cdc17b + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/galician.par.gz" + dest="target/download/galician.par.gz" + skipexisting="true"/> + <gunzip src="target/download/galician.par.gz"/> + <install-stub-and-upstream-file + file="target/download/galician.par" + md5="6b791aba6f7e1ef5d3e9689819cdc17b" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20190413" + metaDataVersion="1" + tool="tagger" + language="gl" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="xiada"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="gmh"> - <antcall target="gmh-tagger-le"/> - </target> + <target name="gmh"> + <antcall target="gmh-tagger-le"/> + </target> - <target name="gmh-tagger-le"> - <mkdir dir="target/download"/> - <!-- FILE: middle-high-german-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2016-11-07 | now | 7192c51a01a79b56294f5388f9dc38ba - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/middle-high-german-par-linux-3.2-utf8.bin.gz" - dest="target/download/middle-high-german-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/middle-high-german-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/middle-high-german-par-linux-3.2-utf8.bin" - md5="7192c51a01a79b56294f5388f9dc38ba" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20161107" - metaDataVersion="1" - tool="tagger" - language="gmh" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="universal"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="gmh-tagger-le"> + <mkdir dir="target/download"/> + <!-- FILE: middle-high-german-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2016-11-07 | now | 7192c51a01a79b56294f5388f9dc38ba + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/middle-high-german.par.gz" + dest="target/download/middle-high-german.par.gz" + skipexisting="true"/> + <gunzip src="target/download/middle-high-german.par.gz"/> + <install-stub-and-upstream-file + file="target/download/middle-high-german.par" + md5="7192c51a01a79b56294f5388f9dc38ba" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20161107" + metaDataVersion="1" + tool="tagger" + language="gmh" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="universal"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="it"> - <antcall target="it-tagger-le"/> - </target> + <target name="it"> + <antcall target="it-tagger-le"/> + </target> - <target name="it-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: italian-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2009-06-22 | 2014-10-20 | 79c03b4cc14c89d58aa1f97a133d01d7 - - 2014-10-20 | now | d3266e8b3d1860543403342fe0c95fc8 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/italian-par-linux-3.2-utf8.bin.gz" - dest="target/download/italian-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/italian-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/italian-par-linux-3.2-utf8.bin" - md5="d3266e8b3d1860543403342fe0c95fc8" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20141020" - metaDataVersion="1" - tool="tagger" - language="it" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="stein"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="it-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: italian-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2009-06-22 | 2014-10-20 | 79c03b4cc14c89d58aa1f97a133d01d7 + - 2014-10-20 | now | d3266e8b3d1860543403342fe0c95fc8 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/italian.par.gz" + dest="target/download/italian.par.gz" + skipexisting="true"/> + <gunzip src="target/download/italian.par.gz"/> + <install-stub-and-upstream-file + file="target/download/italian.par" + md5="d3266e8b3d1860543403342fe0c95fc8" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20141020" + metaDataVersion="1" + tool="tagger" + language="it" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="stein"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="la"> - <antcall target="la-tagger-le"/> - </target> + <target name="la"> + <antcall target="la-tagger-le"/> + </target> - <target name="la-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: latin-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2011-05-07 | 2011-08-19 | f959f8633ef842f069f0331ad19dc8b4 - - 2011-08-19 | now | bde1f6a63b2c5a658ba25a8eb90832a8 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/latin-par-linux-3.2.bin.gz" - dest="target/download/latin-par-linux-3.2.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/latin-par-linux-3.2.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/latin-par-linux-3.2.bin" - md5="bde1f6a63b2c5a658ba25a8eb90832a8" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20110819" - metaDataVersion="1" - tool="tagger" - language="la" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="brandolini"/> - <entry key="encoding" value="ISO-8859-1"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="la-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: latin-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2011-05-07 | 2011-08-19 | f959f8633ef842f069f0331ad19dc8b4 + - 2011-08-19 | now | bde1f6a63b2c5a658ba25a8eb90832a8 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/latin.par.gz" + dest="target/download/latin.par.gz" + skipexisting="true"/> + <gunzip src="target/download/latin.par.gz"/> + <install-stub-and-upstream-file + file="target/download/latin.par" + md5="bde1f6a63b2c5a658ba25a8eb90832a8" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20110819" + metaDataVersion="1" + tool="tagger" + language="la" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="brandolini"/> + <entry key="encoding" value="ISO-8859-1"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="nl"> - <antcall target="nl-tagger-le"/> - </target> + <target name="nl"> + <antcall target="nl-tagger-le"/> + </target> - <target name="nl-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: dutch-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2008-06-02 | ????-??-?? | 22dd271035795747768ead15b2931c75 (dutch-par-linux-3.1) - - ????-??-?? | 2013-01-07 | 7d533ea8e8b768f87d9c684e47a59872 (dutch-par-linux-3.1) - - 2013-01-07 | now | 40c82daae0c7063eb3eabf1541303fca - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/dutch-par-linux-3.2-utf8.bin.gz" - dest="target/download/dutch-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/dutch-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/dutch-par-linux-3.2-utf8.bin" - md5="40c82daae0c7063eb3eabf1541303fca" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20130107" - metaDataVersion="1" - tool="tagger" - language="nl" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="tt"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="mn"> - <antcall target="mn-tagger-le"/> - </target> + <target name="nl-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: dutch-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2008-06-02 | ????-??-?? | 22dd271035795747768ead15b2931c75 (dutch-par-linux-3.1) + - ????-??-?? | 2013-01-07 | 7d533ea8e8b768f87d9c684e47a59872 (dutch-par-linux-3.1) + - 2013-01-07 | now | 40c82daae0c7063eb3eabf1541303fca + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/dutch.par.gz" + dest="target/download/dutch.par.gz" + skipexisting="true"/> + <gunzip src="target/download/dutch.par.gz"/> + <install-stub-and-upstream-file + file="target/download/dutch.par" + md5="40c82daae0c7063eb3eabf1541303fca" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20130107" + metaDataVersion="1" + tool="tagger" + language="nl" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="tt"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="mn"> + <antcall target="mn-tagger-le"/> + </target> - <target name="mn-tagger-le"> - <mkdir dir="target/download"/> - - <!-- mongolian-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2012-09-25 | now | a3b1658180c96ef6412df0c5074ecb42 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/mongolian-par-linux-3.2.bin.gz" - dest="target/download/mongolian-par-linux-3.2.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/mongolian-par-linux-3.2.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/mongolian-par-linux-3.2.bin" - md5="a3b1658180c96ef6412df0c5074ecb42" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20120925" - metaDataVersion="1" - tool="tagger" - language="mn" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="tt"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="pl"> - <antcall target="pl-tagger-le"/> - </target> + <target name="mn-tagger-le"> + <mkdir dir="target/download"/> + + <!-- mongolian-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2012-09-25 | now | a3b1658180c96ef6412df0c5074ecb42 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/mongolian.par.gz" + dest="target/download/mongolian.par.gz" + skipexisting="true"/> + <gunzip src="target/download/mongolian.par.gz"/> + <install-stub-and-upstream-file + file="target/download/mongolian.par" + md5="a3b1658180c96ef6412df0c5074ecb42" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20120925" + metaDataVersion="1" + tool="tagger" + language="mn" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="tt"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="pl"> + <antcall target="pl-tagger-le"/> + </target> - <target name="pl-tagger-le"> - <mkdir dir="target/download"/> - - <!-- polish-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2014-06-02 | 2015-05-06 | e7ac63b234ed40f5e850bdac4c883ef1 + <target name="pl-tagger-le"> + <mkdir dir="target/download"/> + + <!-- polish-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2014-06-02 | 2015-05-06 | e7ac63b234ed40f5e850bdac4c883ef1 - 2015-05-06 | now | bec86e83c25ff19a7ce23e93f89168ee - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/polish-par-linux-3.2-utf8.bin.gz" - dest="target/download/polish-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/polish-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/polish-par-linux-3.2-utf8.bin" - md5="bec86e83c25ff19a7ce23e93f89168ee" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20150506" - metaDataVersion="1" - tool="tagger" - language="pl" - variant="le" - extension="bin"> - <metadata> - <!-- http://nkjp.pl/poliqarp/help/ense2.html --> - <entry key="pos.tagset" value="ncp"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="pt"> - <antcall target="pt-tagger-le"/> - </target> + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/polish.par.gz" + dest="target/download/polish.par.gz" + skipexisting="true"/> + <gunzip src="target/download/polish.par.gz"/> + <install-stub-and-upstream-file + file="target/download/polish.par" + md5="bec86e83c25ff19a7ce23e93f89168ee" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20150506" + metaDataVersion="1" + tool="tagger" + language="pl" + variant="le" + extension="bin"> + <metadata> + <!-- http://nkjp.pl/poliqarp/help/ense2.html --> + <entry key="pos.tagset" value="ncp"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="pt"> + <antcall target="pt-tagger-le"/> + </target> - <target name="pt-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: tree-taggerPT-GZ.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2010-11-15 | now | cf9855b4e215fa7a393700aa135830b0 - --> - <get - src="http://gramatica.usc.es/~gamallo/tagger/tree-taggerPT-GZ.tar.gz" - dest="target/download/tree-taggerPT-GZ.tar.gz" - skipexisting="true"/> - <gunzip src="target/download/tree-taggerPT-GZ.tar.gz"/> - <untar src="target/download/tree-taggerPT-GZ.tar" dest="target/download"> - <patternset> - <include name="**/pt.par"/> - </patternset> - <mapper type="flatten"/> - </untar> - <install-stub-and-upstream-file - file="target/download/pt.par" - md5="cf9855b4e215fa7a393700aa135830b0" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20101115" - metaDataVersion="2" - tool="tagger" - language="pt" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="gamallo"/> - <entry key="encoding" value="ISO-8859-1"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="ru"> - <antcall target="ru-tagger-le"/> - </target> + <target name="pt-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: tree-taggerPT-GZ.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-11-15 | now | cf9855b4e215fa7a393700aa135830b0 + --> + <get + src="http://gramatica.usc.es/~gamallo/tagger/tree-taggerPT-GZ.tar.gz" + dest="target/download/tree-taggerPT-GZ.tar.gz" + skipexisting="true"/> + <gunzip src="target/download/tree-taggerPT-GZ.tar.gz"/> + <untar src="target/download/tree-taggerPT-GZ.tar" dest="target/download"> + <patternset> + <include name="**/pt.par"/> + </patternset> + <mapper type="flatten"/> + </untar> + <install-stub-and-upstream-file + file="target/download/pt.par" + md5="cf9855b4e215fa7a393700aa135830b0" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20101115" + metaDataVersion="2" + tool="tagger" + language="pt" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="gamallo"/> + <entry key="encoding" value="ISO-8859-1"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="ru"> + <antcall target="ru-tagger-le"/> + </target> - <target name="ru-tagger-le"> - <mkdir dir="target/download"/> - - <!-- russian-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2014-05-05 | now | 7cc4bbaf7c2ee519e934dc47c34a366b - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/russian-par-linux-3.2-utf8.bin.gz" - dest="target/download/russian-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/russian-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/russian-par-linux-3.2-utf8.bin" - md5="7cc4bbaf7c2ee519e934dc47c34a366b" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20140505" - metaDataVersion="1" - tool="tagger" - language="ru" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="msd"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="sk"> - <antcall target="sk-tagger-le"/> - </target> + <target name="ru-tagger-le"> + <mkdir dir="target/download"/> + + <!-- russian-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2014-05-05 | now | 7cc4bbaf7c2ee519e934dc47c34a366b + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/russian.par.gz" + dest="target/download/russian.par.gz" + skipexisting="true"/> + <gunzip src="target/download/russian.par.gz"/> + <install-stub-and-upstream-file + file="target/download/russian.par" + md5="7cc4bbaf7c2ee519e934dc47c34a366b" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20140505" + metaDataVersion="1" + tool="tagger" + language="ru" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="msd"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="sk"> + <antcall target="sk-tagger-le"/> + </target> - <target name="sk-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: slovak-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2013-07-25 | now | b920e2b9e1fdd1173352d7df2bb7fd88 - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/slovak-par-linux-3.2-utf8.bin.gz" - dest="target/download/slovak-par-linux-3.2-utf8.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/slovak-par-linux-3.2-utf8.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/slovak-par-linux-3.2-utf8.bin" - md5="b920e2b9e1fdd1173352d7df2bb7fd88" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20130725" - metaDataVersion="1" - tool="tagger" - language="sk" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="smt-reduced"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="sl"> - <antcall target="sl-tagger-le"/> - </target> + <target name="sk-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: slovak-par-linux-3.2-utf8.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2013-07-25 | now | b920e2b9e1fdd1173352d7df2bb7fd88 + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/slovak.par.gz" + dest="target/download/slovak.par.gz" + skipexisting="true"/> + <gunzip src="target/download/slovak.par.gz"/> + <install-stub-and-upstream-file + file="target/download/slovak.par" + md5="b920e2b9e1fdd1173352d7df2bb7fd88" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20130725" + metaDataVersion="1" + tool="tagger" + language="sk" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="smt-reduced"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="sl"> + <antcall target="sl-tagger-le"/> + </target> - <target name="sl-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: slovenian.par.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2010-07-03 | now | d6b230483364be5ec0a3054a1b78df2d - - - - Excluded because we do not know the tagset - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/slovenian.par.gz" - dest="target/download/slovenian.par.gz" - skipexisting="true"/> - <gunzip src="target/download/slovenian.par.gz"/> - <install-stub-and-upstream-file - file="target/download/slovenian.par" - md5="d6b230483364be5ec0a3054a1b78df2d" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20100703" - metaDataVersion="1" - tool="tagger" - language="sl" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="unknown"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - --> - </target> - - <target name="sw"> - <antcall target="sw-tagger-le"/> - </target> + <target name="sl-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: slovenian.par.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-07-03 | now | d6b230483364be5ec0a3054a1b78df2d + - + - Excluded because we do not know the tagset + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/slovenian.par.gz" + dest="target/download/slovenian.par.gz" + skipexisting="true"/> + <gunzip src="target/download/slovenian.par.gz"/> + <install-stub-and-upstream-file + file="target/download/slovenian.par" + md5="d6b230483364be5ec0a3054a1b78df2d" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20100703" + metaDataVersion="1" + tool="tagger" + language="sl" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="unknown"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + --> + </target> + + <target name="sw"> + <antcall target="sw-tagger-le"/> + </target> - <target name="sw-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: swahili-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - - - 2010-03-17 | 2013-07-29 | 0df5f70b0cc81a3b437a46a9e0a3d71b - - 2013-07-29 | now | 002bebeeca52d1112aa5f31173c5ecec - --> - <get - src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/swahili-par-linux-3.2.bin.gz" - dest="target/download/swahili-par-linux-3.2.bin.gz" - skipexisting="true"/> - <gunzip src="target/download/swahili-par-linux-3.2.bin.gz"/> - <install-stub-and-upstream-file - file="target/download/swahili-par-linux-3.2.bin" - md5="002bebeeca52d1112aa5f31173c5ecec" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20130729" - metaDataVersion="1" - tool="tagger" - language="sw" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="swatwol"/> - <entry key="encoding" value="ISO-8859-1"/> - </metadata> - </install-stub-and-upstream-file> - </target> + <target name="sw-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: swahili-par-linux-3.2.bin.gz - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-03-17 | 2013-07-29 | 0df5f70b0cc81a3b437a46a9e0a3d71b + - 2013-07-29 | now | 002bebeeca52d1112aa5f31173c5ecec + --> + <get + src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/swahili.par.gz" + dest="target/download/swahili.par.gz" + skipexisting="true"/> + <gunzip src="target/download/swahili.par.gz"/> + <install-stub-and-upstream-file + file="target/download/swahili.par" + md5="002bebeeca52d1112aa5f31173c5ecec" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20130729" + metaDataVersion="1" + tool="tagger" + language="sw" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="swatwol"/> + <entry key="encoding" value="ISO-8859-1"/> + </metadata> + </install-stub-and-upstream-file> + </target> - <target name="zh"> - <antcall target="zh-tagger-le"/> - </target> + <target name="zh"> + <antcall target="zh-tagger-le"/> + </target> - <target name="zh-tagger-le"> - <mkdir dir="target/download"/> - - <!-- FILE: tt-lcmc.tgz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2010-11-15 | now | 7392059e61b5555ab984918e3dea9464 - --> - <get - src="http://corpus.leeds.ac.uk/tools/zh/tt-lcmc.tgz" - dest="target/download/tt-lcmc.tar.gz" - skipexisting="true"/> - <gunzip src="target/download/tt-lcmc.tar.gz"/> - <untar src="target/download/tt-lcmc.tar" dest="target/download"> - <patternset> - <include name="**/zh.par"/> - </patternset> - <mapper type="flatten"/> - </untar> - <install-stub-and-upstream-file - file="target/download/zh.par" - md5="7392059e61b5555ab984918e3dea9464" - groupId="de.tudarmstadt.ukp.dkpro.core" - artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" - upstreamVersion="20101115" - metaDataVersion="1" - tool="tagger" - language="zh" - variant="le" - extension="bin"> - <metadata> - <entry key="pos.tagset" value="lcmc"/> - <entry key="encoding" value="UTF-8"/> - </metadata> - </install-stub-and-upstream-file> - </target> - - <target name="install-executables"> - <property name="version.bin" value="20160430.0"/> - - <fail unless="outputDir">No output directory set.</fail> - <mkdir dir="${outputDir}/bin"/> - <get src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/Tagger-Licence" dest="${outputDir}/bin/LICENSE.txt"/> - - <!-- - - Helmut Schmid seems to have a habit of releasing new versions of TreeTagger without - - updating the version number of maintaining any kind of changes file. Thus we record - - here at which dates we notice that files change. We have also observed that treetagger - - produces different output on different platforms even when using the same model file - - and supposedly the same version of the package (e.g. 3.2). - --> - <!-- FILE: tree-tagger-MacOSX-3.2-intel.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - 2010-11-19 | 2012-04-25 | 076c8002337b89a9a8581aae81d5d481 - - 2012-04-25 | now | a47bac91b5f373c5ba8703faa5ec7dd7 : buffer flush after 10 lines with -prob , the file dropped on 2016-11-24 the intel but is the same - --> - <install-executable-file-tar url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-MacOSX-3.2.tar.gz" - platform="osx-x86_64" file="tree-tagger" md5="a47bac91b5f373c5ba8703faa5ec7dd7"/> - <install-executable-file-tar url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-MacOSX-3.2.tar.gz" - platform="osx-x86_32" file="tree-tagger" md5="a47bac91b5f373c5ba8703faa5ec7dd7"/> - - <!-- FILE: tree-tagger-MacOSX-3.2.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2010-11-19 | now | 63560dcb3a5932bc5ae0e9aab8f48e42 - <install-executable-file-tar url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-MacOSX-3.2.tar.gz" - platform="osx-ppc" file="tree-tagger" md5="63560dcb3a5932bc5ae0e9aab8f48e42"/> - --> - - <!-- FILE: tree-tagger-linux-3.2-64bit.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2012-08-03 | 2012-09-12 | 1bab69906368061d6e7c7466abdeb643 : build can now be 32 bit or 64 bit - - 2012-09-12 | 2013-02-28 | 4b6459574b6b334a090399b76dbc2cdb - - 2013-02-28 | 2013-05-07 | 3c8433d34cc2430197f8ed7b9e220b9c : only 64 bit build - - 2013-05-07 | 2013-11-18 | 9c015219f8a5c696f03ee43b72e913b0 - - 2013-11-18 | 2015-01-14 | cbc0833fd6cb8c1ec1ea4cae4d40b25a - - 2015-01-14 | 2016-04-30 | 2a0a3740cc9a838949219caf0d64544c - - 2016-04-30 | now | 16cf3578fafd8c39b5b9df0077297025 - --> - <install-executable-file-tar url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.tar.gz" - platform="linux-x86_64" file="tree-tagger" md5="16cf3578fafd8c39b5b9df0077297025"/> + <target name="zh-tagger-le"> + <mkdir dir="target/download"/> + + <!-- FILE: tt-lcmc.tgz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-11-15 | now | 7392059e61b5555ab984918e3dea9464 + --> + <get + src="http://corpus.leeds.ac.uk/tools/zh/tt-lcmc.tgz" + dest="target/download/tt-lcmc.tar.gz" + skipexisting="true"/> + <gunzip src="target/download/tt-lcmc.tar.gz"/> + <untar src="target/download/tt-lcmc.tar" dest="target/download"> + <patternset> + <include name="**/zh.par"/> + </patternset> + <mapper type="flatten"/> + </untar> + <install-stub-and-upstream-file + file="target/download/zh.par" + md5="7392059e61b5555ab984918e3dea9464" + groupId="de.tudarmstadt.ukp.dkpro.core" + artifactIdBase="de.tudarmstadt.ukp.dkpro.core.treetagger" + upstreamVersion="20101115" + metaDataVersion="1" + tool="tagger" + language="zh" + variant="le" + extension="bin"> + <metadata> + <entry key="pos.tagset" value="lcmc"/> + <entry key="encoding" value="UTF-8"/> + </metadata> + </install-stub-and-upstream-file> + </target> + + <target name="install-executables"> + <property name="version.bin" value="20190408.0"/> + + <fail unless="outputDir">No output directory set.</fail> + <mkdir dir="${outputDir}/bin"/> + <get src="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/Tagger-Licence" dest="${outputDir}/bin/LICENSE.txt"/> + + <!-- + - Helmut Schmid seems to have a habit of releasing new versions of TreeTagger without + - updating the version number of maintaining any kind of changes file. Thus we record + - here at which dates we notice that files change. We have also observed that treetagger + - produces different output on different platforms even when using the same model file + - and supposedly the same version of the package (e.g. 3.2). + --> + <!-- FILE: tree-tagger-MacOSX-3.2-intel.tar.gz - - - - - - - - - - - - - - - - - - - - - - - + - 2010-11-19 | 2012-04-25 | 076c8002337b89a9a8581aae81d5d481 + - 2012-04-25 | 2019-04-08 | a47bac91b5f373c5ba8703faa5ec7dd7 : buffer flush after 10 lines with -prob , the file dropped on 2016-11-24 the intel but is the same + - 2019-04-08 | now | 77f2f48149540d8569da6963b12c67b4 + --> + <install-executable-file-tar url="https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-MacOSX-3.2.2.tar.gz" + platform="osx-x86_64" file="tree-tagger" md5="77f2f48149540d8569da6963b12c67b4"/> - <!-- FILE: tree-tagger-linux-3.2.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - xxxx-xx-xx | 2010-11-19 | 293ecb477a9f3993206674d68e81b792 (contains dynamically linked binary) - - 2010-11-19 | 2012-02-06 | 3f1d752b0f8c5ad2667346caafaaa5e7 (statically linked binary again) - - 2012-02-06 | 2012-04-18 | 6599b70197be926a26a4966e34f1cad5 (ELF 32-bit LSB executable, Intel 80386, version 1 (GNU/Linux), statically linked, for GNU/Linux 2.6.32, stripped) - - 2012-04-18 | 2012-08-03 | 45bfecd9daac30a4c05c40d03ed1f6f9 : buffer flush after 10 lines with -prob - - 2012-08-03 | 2013-02-02 | 33c2b37803114ec68f36733b1f8b1702 : build can now be 32 bit or 64 bit - - 2013-02-28 | now | 32-bit version is no longer available! - <install-executable-file-tar url="ftp://ftp.ims.uni-stuttgart.de/pub/corpora/tree-tagger-linux-3.2.tar.gz" - platform="linux-x86_32" file="tree-tagger" md5="33c2b37803114ec68f36733b1f8b1702"/> - --> - - <!-- FILE: tree-tagger-3.2.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2010-11-19 | 2012-02-06 | 4a58ee2a4177cc4e9bfaead8cec6430d - - 2012-02-06 | now | bcebb9b9745e2ac12c5226a032b8bfb4 (ELF 32-bit MSB executable, SPARC, version 1 (SYSV), dynamically linked (uses shared libs), stripped) - <install-executable-file-tar url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-3.2.tar.gz" - platform="solaris-sparc" file="tree-tagger" md5="bcebb9b9745e2ac12c5226a032b8bfb4"/> - --> - - <!-- FILE: tree-tagger-windows-3.2.zip - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 2010-11-19 | 2012-04-25 | 72a5489fe197b0173ea476e2224a6ad6 - - 2012-04-25 | 2015-03-25 | c88d7fe1aa63bebaccfa019c222f54ea : buffer flush after 10 lines with -prob - - 2015-03-25 | now | e78a1b710d738f9e29d4756ccac4fc3a - --> - <install-executable-file-zip url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-windows-3.2.zip" - platform="windows-x86_32" file="tree-tagger.exe" md5="e78a1b710d738f9e29d4756ccac4fc3a"/> - <install-executable-file-zip url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-windows-3.2.zip" - platform="windows-x86_64" file="tree-tagger.exe" md5="e78a1b710d738f9e29d4756ccac4fc3a"/> - - <clean-pom/> - <generate-pom groupId="de.tudarmstadt.ukp.dkpro.core" - artifactId="de.tudarmstadt.ukp.dkpro.core.treetagger-bin" version="${version.bin}"/> - <jar destfile="target/de.tudarmstadt.ukp.dkpro.core.treetagger-bin-${version.bin}.jar" compress="true"> - <fileset dir="target/model-staging"> - <include name="META-INF/**/*"/> - <include name="**/bin/**/*"/> - </fileset> - </jar> - - <install-artifact file="target/de.tudarmstadt.ukp.dkpro.core.treetagger-bin-${version.bin}.jar" - groupId="de.tudarmstadt.ukp.dkpro.core" artifactId="de.tudarmstadt.ukp.dkpro.core.treetagger-bin" - version="${version.bin}"/> - - <clean-pom/> - </target> - - <target name="check-license"> - <!-- CHECK LICENSE AGREEMENT --> - - <echo>Before I can proceed downloading TreeTagger, you have to read and agree to the</echo> - <echo>TreeTagger license agreement. I am trying to download the latest version of the</echo> - <echo>agreement from the TreeTagger homepage.</echo> - <echo/> - <tempfile property="treetagger.license.tempfile" deleteonexit="true"/> - <get src="http://www.ims.uni-stuttgart.de/~schmid/Tagger-Licence" dest="${treetagger.license.tempfile}"/> - <loadfile property="treetagger.license.text" srcfile="${treetagger.license.tempfile}"/> - <echo>${treetagger.license.text}</echo> - <input message="Do you agree to respect the TreeTagger license agreement? (y/n)" - validargs="y,n" - addProperty="treetagger.license.ok"/> - <condition property="do.abort.on.treetagger.license"> - <equals arg1="n" arg2="${treetagger.license.ok}"/> - </condition> - <fail if="do.abort.on.treetagger.license">License agreement not accepted. ABORTING.</fail> - </target> + <!-- FILE: tree-tagger-MacOSX-3.2.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-11-19 | 2012-04-25 | 076c8002337b89a9a8581aae81d5d481 + - 2012-04-25 | now | a47bac91b5f373c5ba8703faa5ec7dd7 : buffer flush after 10 lines with -prob , the file dropped on 2016-11-24 the intel but is the same + <install-executable-file-tar url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-MacOSX-3.2.tar.gz" + platform="osx-x86_32" file="tree-tagger" md5="a47bac91b5f373c5ba8703faa5ec7dd7"/> + --> - <target name="jar-notice"> - <echo>================================</echo> - <echo>== IMPORTANT LICENSE REMINDER ==</echo> - <echo>================================</echo> - <echo> </echo> - <echo>JARs containing the TreeTagger binaries and models have been generated.</echo> - <echo> </echo> - <echo>YOU MAY NOT REDISTRIBUTE THESE JARS WITHOUT THE EXPLICIT PERMISSION</echo> - <echo> </echo> - <echo>from HELMUT SCHMID at the Institute for Computational Linguistics of the</echo> - <echo>University of Stuttgart and/or by the ORIGINAL CREATORS OF THE MODEL FILES.</echo> - <echo> </echo> - <echo>For Maven users:</echo> - <echo> </echo> - <echo>Use the build target "local-maven" (ant local-maven) to automatically install</echo> - <echo>the jars into your local Maven repository at ~/.m2/repository.</echo> - <echo> </echo> - <echo>AGAIN REMEMBER THAT YOU MAY NOT REDISTRIBUTE THESE JARS - Thanks.</echo> - </target> + <!-- FILE: tree-tagger-MacOSX-3.2.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-11-19 | now | 63560dcb3a5932bc5ae0e9aab8f48e42 + <install-executable-file-tar url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-MacOSX-3.2.tar.gz" + platform="osx-ppc" file="tree-tagger" md5="63560dcb3a5932bc5ae0e9aab8f48e42"/> + --> + + <!-- FILE: tree-tagger-linux-3.2-64bit.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2012-08-03 | 2012-09-12 | 1bab69906368061d6e7c7466abdeb643 : build can now be 32 bit or 64 bit + - 2012-09-12 | 2013-02-28 | 4b6459574b6b334a090399b76dbc2cdb + - 2013-02-28 | 2013-05-07 | 3c8433d34cc2430197f8ed7b9e220b9c : only 64 bit build + - 2013-05-07 | 2013-11-18 | 9c015219f8a5c696f03ee43b72e913b0 + - 2013-11-18 | 2015-01-14 | cbc0833fd6cb8c1ec1ea4cae4d40b25a + - 2015-01-14 | 2016-04-30 | 2a0a3740cc9a838949219caf0d64544c + - 2016-04-30 | 2019-04-03 | 16cf3578fafd8c39b5b9df0077297025 + - 2019-04-03 | now | 3fc90c3908574890778af08fde90e065 + --> + <install-executable-file-tar url="https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.2.tar.gz" + platform="linux-x86_64" file="tree-tagger" md5="3fc90c3908574890778af08fde90e065"/> - <macrodef name="install-executable-file-tar"> - <attribute name="url"/> - <attribute name="platform"/> - <attribute name="file"/> - <attribute name="md5"/> - <attribute name="prop.verify.md5" default="verify.md5.@{platform}.@{file}"/> - <attribute name="prop.verify.md5.actual" default="verify.md5.actual.@{platform}.@{file}"/> - <sequential> - <fail unless="outputDir">No output directory set.</fail> - <mkdir dir="${outputDir}/bin/@{platform}"/> - <copy todir="${outputDir}/bin/@{platform}"> - <gzipresource> - <url url="@{url}"/> - </gzipresource> - <chainedmapper> - <mapper type="flatten"/> - <firstmatchmapper> - <globmapper from="*" to="tree-tagger-@{platform}.tar"/> - </firstmatchmapper> - </chainedmapper> - </copy> - <untar src="${outputDir}/bin/@{platform}/tree-tagger-@{platform}.tar" - dest="${outputDir}/bin/@{platform}"> - <patternset> - <include name="**/@{file}"/> - </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> - </untar> - <delete file="${outputDir}/bin/@{platform}/tree-tagger-@{platform}.tar"/> - - <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{md5}" verifyproperty="@{prop.verify.md5}"/> - <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{prop.verify.md5.actual}"/> - <condition property="checksum.mismatch"> - <equals arg1="false" arg2="${@{prop.verify.md5}}"/> - </condition> - <fail if="checksum.mismatch"> + <!-- FILE: tree-tagger-linux-3.2.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - + - xxxx-xx-xx | 2010-11-19 | 293ecb477a9f3993206674d68e81b792 (contains dynamically linked binary) + - 2010-11-19 | 2012-02-06 | 3f1d752b0f8c5ad2667346caafaaa5e7 (statically linked binary again) + - 2012-02-06 | 2012-04-18 | 6599b70197be926a26a4966e34f1cad5 (ELF 32-bit LSB executable, Intel 80386, version 1 (GNU/Linux), statically linked, for GNU/Linux 2.6.32, stripped) + - 2012-04-18 | 2012-08-03 | 45bfecd9daac30a4c05c40d03ed1f6f9 : buffer flush after 10 lines with -prob + - 2012-08-03 | 2013-02-02 | 33c2b37803114ec68f36733b1f8b1702 : build can now be 32 bit or 64 bit + - 2013-02-28 | now | 32-bit version is no longer available! + <install-executable-file-tar url="ftp://ftp.ims.uni-stuttgart.de/pub/corpora/tree-tagger-linux-3.2.tar.gz" + platform="linux-x86_32" file="tree-tagger" md5="33c2b37803114ec68f36733b1f8b1702"/> + --> + + <!-- FILE: tree-tagger-3.2.tar.gz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-11-19 | 2012-02-06 | 4a58ee2a4177cc4e9bfaead8cec6430d + - 2012-02-06 | now | bcebb9b9745e2ac12c5226a032b8bfb4 (ELF 32-bit MSB executable, SPARC, version 1 (SYSV), dynamically linked (uses shared libs), stripped) + <install-executable-file-tar url="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-3.2.tar.gz" + platform="solaris-sparc" file="tree-tagger" md5="bcebb9b9745e2ac12c5226a032b8bfb4"/> + --> + + <!-- FILE: tree-tagger-windows32-3.2.zip - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2010-11-19 | 2012-04-25 | 72a5489fe197b0173ea476e2224a6ad6 + - 2012-04-25 | 2015-03-25 | c88d7fe1aa63bebaccfa019c222f54ea : buffer flush after 10 lines with -prob + - 2015-03-25 | 2019-04-08 | e78a1b710d738f9e29d4756ccac4fc3a + - 2019-04-08 | now | 3fc521855221e3dfc82be95f2d324aea + --> + <install-executable-file-zip url="https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-windows32-3.2.2.zip" + platform="windows-x86_32" file="tree-tagger.exe" md5="3fc521855221e3dfc82be95f2d324aea"/> + + <!-- FILE: tree-tagger-windows-3.2.zip - - - - - - - - - - - - - - - - - - - - - - - - - - - + - 2019-04-08 | now | 6bf25051fa133b42c222b6f02ecab9d0 + --> + <install-executable-file-zip url="https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-windows-3.2.2.zip" + platform="windows-x86_64" file="tree-tagger.exe" md5="6bf25051fa133b42c222b6f02ecab9d0"/> + + <clean-pom/> + <generate-pom groupId="de.tudarmstadt.ukp.dkpro.core" + artifactId="de.tudarmstadt.ukp.dkpro.core.treetagger-bin" version="${version.bin}"/> + <jar destfile="target/de.tudarmstadt.ukp.dkpro.core.treetagger-bin-${version.bin}.jar" compress="true"> + <fileset dir="target/model-staging"> + <include name="META-INF/**/*"/> + <include name="**/bin/**/*"/> + </fileset> + </jar> + + <install-artifact file="target/de.tudarmstadt.ukp.dkpro.core.treetagger-bin-${version.bin}.jar" + groupId="de.tudarmstadt.ukp.dkpro.core" artifactId="de.tudarmstadt.ukp.dkpro.core.treetagger-bin" + version="${version.bin}"/> + + <clean-pom/> + </target> + + <target name="check-license"> + <!-- CHECK LICENSE AGREEMENT --> + + <echo>Before I can proceed downloading TreeTagger, you have to read and agree to the</echo> + <echo>TreeTagger license agreement. I am trying to download the latest version of the</echo> + <echo>agreement from the TreeTagger homepage.</echo> + <echo/> + <tempfile property="treetagger.license.tempfile" deleteonexit="true"/> + <get src="http://www.ims.uni-stuttgart.de/~schmid/Tagger-Licence" dest="${treetagger.license.tempfile}"/> + <loadfile property="treetagger.license.text" srcfile="${treetagger.license.tempfile}"/> + <echo>${treetagger.license.text}</echo> + <input message="Do you agree to respect the TreeTagger license agreement? (y/n)" + validargs="y,n" + addProperty="treetagger.license.ok"/> + <condition property="do.abort.on.treetagger.license"> + <equals arg1="n" arg2="${treetagger.license.ok}"/> + </condition> + <fail if="do.abort.on.treetagger.license">License agreement not accepted. ABORTING.</fail> + </target> + + <target name="jar-notice"> + <echo>================================</echo> + <echo>== IMPORTANT LICENSE REMINDER ==</echo> + <echo>================================</echo> + <echo> </echo> + <echo>JARs containing the TreeTagger binaries and models have been generated.</echo> + <echo> </echo> + <echo>YOU MAY NOT REDISTRIBUTE THESE JARS WITHOUT THE EXPLICIT PERMISSION</echo> + <echo> </echo> + <echo>from HELMUT SCHMID at the Institute for Computational Linguistics of the</echo> + <echo>University of Stuttgart and/or by the ORIGINAL CREATORS OF THE MODEL FILES.</echo> + <echo> </echo> + <echo>For Maven users:</echo> + <echo> </echo> + <echo>Use the build target "local-maven" (ant local-maven) to automatically install</echo> + <echo>the jars into your local Maven repository at ~/.m2/repository.</echo> + <echo> </echo> + <echo>AGAIN REMEMBER THAT YOU MAY NOT REDISTRIBUTE THESE JARS - Thanks.</echo> + </target> + + <macrodef name="install-executable-file-tar"> + <attribute name="url"/> + <attribute name="platform"/> + <attribute name="file"/> + <attribute name="md5"/> + <attribute name="prop.verify.md5" default="verify.md5.@{platform}.@{file}"/> + <attribute name="prop.verify.md5.actual" default="verify.md5.actual.@{platform}.@{file}"/> + <sequential> + <fail unless="outputDir">No output directory set.</fail> + <mkdir dir="${outputDir}/bin/@{platform}"/> + <copy todir="${outputDir}/bin/@{platform}"> + <gzipresource> + <url url="@{url}"/> + </gzipresource> + <chainedmapper> + <mapper type="flatten"/> + <firstmatchmapper> + <globmapper from="*" to="tree-tagger-@{platform}.tar"/> + </firstmatchmapper> + </chainedmapper> + </copy> + <untar src="${outputDir}/bin/@{platform}/tree-tagger-@{platform}.tar" + dest="${outputDir}/bin/@{platform}"> + <patternset> + <include name="**/@{file}"/> + </patternset> + <chainedmapper> + <mapper type="flatten"/> + </chainedmapper> + </untar> + <delete file="${outputDir}/bin/@{platform}/tree-tagger-@{platform}.tar"/> + + <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{md5}" verifyproperty="@{prop.verify.md5}"/> + <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{prop.verify.md5.actual}"/> + <condition property="checksum.mismatch"> + <equals arg1="false" arg2="${@{prop.verify.md5}}"/> + </condition> + <fail if="checksum.mismatch"> MD5 checksum mismatch for [@{platform}/@{file}]. Please verify the checksum and if necessary update this script. Expected: @{md5} Actual : ${@{prop.verify.md5.actual}} - </fail> - - <generate-executable-properties url="@{url}" platform="@{platform}" file="@{file}"/> - <echo>Installed executable for @{platform} systems</echo> - </sequential> - </macrodef> + </fail> + + <generate-executable-properties url="@{url}" platform="@{platform}" file="@{file}"/> + <echo>Installed executable for @{platform} systems</echo> + </sequential> + </macrodef> - <macrodef name="install-executable-file-zip"> - <attribute name="url"/> - <attribute name="platform"/> - <attribute name="file"/> - <attribute name="md5"/> - <attribute name="prop.verify.md5" default="verify.md5.@{platform}.@{file}"/> - <attribute name="prop.verify.md5.actual" default="verify.md5.actual.@{platform}.@{file}"/> - <sequential> - <fail unless="outputDir">No output directory set.</fail> - <mkdir dir="${outputDir}/bin/@{platform}"/> - <copy todir="${outputDir}/bin/@{platform}"> - <url url="@{url}"/> - <chainedmapper> - <mapper type="flatten"/> - <firstmatchmapper> - <globmapper from="*" to="tree-tagger-@{platform}.zip"/> - </firstmatchmapper> - </chainedmapper> - </copy> - <unzip src="${outputDir}/bin/@{platform}/tree-tagger-@{platform}.zip" - dest="${outputDir}/bin/@{platform}"> - <patternset> - <include name="**/@{file}"/> - </patternset> - <chainedmapper> - <mapper type="flatten"/> - </chainedmapper> - </unzip> - <delete file="${outputDir}/bin/@{platform}/tree-tagger-@{platform}.zip"/> - - <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{md5}" verifyproperty="@{prop.verify.md5}"/> - <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{prop.verify.md5.actual}"/> - <condition property="checksum.mismatch"> - <equals arg1="false" arg2="${@{prop.verify.md5}}"/> - </condition> - <fail if="checksum.mismatch"> + <macrodef name="install-executable-file-zip"> + <attribute name="url"/> + <attribute name="platform"/> + <attribute name="file"/> + <attribute name="md5"/> + <attribute name="prop.verify.md5" default="verify.md5.@{platform}.@{file}"/> + <attribute name="prop.verify.md5.actual" default="verify.md5.actual.@{platform}.@{file}"/> + <sequential> + <fail unless="outputDir">No output directory set.</fail> + <mkdir dir="${outputDir}/bin/@{platform}"/> + <copy todir="${outputDir}/bin/@{platform}"> + <url url="@{url}"/> + <chainedmapper> + <mapper type="flatten"/> + <firstmatchmapper> + <globmapper from="*" to="tree-tagger-@{platform}.zip"/> + </firstmatchmapper> + </chainedmapper> + </copy> + <unzip src="${outputDir}/bin/@{platform}/tree-tagger-@{platform}.zip" + dest="${outputDir}/bin/@{platform}"> + <patternset> + <include name="**/@{file}"/> + </patternset> + <chainedmapper> + <mapper type="flatten"/> + </chainedmapper> + </unzip> + <delete file="${outputDir}/bin/@{platform}/tree-tagger-@{platform}.zip"/> + + <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{md5}" verifyproperty="@{prop.verify.md5}"/> + <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{prop.verify.md5.actual}"/> + <condition property="checksum.mismatch"> + <equals arg1="false" arg2="${@{prop.verify.md5}}"/> + </condition> + <fail if="checksum.mismatch"> MD5 checksum mismatch for [@{platform}/@{file}]. Please verify the checksum and if necessary update this script. Expected: @{md5} Actual : ${@{prop.verify.md5.actual}} - </fail> + </fail> - <generate-executable-properties url="@{url}" platform="@{platform}" file="@{file}"/> - <echo>Installed executable for @{platform} systems</echo> - </sequential> - </macrodef> - - <macrodef name="generate-executable-properties"> - <attribute name="url"/> - <attribute name="platform"/> - <attribute name="file"/> - <attribute name="prop.checksum.md5" default="checksum.md5.@{platform}/@{file}"/> - <attribute name="prop.checksum.sha1" default="checksum.sha1.@{platform}/@{file}"/> - <sequential> - <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{prop.checksum.md5}" - algorithm="MD5"/> - <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{prop.checksum.sha1}" - algorithm="SHA"/> - <propertyfile file="${outputDir}/bin/@{platform}/tree-tagger.properties" - comment="TreeTagger executable description"> - <entry key="url" value="@{url}"/> - <entry key="platform" value="@{platform}"/> - <entry key="downloaded" type="date" value="now"/> - <entry key="md5" value="${@{prop.checksum.md5}}"/> - <entry key="sha1" value="${@{prop.checksum.sha1}}"/> - </propertyfile> - </sequential> - </macrodef> + <generate-executable-properties url="@{url}" platform="@{platform}" file="@{file}"/> + <echo>Installed executable for @{platform} systems</echo> + </sequential> + </macrodef> + + <macrodef name="generate-executable-properties"> + <attribute name="url"/> + <attribute name="platform"/> + <attribute name="file"/> + <attribute name="prop.checksum.md5" default="checksum.md5.@{platform}/@{file}"/> + <attribute name="prop.checksum.sha1" default="checksum.sha1.@{platform}/@{file}"/> + <sequential> + <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{prop.checksum.md5}" + algorithm="MD5"/> + <checksum file="${outputDir}/bin/@{platform}/@{file}" property="@{prop.checksum.sha1}" + algorithm="SHA"/> + <propertyfile file="${outputDir}/bin/@{platform}/tree-tagger.properties" + comment="TreeTagger executable description"> + <entry key="url" value="@{url}"/> + <entry key="platform" value="@{platform}"/> + <entry key="downloaded" type="date" value="now"/> + <entry key="md5" value="${@{prop.checksum.md5}}"/> + <entry key="sha1" value="${@{prop.checksum.sha1}}"/> + </propertyfile> + </sequential> + </macrodef> </project> diff --git a/dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerPosTaggerTest.java b/dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerPosTaggerTest.java deleted file mode 100644 index 1171cd024e..0000000000 --- a/dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerPosTaggerTest.java +++ /dev/null @@ -1,1178 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.dkpro.core.treetagger; - -import static org.apache.commons.lang3.StringUtils.repeat; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.Assert.assertEquals; - -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; -import org.annolab.tt4j.TreeTaggerWrapper; -import org.apache.commons.compress.compressors.CompressorStreamFactory; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.fit.testing.util.HideOutput; -import org.apache.uima.jcas.JCas; -import org.junit.Assume; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; - -public -class TreeTaggerPosTaggerTest -{ - @Before - public void initTrace() - { - // TreeTaggerWrapper.TRACE = true; - } - - @Test - public void testEnglishAutoDownload() - throws Exception - { - Assume.assumeTrue(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/treetagger/bin/LICENSE.txt") != null || - System.getProperty("treetagger.home") != null); - - URL aUrl = new URL("http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/english-par-linux-3.2-utf8.bin.gz"); - File targetFile = File.createTempFile("model", ".bin"); - - try ( - InputStream input = new CompressorStreamFactory() - .createCompressorInputStream(new BufferedInputStream(aUrl.openStream())); - OutputStream target = new FileOutputStream(targetFile); - ) { - IOUtils.copy(input, target); - } - - AnalysisEngineDescription engine = createEngineDescription(TreeTaggerPosTagger.class, - TreeTaggerPosTagger.PARAM_MODEL_LOCATION, targetFile, - TreeTaggerPosTagger.PARAM_MODEL_ENCODING, "utf-8"); - - JCas jcas = TestRunner.runTest(engine, "en", "This is a test ."); - - String[] lemmas = { "this", "be", "a", "test", "." }; - String[] tags = { "DT", "VBZ", "DT", "NN", "SENT" }; - String[] tagClasses = { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }; - - AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); - AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); - } - - @Test - public void testEnglish() - throws Exception - { - String[] tagset = { "#", "$", "''", "(", ")", ",", ":", "CC", "CD", "DT", "EX", "FW", "IN", - "IN/that", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NP", "NPS", "PDT", "POS", - "PP", "PP$", "RB", "RBR", "RBS", "RP", "SENT", "SYM", "TO", "UH", "VB", "VBD", - "VBG", "VBN", "VBP", "VBZ", "VH", "VHD", "VHG", "VHN", "VHP", "VHZ", "VV", "VVD", - "VVG", "VVN", "VVP", "VVZ", "WDT", "WP", "WP$", "WRB", "``" }; - - runTest("en", "ptb-tt", tagset, "This is a test .", - new String[] { "this", "be", "a", "test", "." }, - new String[] { "DT", "VBZ", "DT", "NN", "SENT" }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", "ptb-tt", tagset, "A neural net .", - new String[] { "a", "neural", "net", "." }, - new String[] { "DT", "JJ", "NN", "SENT" }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", "ptb-tt", tagset, "John is purchasing oranges .", - new String[] { "John", "be", "purchase", "orange", "." }, - new String[] { "NP", "VBZ", "VVG", "NNS", "SENT" }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - - // TT4J per default runs TreeTagger with the -sgml option, so XML tags are not tagged - runTest("en", "ptb-tt", tagset, "My homepage is <url> http://null.dummy </url> .", - new String[] { "my", "homepage", "be", "http://null.dummy", "." }, - new String[] { "PP$", "NN", "VBZ", "JJ", "SENT" }, - new String[] { "POS_PRON", "POS_NOUN", "POS_VERB", "POS_ADJ", "POS_PUNCT" }); - } - - @Test - public void testFrench() - throws Exception - { - String[] tagset = { "ABR", "ADJ", "ADV", "DET:ART", "DET:POS", "INT", "KON", "NAM", "NOM", - "NUM", "PRO", "PRO:DEM", "PRO:IND", "PRO:PER", "PRO:POS", "PRO:REL", "PRP", - "PRP:det", "PUN", "PUN:cit", "SENT", "SYM", "VER:cond", "VER:futu", "VER:impe", - "VER:impf", "VER:infi", "VER:pper", "VER:ppre", "VER:pres", "VER:simp", "VER:subi", - "VER:subp" }; - - runTest("fr", "stein", tagset, "Ceci est un test .", - new String[] { "ceci", "être", "un", "test", "." }, - new String[] { "PRO:DEM", "VER:pres", "DET:ART", "NOM", "SENT" }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testGerman() - throws Exception - { - String[] tagset = { "$(", "$,", "$.", "ADJ", "ADJA", "ADJD", "ADV", "APPO", "APPR", - "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS", - "NE", "NN", "PAV", "PDAT", "PDS", "PIAT", "PIS", "PPER", "PPOSAT", "PPOSS", - "PRELAT", "PRELS", "PRF", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT", - "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", - "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; - - runTest("de", "stts", tagset, "10 Minuten sind das Mikro an und die Bühne frei .", - new String[] { "10", "Minute", "sein", "die", "Mikro", "an", "und", "die", "Bühne", "frei", "." }, - new String[] { "CARD", "NN", "VAFIN", "ART", "NN", "PTKVZ", "KON", "ART", "NN", "PTKVZ", "$." }, - new String[] { "POS_NUM", "POS_NOUN", "POS_VERB", "POS_DET", "POS_NOUN", "POS_VERB", "POS_CONJ", "POS_DET", "POS_NOUN", "POS_VERB", "POS_PUNCT" }); - - runTest("de", "stts", tagset, "Das ist ein Test .", - new String[] { "die", "sein", "eine", "Test", "." }, - new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testDutch() - throws Exception - { - String[] tagset = { "$.", "adj", "adj*kop", "adjabbr", "adv", "advabbr", "conjcoord", - "conjsubo", "det__art", "det__demo", "det__excl", "det__indef", "det__poss", - "det__quest", "det__rel", "int", "noun*kop", "nounabbr", "nounpl", "nounprop", - "nounsg", "num__card", "num__ord", "partte", "prep", "prepabbr", "pronadv", - "prondemo", "pronindef", "pronpers", "pronposs", "pronquest", "pronrefl", - "pronrel", "punc", "verbinf", "verbpapa", "verbpastpl", "verbpastsg", "verbpresp", - "verbprespl", "verbpressg" }; - - runTest("nl", "tt", tagset, "Dit is een test .", - new String[] { "dit", "zijn", "een", "test", "." }, - new String[] { "prondemo", "verbpressg", "det__art", "nounsg", "$." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("nl", "tt", tagset, "10 minuten op de microfoon en vrij podium .", - new String[] { "@card@", "minuut", "op", "de", "microfoon", "en", "vrij", "podium", "." }, - new String[] { "num__ord", "nounpl", "prep", "det__art", "nounsg", "conjcoord", "adj", "nounsg", "$." }, - new String[] { "POS_NUM", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_CONJ", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testMongolian() - throws Exception - { - String[] tagset = { "\"", "(", ")", ",", "-", ".", ":", "?", "@", "CC", "CD", "DC", "FR", - "IN", "JJ", "NN", "NNP", "PR", "RB", "SX", "VB", "|" }; - - runTest("mn", "tt", tagset, "Энэ нь тест юм .", - new String[] { "-", "-", "тест", "-", "-" }, - new String[] { "PR", "SX", "NN", "DC", "." }, - new String[] { "POS", "POS", "POS", "POS", "POS" }); - } - - @Test - public void testGalician() - throws Exception - { - String[] tagset = { "A0aa", "A0ap", "A0as", "A0fa", "A0fp", "A0fs", "A0ma", "A0mp", "A0ms", - "Acap", "Acas", "Acfp", "Acfs", "Acmp", "Acms", "Asap", "Asas", "Asfp", "Asfs", - "Asmp", "Asms", "Cc", "Cs", "Cs+Ddfp", "Cs+Ddfs", "Cs+Ddmp", "Cs+Ddms", "Ddfp", - "Ddfp+Spfp", "Ddfs", "Ddfs+Spfs", "Ddmp", "Ddmp+Spmp", "Ddms", "Ddms+Spms", "Difp", - "Difs", "Dimp", "Dims", "Dims+Spms", "Edfp", "Edfs", "Edmp", "Edmp+Inmp", "Edms", - "Enfp", "Enfs", "Enmp", "Enms", "Enns", "Gdaa", "Gdap", "Gdas", "Gdfp", "Gdfs", - "Gdmp", "Gdms", "Gnaa", "Gnap", "Gnas", "Gnfp", "Gnfs", "Gnmp", "Gnms", "Iafp", - "Iafs", "Iamp", "Iams", "Idap", "Idas", "Idfp", "Idfp+Ddfp", "Idfs", "Idmp", - "Idmp+Ddmp", "Idms", "In00", "Inaa", "Inap", "Inas", "Infp", "Infs", "Inmp", - "Inmp+Ddmp", "Inms", "La0", "Lcc", "Lcc+Ddfp", "Lcc+Ddfs", "Lcc+Ddmp", "Lcc+Ddms", - "Lcs", "Lp0", "Lp0+Ddfp", "Lp0+Ddfs", "Lp0+Ddmp", "Lp0+Ddms", "Lp0+Difp", - "Lp0+Difs", "Lp0+Dimp", "Lp0+Dims", "Lp0+Edfp", "Lp0+Edfs", "Lp0+Edmp", "Lp0+Enfs", - "Lp0+Enmp", "Lp0+Enns", "Lp0+Idfp", "Lp0+Idmp", "Lp0+Ncdms", "Lp0+Sp00", "Md1pfp", - "Md1pfs", "Md1pmp", "Md1pms", "Md1sfp", "Md1sfs", "Md1smp", "Md1sms", "Md2pfp", - "Md2pfs", "Md2pmp", "Md2pms", "Md2sfp", "Md2sfs", "Md2smp", "Md2sms", "Md3afp", - "Md3afs", "Md3amp", "Md3ams", "Md3pfp", "Md3pfs", "Md3pmp", "Md3pms", "Md3sfp", - "Md3sfs", "Md3smp", "Md3sms", "Mn1pfp", "Mn1pfs", "Mn1pmp", "Mn1pms", "Mn1sfp", - "Mn1sfs", "Mn1smp", "Mn1sms", "Mn2pfp", "Mn2pfs", "Mn2pmp", "Mn2pms", "Mn2sfp", - "Mn2sfs", "Mn2smp", "Mn2sms", "Mn3afp", "Mn3afs", "Mn3amp", "Mn3ams", "Mn3pfp", - "Mn3pfs", "Mn3pmp", "Mn3pms", "Mn3sfp", "Mn3sfs", "Mn3smp", "Mn3sms", "Ncdap", - "Ncdfp", "Ncdfs", "Ncdmp", "Ncdms", "Ncnap", "Ncnfp", "Ncnfs", "Ncnmp", "Ncnms", - "Nodfp", "Nodfs", "Nodmp", "Nodms", "Nonfp", "Nonfs", "Nonmp", "Nonms", "P", - "P+Ddfp", "P+Ddfs", "P+Ddmp", "P+Ddms", "P+Difp", "P+Difs", "P+Dimp", "P+Dims", - "P+Edfp", "P+Edfs", "P+Edmp", "P+Edmp+Inmp", "P+Edms", "P+Enfp", "P+Enfs", - "P+Enmp", "P+Enms", "P+Enns", "P+Iafp", "P+Iamp", "P+Idfp", "P+Idfp+Ddfp", - "P+Idfs", "P+Idmp", "P+Idms", "P+Infp", "P+Infs", "P+Inmp", "P+Inms", "P+Ncdfs", - "P+Ncdms", "P+Ncnfs", "P+Rtp3fp", "P+Rtp3fs", "P+Rtp3mp", "P+Rtp3ms", "P+Sp00", - "P+Spfp", "P+Spfs", "P+Spmp", "P+Spms", "P+Wn", "P-Rtp3mp", "Q", "Q!", "Q\"", "Q'", - "Q(", "Q)", "Q,", "Q-", "Q.", "Q...", "Q/", "Q:", "Q;", "Q?", "Q[", "Q]", "Q_", - "Q{", "Q}", "Q¡", "Q¿", "Raa1ap", "Raa1as", "Raa1fp", "Raa1fs", "Raa1mp", "Raa1ms", - "Raa2ap", "Raa2as", "Raa2fp", "Raa2fs", "Raa2mp", "Raa2ms", "Raa3fp", "Raa3fs", - "Raa3mp", "Raa3ms", "Rad1ap", "Rad1ap+Raa3ms", "Rad1as", "Rad1as+Raa3fs", - "Rad1as+Raa3ms", "Rad1fp", "Rad1fs", "Rad1mp", "Rad1mp+Raa3fs", "Rad1ms", "Rad2ap", - "Rad2ap+Raa3ms", "Rad2as", "Rad2fp", "Rad2fs", "Rad2mp", "Rad2mp+Raa3ms", "Rad2ms", - "Rad3ap", "Rad3as", "Rad3as+Raa3ms", "Rad3fp", "Rad3fs", "Rad3fs+Raa3ms", "Rad3mp", - "Rad3ms", "Rad3ms+Raa3fp", "Rad3ms+Raa3ms", "Raf1ap", "Raf1as", "Raf1fp", "Raf1fs", - "Raf1mp", "Raf1ms", "Raf2ap", "Raf2as", "Raf2fp", "Raf2fs", "Raf2mp", "Raf2ms", - "Rao3aa", "Rtn1ap", "Rtn1as", "Rtn1fp", "Rtn1fs", "Rtn1mp", "Rtn1ms", "Rtn2ap", - "Rtn2as", "Rtn2fp", "Rtn2fs", "Rtn2mp", "Rtn2ms", "Rtn3ap", "Rtn3as", "Rtn3fp", - "Rtn3fs", "Rtn3mp", "Rtn3ms", "Rtn3ns", "Rtp1ap", "Rtp1as", "Rtp1fp", "Rtp1fs", - "Rtp1mp", "Rtp1ms", "Rtp2ap", "Rtp2as", "Rtp2fp", "Rtp2fs", "Rtp2mp", "Rtp2ms", - "Rtp3aa", "Rtp3ap", "Rtp3as", "Rtp3fp", "Rtp3fs", "Rtp3mp", "Rtp3ms", "Rtp3ns", - "SA0fs", "Scaa", "Scap", "Scas", "Scfa", "Scfp", "Scfs", "Scma", "Scmp", "Scms", - "Sp00", "Spf0", "Spfp", "Spfs", "Spm0", "Spmp", "Spms", "Tdfp", "Tdfs", "Tdmp", - "Tdms", "Tnaa", "Tnap", "Tnas", "Tnfp", "Tnfs", "Tnmp", "Tnms", "V0f000", - "V0f000+Raa1ap", "V0f000+Raa1as", "V0f000+Raa1fp", "V0f000+Raa1mp", - "V0f000+Raa1ms", "V0f000+Raa2ap", "V0f000+Raa2ms", "V0f000+Raa3fp", - "V0f000+Raa3fs", "V0f000+Raa3mp", "V0f000+Raa3ms", "V0f000+Rad1ap", - "V0f000+Rad1fs", "V0f000+Rad1mp", "V0f000+Rad1ms", "V0f000+Rad2ap", - "V0f000+Rad3ap", "V0f000+Rad3as", "V0f000+Rad3as+Raa3fs", "V0f000+Rad3as+Raa3ms", - "V0f000+Rad3fp", "V0f000+Rad3fs", "V0f000+Rad3fs+Raa3ms", "V0f000+Rad3mp", - "V0f000+Rad3mp+Raa3mp", "V0f000+Rad3ms", "V0f000+Rad3ms+Raa3fp", "V0f000+Raf1ap", - "V0f000+Raf1as", "V0f000+Raf2as", "V0f000+Raf2fp", "V0f000+Rao3aa", - "V0f000+Rao3aa+Rad1ap", "V0f10p", "V0f10p+Raa1ap", "V0f10p+Raa3ms", "V0f20p", - "V0f20s", "V0f30p", "V0f30p+Rad3fs", "V0f30p+Rao3aa", "V0m10p", "V0m20p", - "V0m20p+Raa3ms", "V0m20p+Raf2ap", "V0m20s", "V0m20s+Raa2as", "V0m20s+Rad3ap", - "V0m20s+Rad3mp", "V0m20s+Raf2as", "V0m20s+Raf2ms", "V0p0fp", "V0p0fs", "V0p0mp", - "V0p0ms", "V0x000", "V0x000+Raa1ap", "V0x000+Raa3fp", "V0x000+Raa3fs", - "V0x000+Raa3mp", "V0x000+Raa3ms", "V0x000+Rad1ap", "V0x000+Rad1as+Raa3mp", - "V0x000+Rad3ap", "V0x000+Rad3as", "V0x000+Rad3fp", "V0x000+Rad3fs", - "V0x000+Rad3mp", "V0x000+Rad3ms", "V0x000+Rad3ms+Raa3ms", "V0x000+Rao3aa", - "V0x10p", "V0x20p", "Vci10p", "Vci10s", "Vci10s+Raa3mp", "Vci10s+Raa3ms", "Vci20p", - "Vci20s", "Vci20s+Raa2as", "Vci30p", "Vci30p+Rad1ap", "Vci30p+Rao3aa", "Vci30s", - "Vci30s+Raa3ms", "Vci30s+Rad1ap", "Vci30s+Rad1as", "Vci30s+Rad1ms", - "Vci30s+Rad3ap", "Vci30s+Rad3as", "Vci30s+Rad3fs", "Vci30s+Rad3ms", - "Vci30s+Rao3aa", "Vcia0s", "Vei10p", "Vei10p+Raa3ms", "Vei10p+Rad3ms", - "Vei10p+Raf1ap", "Vei10s", "Vei10s+Raa1as", "Vei10s+Raa1ms", "Vei10s+Raa3fp", - "Vei10s+Raa3fs", "Vei10s+Raa3mp", "Vei10s+Raa3ms", "Vei10s+Rad3as", - "Vei10s+Rad3as+Raa3ms", "Vei10s+Rad3mp", "Vei10s+Rad3ms", "Vei10s+Raf1as", - "Vei10s+Raf1ms", "Vei20p", "Vei20s", "Vei20s+Raa3ms", "Vei20s+Rad1as", - "Vei20s+Raf2as", "Vei30p", "Vei30p+Raa1ap", "Vei30p+Raa1as", "Vei30p+Raa3fs", - "Vei30p+Raa3ms", "Vei30p+Rad1as", "Vei30p+Rad3as", "Vei30p+Rad3fp", - "Vei30p+Rad3fs", "Vei30p+Rad3mp", "Vei30p+Rad3ms", "Vei30p+Rao3aa", - "Vei30p+Rao3aa+Rad3fp", "Vei30p+Rao3aa+Rad3fs", "Vei30s", "Vei30s+Raa1ap", - "Vei30s+Raa1as", "Vei30s+Raa3as", "Vei30s+Raa3fp", "Vei30s+Raa3fs", - "Vei30s+Raa3mp", "Vei30s+Raa3ms", "Vei30s+Rad1ap", "Vei30s+Rad1as", - "Vei30s+Rad1fs", "Vei30s+Rad1ms", "Vei30s+Rad1ms+Raa3fp", "Vei30s+Rad3ap", - "Vei30s+Rad3as", "Vei30s+Rad3fp", "Vei30s+Rad3fs", "Vei30s+Rad3mp", - "Vei30s+Rad3ms", "Vei30s+Rao3aa", "Vei30s+Rao3aa+Rad3as", "Vei30s+Rao3aa+Rad3ms", - "Ves10p", "Ves10s", "Ves20p", "Ves20s", "Ves30p", "Ves30s", "Vesa0s", "Vfi10p", - "Vfi10p+Raa1ap", "Vfi10p+Raa3ms", "Vfi10p+Rad3fp", "Vfi10p+Raf1ap", "Vfi10s", - "Vfi10s+Rad3mp", "Vfi20p", "Vfi20s", "Vfi30p", "Vfi30p+Rad3fs", "Vfi30p+Rad3mp", - "Vfi30p+Rad3ms", "Vfi30p+Rad3ms+Raa3ms", "Vfi30p+Rao3aa", "Vfi30p+Rao3aa+Rad3as", - "Vfi30s", "Vfi30s+Raa3fp", "Vfi30s+Raa3fs", "Vfi30s+Raa3mp", "Vfi30s+Raa3ms", - "Vfi30s+Rad3as", "Vfi30s+Rad3fp", "Vfi30s+Rad3fs", "Vfi30s+Rad3mp", - "Vfi30s+Rad3ms", "Vfi30s+Rao3aa", "Vfi30s+Rao3aa+Rad3as", "Vfi30s+Rao3aa+Rad3fs", - "Vfs10p", "Vfs10s", "Vfs20p", "Vfs20s", "Vfs30p", "Vfs30s", "Vfsa0s", "Vii10p", - "Vii10p+Raa3fs", "Vii10s", "Vii10s+Rad3ap", "Vii20p", "Vii20s", "Vii30p", - "Vii30p+Raa3fp", "Vii30p+Raa3fs", "Vii30p+Rad1ms", "Vii30p+Rad3fp", - "Vii30p+Rad3mp", "Vii30p+Rao3aa", "Vii30s", "Vii30s+Raa3fp", "Vii30s+Rad1ap", - "Vii30s+Rad1as", "Vii30s+Rad3as", "Vii30s+Rad3fs", "Vii30s+Rad3mp", - "Vii30s+Rad3ms", "Vii30s+Rao3aa", "Vii30s+Rao3aa+Rad3as", "Viia0s", "Vli10p", - "Vli10s", "Vli20p", "Vli20s", "Vli30p", "Vli30p+Rad3as", "Vli30p+Rao3aa", "Vli30s", - "Vli30s+Raa3ms", "Vli30s+Rad3as", "Vli30s+Rao3aa", "Vlia0s", "Vpi10p", - "Vpi10p+Raa1ap", "Vpi10p+Raa2ap", "Vpi10p+Raa3fp", "Vpi10p+Raa3fs", - "Vpi10p+Raa3mp", "Vpi10p+Raa3ms", "Vpi10p+Rad1ap", "Vpi10p+Rad2fs", - "Vpi10p+Rad3mp", "Vpi10p+Raf1ap", "Vpi10p+Raf1fp", "Vpi10p+Raf1mp", "Vpi10s", - "Vpi10s+Raa1as", "Vpi10s+Raa1ms", "Vpi10s+Raa3fp", "Vpi10s+Raa3fs", - "Vpi10s+Raa3mp", "Vpi10s+Raa3ms", "Vpi10s+Rad1as", "Vpi10s+Rad3as", - "Vpi10s+Rad3mp", "Vpi10s+Rad3ms", "Vpi10s+Raf1as", "Vpi10s+Raf1fs", - "Vpi10s+Raf1ms", "Vpi20p", "Vpi20s", "Vpi20s+Raa2as", "Vpi20s+Raa3fs", "Vpi30p", - "Vpi30p+Raa1ap", "Vpi30p+Raa3fp", "Vpi30p+Raa3fs", "Vpi30p+Raa3mp", - "Vpi30p+Raa3ms", "Vpi30p+Rad1ap", "Vpi30p+Rad1as", "Vpi30p+Rad1mp", - "Vpi30p+Rad3ap", "Vpi30p+Rad3as", "Vpi30p+Rad3fp", "Vpi30p+Rad3fs", - "Vpi30p+Rad3mp", "Vpi30p+Rad3ms", "Vpi30p+Rao3aa", "Vpi30p+Rao3aa+Rad1ap", - "Vpi30s", "Vpi30s+Raa1ap", "Vpi30s+Raa1as", "Vpi30s+Raa1mp", "Vpi30s+Raa2as", - "Vpi30s+Raa3fp", "Vpi30s+Raa3fs", "Vpi30s+Raa3mp", "Vpi30s+Raa3ms", - "Vpi30s+Rad1ap", "Vpi30s+Rad1ap+Raa3fs", "Vpi30s+Rad1as", "Vpi30s+Rad1fs", - "Vpi30s+Rad1ms", "Vpi30s+Rad2as", "Vpi30s+Rad2fp", "Vpi30s+Rad3ap", - "Vpi30s+Rad3as", "Vpi30s+Rad3fp", "Vpi30s+Rad3fs", "Vpi30s+Rad3mp", - "Vpi30s+Rad3ms", "Vpi30s+Rao3aa", "Vpi30s+Rao3aa+Rad1ap", "Vpi30s+Rao3aa+Rad3ap", - "Vpi30s+Rao3aa+Rad3fs", "Vpi30s+Rao3aa+Rad3mp", "Vpi30s+Rao3aa+Rad3ms", "Vps10p", - "Vps10p+Raa1ap", "Vps10p+Raa3ms", "Vps10p+Raf1ap", "Vps10s", "Vps20p", "Vps20s", - "Vps30p", "Vps30p+Rad3fs", "Vps30p+Rao3aa", "Vps30s", "Vps30s+Raa1ap", - "Vps30s+Rad1as", "Vps30s+Rao3aa", "Vpsa0s", "Wg", "Wm", "Wn", "Wr", "Y", "Za00", - "Zaas", "Zafp", "Zafs", "Zamp", "Zams", "Zams+Ncnms", "Zf00", "Zg00", "Zgaa", - "Zgfa", "Zgfp", "Zgfs", "Zgma", "Zgmp", "Zgms", "Zo00", "Zs00", "Zs00+Ncdmp", - "Zs00+Ncnmp", "Zs00+Ncnms" }; - - runTest("gl", "xiada", tagset, "Este é un exame .", - new String[] { "este", "ser", "un", "exame", "." }, - new String[] { "Enms", "Vpi30s", "Dims", "Scms", "Q." }, - new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testPolish() - throws Exception - { - String[] tagset = { "SENT", "adj:pl:acc:f:com", "adj:pl:acc:f:pos", "adj:pl:acc:f:sup", - "adj:pl:acc:m1:com", "adj:pl:acc:m1:pos", "adj:pl:acc:m1:sup", "adj:pl:acc:m2:com", - "adj:pl:acc:m2:pos", "adj:pl:acc:m2:sup", "adj:pl:acc:m3:com", "adj:pl:acc:m3:pos", - "adj:pl:acc:m3:sup", "adj:pl:acc:n:com", "adj:pl:acc:n:pos", "adj:pl:acc:n:sup", - "adj:pl:dat:f:com", "adj:pl:dat:f:pos", "adj:pl:dat:f:sup", "adj:pl:dat:m1:com", - "adj:pl:dat:m1:pos", "adj:pl:dat:m1:sup", "adj:pl:dat:m2:pos", "adj:pl:dat:m3:com", - "adj:pl:dat:m3:pos", "adj:pl:dat:n:pos", "adj:pl:dat:n:sup", "adj:pl:gen:f:com", - "adj:pl:gen:f:pos", "adj:pl:gen:f:sup", "adj:pl:gen:m1:com", "adj:pl:gen:m1:pos", - "adj:pl:gen:m1:sup", "adj:pl:gen:m2:com", "adj:pl:gen:m2:pos", "adj:pl:gen:m2:sup", - "adj:pl:gen:m3:com", "adj:pl:gen:m3:pos", "adj:pl:gen:m3:sup", "adj:pl:gen:n:com", - "adj:pl:gen:n:pos", "adj:pl:gen:n:sup", "adj:pl:inst:f:com", "adj:pl:inst:f:pos", - "adj:pl:inst:f:sup", "adj:pl:inst:m1:com", "adj:pl:inst:m1:pos", - "adj:pl:inst:m1:sup", "adj:pl:inst:m2:pos", "adj:pl:inst:m3:com", - "adj:pl:inst:m3:pos", "adj:pl:inst:m3:sup", "adj:pl:inst:n:com", - "adj:pl:inst:n:pos", "adj:pl:inst:n:sup", "adj:pl:loc:f:com", "adj:pl:loc:f:pos", - "adj:pl:loc:f:sup", "adj:pl:loc:m1:com", "adj:pl:loc:m1:pos", "adj:pl:loc:m1:sup", - "adj:pl:loc:m2:pos", "adj:pl:loc:m3:com", "adj:pl:loc:m3:pos", "adj:pl:loc:m3:sup", - "adj:pl:loc:n:com", "adj:pl:loc:n:pos", "adj:pl:loc:n:sup", "adj:pl:nom:f:com", - "adj:pl:nom:f:pos", "adj:pl:nom:f:sup", "adj:pl:nom:m1:com", "adj:pl:nom:m1:pos", - "adj:pl:nom:m1:sup", "adj:pl:nom:m2:com", "adj:pl:nom:m2:pos", "adj:pl:nom:m2:sup", - "adj:pl:nom:m3:com", "adj:pl:nom:m3:pos", "adj:pl:nom:m3:sup", "adj:pl:nom:n:com", - "adj:pl:nom:n:pos", "adj:pl:nom:n:sup", "adj:sg:acc:f:com", "adj:sg:acc:f:pos", - "adj:sg:acc:f:sup", "adj:sg:acc:m1:com", "adj:sg:acc:m1:pos", "adj:sg:acc:m1:sup", - "adj:sg:acc:m2:com", "adj:sg:acc:m2:pos", "adj:sg:acc:m2:sup", "adj:sg:acc:m3:com", - "adj:sg:acc:m3:pos", "adj:sg:acc:m3:sup", "adj:sg:acc:n:com", "adj:sg:acc:n:pos", - "adj:sg:acc:n:sup", "adj:sg:dat:f:com", "adj:sg:dat:f:pos", "adj:sg:dat:f:sup", - "adj:sg:dat:m1:com", "adj:sg:dat:m1:pos", "adj:sg:dat:m1:sup", "adj:sg:dat:m2:pos", - "adj:sg:dat:m3:com", "adj:sg:dat:m3:pos", "adj:sg:dat:m3:sup", "adj:sg:dat:n:com", - "adj:sg:dat:n:pos", "adj:sg:dat:n:sup", "adj:sg:gen:f:com", "adj:sg:gen:f:pos", - "adj:sg:gen:f:sup", "adj:sg:gen:m1:com", "adj:sg:gen:m1:pos", "adj:sg:gen:m1:sup", - "adj:sg:gen:m2:pos", "adj:sg:gen:m2:sup", "adj:sg:gen:m3:com", "adj:sg:gen:m3:pos", - "adj:sg:gen:m3:sup", "adj:sg:gen:n:com", "adj:sg:gen:n:pos", "adj:sg:gen:n:sup", - "adj:sg:inst:f:com", "adj:sg:inst:f:pos", "adj:sg:inst:f:sup", - "adj:sg:inst:m1:com", "adj:sg:inst:m1:pos", "adj:sg:inst:m1:sup", - "adj:sg:inst:m2:com", "adj:sg:inst:m2:pos", "adj:sg:inst:m2:sup", - "adj:sg:inst:m3:com", "adj:sg:inst:m3:pos", "adj:sg:inst:m3:sup", - "adj:sg:inst:n:com", "adj:sg:inst:n:pos", "adj:sg:inst:n:sup", "adj:sg:loc:f:com", - "adj:sg:loc:f:pos", "adj:sg:loc:f:sup", "adj:sg:loc:m1:com", "adj:sg:loc:m1:pos", - "adj:sg:loc:m1:sup", "adj:sg:loc:m2:com", "adj:sg:loc:m2:pos", "adj:sg:loc:m3:com", - "adj:sg:loc:m3:pos", "adj:sg:loc:m3:sup", "adj:sg:loc:n:com", "adj:sg:loc:n:pos", - "adj:sg:loc:n:sup", "adj:sg:nom:f:com", "adj:sg:nom:f:pos", "adj:sg:nom:f:sup", - "adj:sg:nom:m1:com", "adj:sg:nom:m1:pos", "adj:sg:nom:m1:sup", "adj:sg:nom:m2:com", - "adj:sg:nom:m2:pos", "adj:sg:nom:m2:sup", "adj:sg:nom:m3:com", "adj:sg:nom:m3:pos", - "adj:sg:nom:m3:sup", "adj:sg:nom:n:com", "adj:sg:nom:n:pos", "adj:sg:nom:n:sup", - "adj:sg:voc:f:pos", "adj:sg:voc:f:sup", "adj:sg:voc:m1:pos", "adj:sg:voc:m1:sup", - "adj:sg:voc:m2:pos", "adj:sg:voc:m3:pos", "adj:sg:voc:n:pos", "adja", "adjc", - "adjp", "adv", "adv:com", "adv:pos", "adv:sup", "aglt:pl:pri:imperf:nwok", - "aglt:pl:pri:imperf:wok", "aglt:pl:sec:imperf:nwok", "aglt:sg:pri:imperf:nwok", - "aglt:sg:pri:imperf:wok", "aglt:sg:sec:imperf:nwok", "aglt:sg:sec:imperf:wok", - "aglt:sg:ter:imperf:nwok", "bedzie:pl:pri:imperf", "bedzie:pl:sec:imperf", - "bedzie:pl:ter:imperf", "bedzie:sg:pri:imperf", "bedzie:sg:sec:imperf", - "bedzie:sg:ter:imperf", "brev:npun", "brev:pun", "burk", "comp", "conj", - "depr:pl:acc:m2", "depr:pl:nom:m2", "fin:pl:pri:imperf", "fin:pl:pri:perf", - "fin:pl:sec:imperf", "fin:pl:sec:perf", "fin:pl:ter:imperf", "fin:pl:ter:perf", - "fin:sg:pri:imperf", "fin:sg:pri:perf", "fin:sg:sec:imperf", "fin:sg:sec:perf", - "fin:sg:ter:imperf", "fin:sg:ter:perf", "ger:pl:dat:n:perf:aff", - "ger:pl:gen:n:imperf:aff", "ger:pl:gen:n:perf:aff", "ger:pl:inst:n:imperf:aff", - "ger:pl:inst:n:perf:aff", "ger:pl:loc:n:imperf:aff", "ger:pl:nom:n:imperf:aff", - "ger:pl:nom:n:perf:aff", "ger:sg:acc:n:imperf:aff", "ger:sg:acc:n:imperf:neg", - "ger:sg:acc:n:perf:aff", "ger:sg:acc:n:perf:neg", "ger:sg:dat:n:imperf:aff", - "ger:sg:dat:n:perf:aff", "ger:sg:gen:n:imperf:aff", "ger:sg:gen:n:imperf:neg", - "ger:sg:gen:n:perf:aff", "ger:sg:gen:n:perf:neg", "ger:sg:inst:n:imperf:aff", - "ger:sg:inst:n:imperf:neg", "ger:sg:inst:n:perf:aff", "ger:sg:inst:n:perf:neg", - "ger:sg:loc:n:imperf:aff", "ger:sg:loc:n:imperf:neg", "ger:sg:loc:n:perf:aff", - "ger:sg:loc:n:perf:neg", "ger:sg:nom:n:imperf:aff", "ger:sg:nom:n:imperf:neg", - "ger:sg:nom:n:perf:aff", "ger:sg:nom:n:perf:neg", "imps:imperf", "imps:perf", - "impt:pl:pri:imperf", "impt:pl:pri:perf", "impt:pl:sec:imperf", "impt:pl:sec:perf", - "impt:sg:sec:imperf", "impt:sg:sec:perf", "inf:imperf", "inf:perf", "interj", - "interp", "num:pl:acc:f:congr", "num:pl:acc:f:rec", "num:pl:acc:m1:congr", - "num:pl:acc:m1:rec", "num:pl:acc:m2:congr", "num:pl:acc:m2:rec", - "num:pl:acc:m3:congr", "num:pl:acc:m3:rec", "num:pl:acc:n:congr", - "num:pl:acc:n:rec", "num:pl:dat:f:congr", "num:pl:dat:m1:congr", - "num:pl:dat:m2:congr", "num:pl:dat:m3:congr", "num:pl:dat:m3:rec", - "num:pl:dat:n:congr", "num:pl:gen:f:congr", "num:pl:gen:f:rec", - "num:pl:gen:m1:congr", "num:pl:gen:m1:rec", "num:pl:gen:m2:congr", - "num:pl:gen:m2:rec", "num:pl:gen:m3:congr", "num:pl:gen:m3:rec", - "num:pl:gen:n:congr", "num:pl:gen:n:rec", "num:pl:inst:f:congr", - "num:pl:inst:m1:congr", "num:pl:inst:m2:congr", "num:pl:inst:m3:congr", - "num:pl:inst:m3:rec", "num:pl:inst:n:congr", "num:pl:loc:f:congr", - "num:pl:loc:f:rec", "num:pl:loc:m1:congr", "num:pl:loc:m2:congr", - "num:pl:loc:m2:rec", "num:pl:loc:m3:congr", "num:pl:loc:m3:rec", - "num:pl:loc:n:congr", "num:pl:nom:f:congr", "num:pl:nom:f:rec", - "num:pl:nom:m1:congr", "num:pl:nom:m1:rec", "num:pl:nom:m2:congr", - "num:pl:nom:m2:rec", "num:pl:nom:m3:congr", "num:pl:nom:m3:rec", - "num:pl:nom:n:congr", "num:pl:nom:n:rec", "num:sg:acc:m3:rec", - "num:sg:gen:m1:congr", "num:sg:gen:m3:congr", "num:sg:gen:m3:rec", - "num:sg:nom:f:rec", "num:sg:nom:m3:congr", "num:sg:nom:m3:rec", "num:sg:nom:n:rec", - "numcol:pl:acc:m1:rec", "numcol:pl:acc:n:rec", "numcol:pl:dat:m1:congr", - "numcol:pl:gen:m1:congr", "numcol:pl:gen:m1:rec", "numcol:pl:gen:n:congr", - "numcol:pl:gen:n:rec", "numcol:pl:inst:m1:rec", "numcol:pl:inst:n:rec", - "numcol:pl:nom:m1:rec", "numcol:pl:nom:n:rec", "pact:pl:acc:f:imperf:aff", - "pact:pl:acc:f:imperf:neg", "pact:pl:acc:m1:imperf:aff", - "pact:pl:acc:m2:imperf:aff", "pact:pl:acc:m3:imperf:aff", - "pact:pl:acc:m3:imperf:neg", "pact:pl:acc:n:imperf:aff", - "pact:pl:acc:n:imperf:neg", "pact:pl:dat:f:imperf:aff", - "pact:pl:dat:m1:imperf:aff", "pact:pl:dat:m2:imperf:aff", - "pact:pl:dat:m3:imperf:aff", "pact:pl:dat:n:imperf:aff", - "pact:pl:gen:f:imperf:aff", "pact:pl:gen:f:imperf:neg", - "pact:pl:gen:m1:imperf:aff", "pact:pl:gen:m1:imperf:neg", - "pact:pl:gen:m2:imperf:aff", "pact:pl:gen:m3:imperf:aff", - "pact:pl:gen:m3:imperf:neg", "pact:pl:gen:n:imperf:aff", - "pact:pl:inst:f:imperf:aff", "pact:pl:inst:m1:imperf:aff", - "pact:pl:inst:m2:imperf:aff", "pact:pl:inst:m3:imperf:aff", - "pact:pl:inst:m3:imperf:neg", "pact:pl:inst:n:imperf:aff", - "pact:pl:inst:n:imperf:neg", "pact:pl:loc:f:imperf:aff", - "pact:pl:loc:m1:imperf:aff", "pact:pl:loc:m3:imperf:aff", - "pact:pl:loc:m3:imperf:neg", "pact:pl:loc:n:imperf:aff", - "pact:pl:loc:n:imperf:neg", "pact:pl:nom:f:imperf:aff", "pact:pl:nom:f:imperf:neg", - "pact:pl:nom:m1:imperf:aff", "pact:pl:nom:m2:imperf:aff", - "pact:pl:nom:m3:imperf:aff", "pact:pl:nom:n:imperf:aff", - "pact:pl:nom:n:imperf:neg", "pact:sg:acc:f:imperf:aff", "pact:sg:acc:f:imperf:neg", - "pact:sg:acc:m1:imperf:aff", "pact:sg:acc:m2:imperf:aff", - "pact:sg:acc:m3:imperf:aff", "pact:sg:acc:n:imperf:aff", - "pact:sg:acc:n:imperf:neg", "pact:sg:dat:f:imperf:aff", - "pact:sg:dat:m1:imperf:aff", "pact:sg:dat:m2:imperf:aff", - "pact:sg:dat:m3:imperf:aff", "pact:sg:dat:n:imperf:aff", - "pact:sg:gen:f:imperf:aff", "pact:sg:gen:f:imperf:neg", - "pact:sg:gen:m1:imperf:aff", "pact:sg:gen:m1:imperf:neg", - "pact:sg:gen:m2:imperf:aff", "pact:sg:gen:m3:imperf:aff", - "pact:sg:gen:m3:imperf:neg", "pact:sg:gen:n:imperf:aff", - "pact:sg:gen:n:imperf:neg", "pact:sg:inst:f:imperf:aff", - "pact:sg:inst:f:imperf:neg", "pact:sg:inst:m1:imperf:aff", - "pact:sg:inst:m1:imperf:neg", "pact:sg:inst:m2:imperf:aff", - "pact:sg:inst:m2:imperf:neg", "pact:sg:inst:m3:imperf:aff", - "pact:sg:inst:m3:imperf:neg", "pact:sg:inst:n:imperf:aff", - "pact:sg:loc:f:imperf:aff", "pact:sg:loc:f:imperf:neg", - "pact:sg:loc:m1:imperf:aff", "pact:sg:loc:m2:imperf:aff", - "pact:sg:loc:m3:imperf:aff", "pact:sg:loc:m3:imperf:neg", - "pact:sg:loc:n:imperf:aff", "pact:sg:loc:n:imperf:neg", "pact:sg:nom:f:imperf:aff", - "pact:sg:nom:f:imperf:neg", "pact:sg:nom:m1:imperf:aff", - "pact:sg:nom:m1:imperf:neg", "pact:sg:nom:m2:imperf:aff", - "pact:sg:nom:m3:imperf:aff", "pact:sg:nom:m3:imperf:neg", - "pact:sg:nom:n:imperf:aff", "pact:sg:nom:n:imperf:neg", - "pact:sg:voc:m1:imperf:aff", "pant:perf", "pcon:imperf", - "ppas:pl:acc:f:imperf:aff", "ppas:pl:acc:f:perf:aff", "ppas:pl:acc:f:perf:neg", - "ppas:pl:acc:m1:imperf:aff", "ppas:pl:acc:m1:imperf:neg", - "ppas:pl:acc:m1:perf:aff", "ppas:pl:acc:m2:imperf:aff", "ppas:pl:acc:m2:perf:aff", - "ppas:pl:acc:m3:imperf:aff", "ppas:pl:acc:m3:perf:aff", "ppas:pl:acc:m3:perf:neg", - "ppas:pl:acc:n:imperf:aff", "ppas:pl:acc:n:imperf:neg", "ppas:pl:acc:n:perf:aff", - "ppas:pl:acc:n:perf:neg", "ppas:pl:dat:f:imperf:aff", "ppas:pl:dat:f:perf:aff", - "ppas:pl:dat:f:perf:neg", "ppas:pl:dat:m1:imperf:aff", "ppas:pl:dat:m1:perf:aff", - "ppas:pl:dat:m1:perf:neg", "ppas:pl:dat:m2:imperf:aff", - "ppas:pl:dat:m3:imperf:aff", "ppas:pl:dat:m3:perf:aff", "ppas:pl:dat:n:imperf:aff", - "ppas:pl:dat:n:perf:aff", "ppas:pl:gen:f:imperf:aff", "ppas:pl:gen:f:imperf:neg", - "ppas:pl:gen:f:perf:aff", "ppas:pl:gen:f:perf:neg", "ppas:pl:gen:m1:imperf:aff", - "ppas:pl:gen:m1:imperf:neg", "ppas:pl:gen:m1:perf:aff", "ppas:pl:gen:m1:perf:neg", - "ppas:pl:gen:m2:imperf:aff", "ppas:pl:gen:m2:perf:aff", - "ppas:pl:gen:m3:imperf:aff", "ppas:pl:gen:m3:imperf:neg", - "ppas:pl:gen:m3:perf:aff", "ppas:pl:gen:m3:perf:neg", "ppas:pl:gen:n:imperf:aff", - "ppas:pl:gen:n:perf:aff", "ppas:pl:gen:n:perf:neg", "ppas:pl:inst:f:imperf:aff", - "ppas:pl:inst:f:perf:aff", "ppas:pl:inst:m1:imperf:aff", - "ppas:pl:inst:m1:perf:aff", "ppas:pl:inst:m2:perf:aff", - "ppas:pl:inst:m3:imperf:aff", "ppas:pl:inst:m3:perf:aff", - "ppas:pl:inst:n:imperf:aff", "ppas:pl:inst:n:perf:aff", "ppas:pl:loc:f:imperf:aff", - "ppas:pl:loc:f:imperf:neg", "ppas:pl:loc:f:perf:aff", "ppas:pl:loc:f:perf:neg", - "ppas:pl:loc:m1:imperf:aff", "ppas:pl:loc:m1:perf:aff", - "ppas:pl:loc:m2:imperf:aff", "ppas:pl:loc:m3:imperf:aff", - "ppas:pl:loc:m3:perf:aff", "ppas:pl:loc:m3:perf:neg", "ppas:pl:loc:n:imperf:aff", - "ppas:pl:loc:n:perf:aff", "ppas:pl:loc:n:perf:neg", "ppas:pl:nom:f:imperf:aff", - "ppas:pl:nom:f:imperf:neg", "ppas:pl:nom:f:perf:aff", "ppas:pl:nom:f:perf:neg", - "ppas:pl:nom:m1:imperf:aff", "ppas:pl:nom:m1:imperf:neg", - "ppas:pl:nom:m1:perf:aff", "ppas:pl:nom:m1:perf:neg", "ppas:pl:nom:m2:imperf:aff", - "ppas:pl:nom:m2:perf:aff", "ppas:pl:nom:m3:imperf:aff", - "ppas:pl:nom:m3:imperf:neg", "ppas:pl:nom:m3:perf:aff", "ppas:pl:nom:m3:perf:neg", - "ppas:pl:nom:n:imperf:aff", "ppas:pl:nom:n:perf:aff", "ppas:pl:nom:n:perf:neg", - "ppas:sg:acc:f:imperf:aff", "ppas:sg:acc:f:imperf:neg", "ppas:sg:acc:f:perf:aff", - "ppas:sg:acc:f:perf:neg", "ppas:sg:acc:m1:imperf:aff", "ppas:sg:acc:m1:perf:aff", - "ppas:sg:acc:m2:imperf:aff", "ppas:sg:acc:m2:perf:aff", - "ppas:sg:acc:m3:imperf:aff", "ppas:sg:acc:m3:imperf:neg", - "ppas:sg:acc:m3:perf:aff", "ppas:sg:acc:m3:perf:neg", "ppas:sg:acc:n:imperf:aff", - "ppas:sg:acc:n:perf:aff", "ppas:sg:acc:n:perf:neg", "ppas:sg:dat:f:imperf:aff", - "ppas:sg:dat:f:imperf:neg", "ppas:sg:dat:f:perf:aff", "ppas:sg:dat:f:perf:neg", - "ppas:sg:dat:m1:imperf:aff", "ppas:sg:dat:m1:perf:aff", - "ppas:sg:dat:m3:imperf:aff", "ppas:sg:dat:m3:perf:aff", "ppas:sg:dat:n:perf:aff", - "ppas:sg:gen:f:imperf:aff", "ppas:sg:gen:f:imperf:neg", "ppas:sg:gen:f:perf:aff", - "ppas:sg:gen:f:perf:neg", "ppas:sg:gen:m1:imperf:aff", "ppas:sg:gen:m1:perf:aff", - "ppas:sg:gen:m1:perf:neg", "ppas:sg:gen:m2:imperf:aff", "ppas:sg:gen:m2:perf:aff", - "ppas:sg:gen:m3:imperf:aff", "ppas:sg:gen:m3:imperf:neg", - "ppas:sg:gen:m3:perf:aff", "ppas:sg:gen:m3:perf:neg", "ppas:sg:gen:n:imperf:aff", - "ppas:sg:gen:n:imperf:neg", "ppas:sg:gen:n:perf:aff", "ppas:sg:gen:n:perf:neg", - "ppas:sg:inst:f:imperf:aff", "ppas:sg:inst:f:imperf:neg", - "ppas:sg:inst:f:perf:aff", "ppas:sg:inst:f:perf:neg", "ppas:sg:inst:m1:imperf:aff", - "ppas:sg:inst:m1:imperf:neg", "ppas:sg:inst:m1:perf:aff", - "ppas:sg:inst:m1:perf:neg", "ppas:sg:inst:m2:imperf:aff", - "ppas:sg:inst:m2:perf:aff", "ppas:sg:inst:m3:imperf:aff", - "ppas:sg:inst:m3:imperf:neg", "ppas:sg:inst:m3:perf:aff", - "ppas:sg:inst:m3:perf:neg", "ppas:sg:inst:n:imperf:aff", - "ppas:sg:inst:n:imperf:neg", "ppas:sg:inst:n:perf:aff", "ppas:sg:inst:n:perf:neg", - "ppas:sg:loc:f:imperf:aff", "ppas:sg:loc:f:perf:aff", "ppas:sg:loc:f:perf:neg", - "ppas:sg:loc:m1:imperf:aff", "ppas:sg:loc:m1:perf:aff", - "ppas:sg:loc:m2:imperf:aff", "ppas:sg:loc:m3:imperf:aff", - "ppas:sg:loc:m3:imperf:neg", "ppas:sg:loc:m3:perf:aff", "ppas:sg:loc:m3:perf:neg", - "ppas:sg:loc:n:imperf:aff", "ppas:sg:loc:n:perf:aff", "ppas:sg:loc:n:perf:neg", - "ppas:sg:nom:f:imperf:aff", "ppas:sg:nom:f:imperf:neg", "ppas:sg:nom:f:perf:aff", - "ppas:sg:nom:f:perf:neg", "ppas:sg:nom:m1:imperf:aff", "ppas:sg:nom:m1:imperf:neg", - "ppas:sg:nom:m1:perf:aff", "ppas:sg:nom:m1:perf:neg", "ppas:sg:nom:m2:imperf:aff", - "ppas:sg:nom:m2:perf:aff", "ppas:sg:nom:m3:imperf:aff", - "ppas:sg:nom:m3:imperf:neg", "ppas:sg:nom:m3:perf:aff", "ppas:sg:nom:m3:perf:neg", - "ppas:sg:nom:n:imperf:aff", "ppas:sg:nom:n:imperf:neg", "ppas:sg:nom:n:perf:aff", - "ppas:sg:nom:n:perf:neg", "ppas:sg:voc:m2:imperf:aff", "ppron12:pl:acc:f:pri", - "ppron12:pl:acc:f:sec", "ppron12:pl:acc:m1:pri", "ppron12:pl:acc:m1:sec", - "ppron12:pl:acc:m2:sec", "ppron12:pl:acc:n:sec", "ppron12:pl:dat:f:pri", - "ppron12:pl:dat:f:sec", "ppron12:pl:dat:m1:pri", "ppron12:pl:dat:m1:sec", - "ppron12:pl:dat:m3:sec", "ppron12:pl:gen:f:pri", "ppron12:pl:gen:f:sec", - "ppron12:pl:gen:m1:pri", "ppron12:pl:gen:m1:sec", "ppron12:pl:gen:m2:pri", - "ppron12:pl:inst:f:pri", "ppron12:pl:inst:m1:pri", "ppron12:pl:inst:m1:sec", - "ppron12:pl:inst:n:pri", "ppron12:pl:loc:f:sec", "ppron12:pl:loc:m1:pri", - "ppron12:pl:loc:m1:sec", "ppron12:pl:loc:m3:sec", "ppron12:pl:nom:f:pri", - "ppron12:pl:nom:f:sec", "ppron12:pl:nom:m1:pri", "ppron12:pl:nom:m1:pri:akc", - "ppron12:pl:nom:m1:sec", "ppron12:pl:nom:m1:sec:akc", "ppron12:pl:nom:m2:pri", - "ppron12:pl:nom:m2:sec", "ppron12:pl:nom:n:sec", "ppron12:sg:acc:f:pri:akc", - "ppron12:sg:acc:f:sec:akc", "ppron12:sg:acc:f:sec:nakc", - "ppron12:sg:acc:m1:pri:akc", "ppron12:sg:acc:m1:pri:nakc", - "ppron12:sg:acc:m1:sec:akc", "ppron12:sg:acc:m1:sec:nakc", - "ppron12:sg:acc:m2:pri:akc", "ppron12:sg:acc:m2:sec:nakc", - "ppron12:sg:acc:m3:pri:akc", "ppron12:sg:acc:m3:sec:nakc", - "ppron12:sg:acc:n:pri:akc", "ppron12:sg:acc:n:sec:nakc", - "ppron12:sg:dat:f:pri:akc", "ppron12:sg:dat:f:pri:nakc", - "ppron12:sg:dat:f:sec:akc", "ppron12:sg:dat:f:sec:nakc", - "ppron12:sg:dat:m1:pri:akc", "ppron12:sg:dat:m1:pri:nakc", - "ppron12:sg:dat:m1:sec:akc", "ppron12:sg:dat:m1:sec:nakc", - "ppron12:sg:dat:m2:pri:nakc", "ppron12:sg:dat:m2:sec:akc", - "ppron12:sg:dat:m2:sec:nakc", "ppron12:sg:gen:f:pri:akc", - "ppron12:sg:gen:f:sec:akc", "ppron12:sg:gen:f:sec:nakc", - "ppron12:sg:gen:m1:pri:akc", "ppron12:sg:gen:m1:sec:akc", - "ppron12:sg:gen:m1:sec:nakc", "ppron12:sg:gen:m2:sec:akc", - "ppron12:sg:gen:m2:sec:nakc", "ppron12:sg:gen:n:pri:akc", "ppron12:sg:inst:f:pri", - "ppron12:sg:inst:f:sec", "ppron12:sg:inst:m1:pri", "ppron12:sg:inst:m1:pri:nakc", - "ppron12:sg:inst:m1:sec", "ppron12:sg:inst:n:sec", "ppron12:sg:loc:f:pri", - "ppron12:sg:loc:f:sec", "ppron12:sg:loc:m1:pri", "ppron12:sg:loc:m1:sec", - "ppron12:sg:loc:m3:pri", "ppron12:sg:nom:f:pri", - "ppron12:sg:nom:f:sec", "ppron12:sg:nom:m1:pri", "ppron12:sg:nom:m1:pri:akc", - "ppron12:sg:nom:m1:pri:nakc", "ppron12:sg:nom:m1:sec", "ppron12:sg:nom:m1:sec:akc", - "ppron12:sg:nom:m2:pri", "ppron12:sg:nom:m2:sec", "ppron12:sg:nom:m3:pri", - "ppron12:sg:nom:m3:sec", "ppron12:sg:nom:n:sec", "ppron12:sg:voc:n:sec", - "ppron3:pl:acc:f:ter:akc:npraep", "ppron3:pl:acc:f:ter:akc:praep", - "ppron3:pl:acc:m1:ter:akc:npraep", "ppron3:pl:acc:m1:ter:akc:praep", - "ppron3:pl:acc:m2:ter:akc:npraep", "ppron3:pl:acc:m2:ter:akc:praep", - "ppron3:pl:acc:m3:ter:akc:npraep", "ppron3:pl:acc:m3:ter:akc:praep", - "ppron3:pl:acc:n:ter:akc:npraep", "ppron3:pl:acc:n:ter:akc:praep", - "ppron3:pl:dat:f:ter:akc:npraep", "ppron3:pl:dat:f:ter:akc:praep", - "ppron3:pl:dat:m1:ter:akc:npraep", "ppron3:pl:dat:m1:ter:akc:praep", - "ppron3:pl:dat:m2:ter:akc:npraep", "ppron3:pl:dat:m3:ter:akc:npraep", - "ppron3:pl:dat:m3:ter:akc:praep", "ppron3:pl:dat:n:ter:akc:npraep", - "ppron3:pl:gen:f:ter:akc:npraep", "ppron3:pl:gen:f:ter:akc:praep", - "ppron3:pl:gen:m1:ter:akc:npraep", "ppron3:pl:gen:m1:ter:akc:praep", - "ppron3:pl:gen:m2:ter:akc:npraep", "ppron3:pl:gen:m2:ter:akc:praep", - "ppron3:pl:gen:m3:ter:akc:npraep", "ppron3:pl:gen:m3:ter:akc:praep", - "ppron3:pl:gen:n:ter:akc:npraep", "ppron3:pl:gen:n:ter:akc:praep", - "ppron3:pl:inst:f:ter:akc:npraep", "ppron3:pl:inst:f:ter:akc:praep", - "ppron3:pl:inst:m1:ter:akc:npraep", "ppron3:pl:inst:m1:ter:akc:praep", - "ppron3:pl:inst:m2:ter:akc:npraep", "ppron3:pl:inst:m2:ter:akc:praep", - "ppron3:pl:inst:m3:ter:akc:npraep", "ppron3:pl:inst:m3:ter:akc:praep", - "ppron3:pl:inst:n:ter:akc:npraep", "ppron3:pl:inst:n:ter:akc:praep", - "ppron3:pl:loc:f:ter:akc:praep", "ppron3:pl:loc:m1:ter:akc:praep", - "ppron3:pl:loc:m2:ter:akc:praep", "ppron3:pl:loc:m3:ter:akc:praep", - "ppron3:pl:loc:n:ter:akc:praep", "ppron3:pl:nom:f:ter:akc:npraep", - "ppron3:pl:nom:m1:ter:akc:npraep", "ppron3:pl:nom:m2:ter:akc:npraep", - "ppron3:pl:nom:m3:ter:akc:npraep", "ppron3:pl:nom:n:ter:akc:npraep", - "ppron3:sg:acc:f:ter:akc:npraep", "ppron3:sg:acc:f:ter:akc:praep", - "ppron3:sg:acc:m1:ter:akc:npraep", "ppron3:sg:acc:m1:ter:akc:praep", - "ppron3:sg:acc:m1:ter:nakc:npraep", "ppron3:sg:acc:m1:ter:nakc:praep", - "ppron3:sg:acc:m2:ter:akc:praep", "ppron3:sg:acc:m2:ter:nakc:npraep", - "ppron3:sg:acc:m2:ter:nakc:praep", "ppron3:sg:acc:m3:ter:akc:npraep", - "ppron3:sg:acc:m3:ter:akc:praep", "ppron3:sg:acc:m3:ter:nakc:npraep", - "ppron3:sg:acc:m3:ter:nakc:praep", "ppron3:sg:acc:n:ter:akc:npraep", - "ppron3:sg:acc:n:ter:akc:praep", "ppron3:sg:dat:f:ter:akc:npraep", - "ppron3:sg:dat:f:ter:akc:praep", "ppron3:sg:dat:m1:ter:akc:npraep", - "ppron3:sg:dat:m1:ter:akc:praep", "ppron3:sg:dat:m1:ter:nakc:npraep", - "ppron3:sg:dat:m2:ter:akc:npraep", "ppron3:sg:dat:m2:ter:nakc:npraep", - "ppron3:sg:dat:m3:ter:akc:npraep", "ppron3:sg:dat:m3:ter:akc:praep", - "ppron3:sg:dat:m3:ter:nakc:npraep", "ppron3:sg:dat:n:ter:akc:npraep", - "ppron3:sg:dat:n:ter:akc:praep", "ppron3:sg:dat:n:ter:nakc:npraep", - "ppron3:sg:gen:f:ter:akc:npraep", "ppron3:sg:gen:f:ter:akc:praep", - "ppron3:sg:gen:m1:ter:akc:npraep", "ppron3:sg:gen:m1:ter:akc:praep", - "ppron3:sg:gen:m1:ter:nakc:npraep", "ppron3:sg:gen:m1:ter:nakc:praep", - "ppron3:sg:gen:m2:ter:akc:npraep", "ppron3:sg:gen:m2:ter:akc:praep", - "ppron3:sg:gen:m2:ter:nakc:npraep", "ppron3:sg:gen:m3:ter:akc:npraep", - "ppron3:sg:gen:m3:ter:akc:praep", "ppron3:sg:gen:m3:ter:nakc:npraep", - "ppron3:sg:gen:m3:ter:nakc:praep", "ppron3:sg:gen:n:ter:akc:npraep", - "ppron3:sg:gen:n:ter:akc:praep", "ppron3:sg:gen:n:ter:nakc:npraep", - "ppron3:sg:inst:f:ter:akc:praep", "ppron3:sg:inst:m1:ter:akc:npraep", - "ppron3:sg:inst:m1:ter:akc:praep", "ppron3:sg:inst:m2:ter:akc:npraep", - "ppron3:sg:inst:m2:ter:akc:praep", "ppron3:sg:inst:m3:ter:akc:npraep", - "ppron3:sg:inst:m3:ter:akc:praep", "ppron3:sg:inst:n:ter:akc:npraep", - "ppron3:sg:inst:n:ter:akc:praep", "ppron3:sg:loc:f:ter:akc:praep", - "ppron3:sg:loc:m1:ter:akc:praep", "ppron3:sg:loc:m2:ter:akc:praep", - "ppron3:sg:loc:m3:ter:akc:praep", "ppron3:sg:loc:n:ter:akc:praep", - "ppron3:sg:nom:f:ter:akc:npraep", "ppron3:sg:nom:f:ter:akc:praep", - "ppron3:sg:nom:m1:ter:akc:npraep", "ppron3:sg:nom:m2:ter:akc:npraep", - "ppron3:sg:nom:m2:ter:akc:praep", "ppron3:sg:nom:m3:ter:akc:npraep", - "ppron3:sg:nom:n:ter:akc:npraep", "praet:pl:f:imperf", "praet:pl:f:perf", - "praet:pl:m1:imperf", "praet:pl:m1:imperf:agl", "praet:pl:m1:perf", - "praet:pl:m1:perf:nagl", "praet:pl:m2:imperf", "praet:pl:m2:perf", - "praet:pl:m3:imperf", "praet:pl:m3:perf", "praet:pl:n:imperf", "praet:pl:n:perf", - "praet:sg:f:imperf", "praet:sg:f:imperf:agl", "praet:sg:f:imperf:nagl", - "praet:sg:f:perf", "praet:sg:m1:imperf", "praet:sg:m1:imperf:agl", - "praet:sg:m1:imperf:nagl", "praet:sg:m1:perf", "praet:sg:m1:perf:agl", - "praet:sg:m1:perf:nagl", "praet:sg:m2:imperf", "praet:sg:m2:imperf:nagl", - "praet:sg:m2:perf", "praet:sg:m2:perf:nagl", "praet:sg:m3:imperf", - "praet:sg:m3:imperf:nagl", "praet:sg:m3:perf", "praet:sg:m3:perf:nagl", - "praet:sg:n:imperf", "praet:sg:n:perf", "pred", "prep:acc", "prep:acc:nwok", - "prep:acc:wok", "prep:dat", "prep:gen", "prep:gen:nwok", "prep:gen:wok", - "prep:inst", "prep:inst:nwok", "prep:inst:wok", "prep:loc", "prep:loc:nwok", - "prep:loc:wok", "prep:nom", "qub", "qub:nwok", "qub:wok", "siebie:acc", - "siebie:dat", "siebie:gen", "siebie:inst", "siebie:loc", "subst:pl:acc:f", - "subst:pl:acc:m1", "subst:pl:acc:m2", "subst:pl:acc:m3", "subst:pl:acc:n", - "subst:pl:dat:f", "subst:pl:dat:m1", "subst:pl:dat:m2", "subst:pl:dat:m3", - "subst:pl:dat:n", "subst:pl:gen:f", "subst:pl:gen:m1", "subst:pl:gen:m2", - "subst:pl:gen:m3", "subst:pl:gen:n", "subst:pl:inst:f", "subst:pl:inst:m1", - "subst:pl:inst:m2", "subst:pl:inst:m3", "subst:pl:inst:n", "subst:pl:loc:f", - "subst:pl:loc:m1", "subst:pl:loc:m2", "subst:pl:loc:m3", "subst:pl:loc:n", - "subst:pl:nom:f", "subst:pl:nom:m1", "subst:pl:nom:m2", "subst:pl:nom:m3", - "subst:pl:nom:n", "subst:sg:acc:f", "subst:sg:acc:m1", "subst:sg:acc:m2", - "subst:sg:acc:m3", "subst:sg:acc:n", "subst:sg:dat:f", "subst:sg:dat:m1", - "subst:sg:dat:m2", "subst:sg:dat:m3", "subst:sg:dat:n", "subst:sg:gen:f", - "subst:sg:gen:m1", "subst:sg:gen:m2", "subst:sg:gen:m3", "subst:sg:gen:n", - "subst:sg:inst:f", "subst:sg:inst:m1", "subst:sg:inst:m2", "subst:sg:inst:m3", - "subst:sg:inst:n", "subst:sg:loc:f", "subst:sg:loc:m1", "subst:sg:loc:m2", - "subst:sg:loc:m3", "subst:sg:loc:n", "subst:sg:nom:f", "subst:sg:nom:m1", - "subst:sg:nom:m2", "subst:sg:nom:m3", "subst:sg:nom:n", "subst:sg:voc:f", - "subst:sg:voc:m1", "subst:sg:voc:m2", "subst:sg:voc:m3", "subst:sg:voc:n", - "winien:pl:f:imperf", "winien:pl:m1:imperf", "winien:pl:m2:imperf", - "winien:pl:m3:imperf", "winien:pl:n:imperf", "winien:sg:f:imperf", - "winien:sg:m1:imperf", "winien:sg:m2:imperf", "winien:sg:m3:imperf", - "winien:sg:n:imperf", "xxx" }; - - runTest("pl", "ncp", tagset, "To badanie .", - new String[] { "ten", "badanie", "." }, - new String[] { "adj:sg:acc:n:pos", "subst:sg:acc:n", "SENT" }, - new String[] { "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public void testRussian() - throws Exception - { - String[] tagset = { ",", "-", "Afcmsnf", "Afpfpgf", "Afpfsaf", "Afpfsas", "Afpfsdf", - "Afpfsgf", "Afpfsif", "Afpfslf", "Afpfsnf", "Afpfsns", "Afpmpaf", "Afpmpdf", - "Afpmpgf", "Afpmpif", "Afpmplf", "Afpmpnf", "Afpmpns", "Afpmsaf", "Afpmsdf", - "Afpmsds", "Afpmsgf", "Afpmsgs", "Afpmsif", "Afpmslf", "Afpmsnf", "Afpmsns", - "Afpnpaf", "Afpnpnf", "Afpnsaf", "Afpnsdf", "Afpnsgf", "Afpnsif", "Afpnslf", - "Afpnsnf", "Afpnsns", "C", "I", "Mc", "Mc---d", "Mc--a", "Mc--ad", "Mc--d", - "Mc--dd", "Mc--g", "Mc--gd", "Mc--i", "Mc--id", "Mc--l", "Mc--n", "Mcf-a", "Mcf-d", - "Mcf-g", "Mcf-i", "Mcf-l", "Mcf-n", "Mcm-a", "Mcm-d", "Mcm-g", "Mcm-i", "Mcm-l", - "Mcm-n", "Mcn-a", "Mcn-d", "Mcn-g", "Mcn-i", "Mcn-l", "Mcn-n", "Mo---d", "Mo--g", - "Mo--i", "Mo-pa", "Mo-pad", "Mo-pd", "Mo-pdd", "Mo-pg", "Mo-pgd", "Mo-pi", - "Mo-pid", "Mo-pl", "Mo-pld", "Mo-pn", "Mo-pnd", "Mo-sad", "Mof", "Mof-a", "Mof-d", - "Mof-g", "Mof-i", "Mof-l", "Mof-n", "Mofsa", "Mofsad", "Mofsd", "Mofsdd", "Mofsg", - "Mofsgd", "Mofsi", "Mofsid", "Mofsl", "Mofsld", "Mofsn", "Mofsnd", "Mom-a", - "Mom-d", "Mom-g", "Mom-i", "Mom-l", "Mom-n", "Momsa", "Momsad", "Momsd", "Momsg", - "Momsgd", "Momsi", "Momsid", "Momsl", "Momsld", "Momsn", "Momsnd", "Mon-a", - "Mon-d", "Mon-g", "Mon-i", "Mon-l", "Mon-n", "Monsa", "Monsad", "Monsd", "Monsg", - "Monsgd", "Monsi", "Monsid", "Monsl", "Monsn", "Monsnd", "Nccpay", "Nccpdy", - "Nccpgy", "Nccpiy", "Nccply", "Nccpny", "Nccsay", "Nccsdy", "Nccsgn", "Nccsgy", - "Nccsiy", "Nccsly", "Nccsnn", "Nccsny", "Ncfpan", "Ncfpay", "Ncfpdn", "Ncfpdy", - "Ncfpgn", "Ncfpgy", "Ncfpin", "Ncfpiy", "Ncfpln", "Ncfply", "Ncfpnn", "Ncfpny", - "Ncfsan", "Ncfsay", "Ncfsdn", "Ncfsdy", "Ncfsgn", "Ncfsgy", "Ncfsin", "Ncfsiy", - "Ncfsln", "Ncfsly", "Ncfsnn", "Ncfsnnl", "Ncfsnnp", "Ncfsny", "Ncfsvy", "Ncmpan", - "Ncmpay", "Ncmpdn", "Ncmpdy", "Ncmpgn", "Ncmpgy", "Ncmpin", "Ncmpiy", "Ncmpln", - "Ncmply", "Ncmpnn", "Ncmpnnl", "Ncmpny", "Ncmsan", "Ncmsay", "Ncmsdn", "Ncmsdy", - "Ncmsgn", "Ncmsgy", "Ncmsin", "Ncmsiy", "Ncmsln", "Ncmsly", "Ncmsnn", "Ncmsnnl", - "Ncmsnnp", "Ncmsny", "Ncmsvn", "Ncmsvy", "Ncnpan", "Ncnpay", "Ncnpdn", "Ncnpdy", - "Ncnpgn", "Ncnpgy", "Ncnpin", "Ncnpiy", "Ncnpln", "Ncnply", "Ncnpnn", "Ncnpny", - "Ncnsan", "Ncnsay", "Ncnsdn", "Ncnsdy", "Ncnsgn", "Ncnsgy", "Ncnsin", "Ncnsiy", - "Ncnsln", "Ncnsly", "Ncnsnn", "Ncnsny", "Npcpay", "Npcsay", "Npcsdy", "Npcsgy", - "Npcsiy", "Npcsly", "Npcsnn", "Npcsny", "Npcsvy", "Npfpay", "Npfpdy", "Npfpgy", - "Npfpiy", "Npfpny", "Npfsay", "Npfsdy", "Npfsgn", "Npfsgy", "Npfsiy", "Npfsly", - "Npfsnn", "Npfsny", "Npfsvy", "Npmpay", "Npmpdy", "Npmpgy", "Npmpiy", "Npmpny", - "Npmpvy", "Npmsay", "Npmsdn", "Npmsdy", "Npmsgn", "Npmsgy", "Npmsiy", "Npmsly", - "Npmsnn", "Npmsny", "Npmsvy", "Npnsan", "Npnsnn", "P-----a", "P-----r", "P----an", - "P----ar", "P----dn", "P----dr", "P----gn", "P----gr", "P----in", "P----ir", - "P----ln", "P----nn", "P---p-a", "P---paa", "P---pan", "P---pda", "P---pdn", - "P---pga", "P---pgn", "P---pia", "P---pin", "P---pla", "P---pln", "P---pna", - "P---pnn", "P---san", "P---sar", "P---sdn", "P---sdr", "P---sga", "P---sgn", - "P---sgr", "P---sia", "P---sin", "P---sir", "P---sln", "P---snn", "P--f-aa", - "P--f-la", "P--fpaa", "P--fs-a", "P--fsaa", "P--fsan", "P--fsda", "P--fsdn", - "P--fsga", "P--fsgn", "P--fsia", "P--fsin", "P--fsla", "P--fsln", "P--fsna", - "P--fsnn", "P--m-aa", "P--m-ga", "P--m-ia", "P--m-la", "P--mpga", "P--ms-a", - "P--msaa", "P--msan", "P--msda", "P--msdn", "P--msga", "P--msgn", "P--msia", - "P--msin", "P--msla", "P--msln", "P--msna", "P--msnn", "P--n-an", "P--n-ga", - "P--n-la", "P--n-na", "P--npan", "P--npgn", "P--npnn", "P--ns-a", "P--nsaa", - "P--nsan", "P--nsda", "P--nsdn", "P--nsga", "P--nsgn", "P--nsia", "P--nsin", - "P--nsla", "P--nsln", "P--nsna", "P--nsnn", "P-1-pan", "P-1-pdn", "P-1-pgn", - "P-1-pin", "P-1-pln", "P-1-pnn", "P-1-san", "P-1-sdn", "P-1-sgn", "P-1-sin", - "P-1-sln", "P-1-snn", "P-1nsnn", "P-2-pan", "P-2-pdn", "P-2-pgn", "P-2-pin", - "P-2-pln", "P-2-pnn", "P-2-san", "P-2-sdn", "P-2-sgn", "P-2-sin", "P-2-sln", - "P-2-snn", "P-2msdn", "P-2nsan", "P-3-pan", "P-3-pdn", "P-3-pgn", "P-3-pin", - "P-3-pln", "P-3-pnn", "P-3-san", "P-3fsan", "P-3fsdn", "P-3fsgn", "P-3fsin", - "P-3fsln", "P-3fsnn", "P-3msan", "P-3msdn", "P-3msgn", "P-3msin", "P-3msln", - "P-3msnn", "P-3nsan", "P-3nsdn", "P-3nsgn", "P-3nsin", "P-3nsln", "P-3nsnn", "Q", - "R", "Rc", "SENT", "Sp-a", "Sp-d", "Sp-g", "Sp-i", "Sp-l", "Sp-n", "Vmg----a-p", - "Vmg----m-p", "Vmgp---a-e", "Vmgp---a-p", "Vmgp---m-e", "Vmgp---m-p", "Vmgs---a-e", - "Vmgs---a-p", "Vmgs---m-e", "Vmgs---m-p", "Vmi-1--a-e", "Vmif1p-a-e", "Vmif1p-a-p", - "Vmif1p-m-p", "Vmif1s-a-e", "Vmif1s-a-p", "Vmif1s-m-p", "Vmif2p-a-e", "Vmif2p-a-p", - "Vmif2p-m-p", "Vmif2s-a-e", "Vmif2s-a-p", "Vmif2s-m-p", "Vmif3p-a-e", "Vmif3p-a-p", - "Vmif3p-m-p", "Vmif3s-a-e", "Vmif3s-a-p", "Vmif3s-m-p", "Vmip---m-e", "Vmip1p-a-e", - "Vmip1p-a-p", "Vmip1p-m-e", "Vmip1s-a-e", "Vmip1s-a-p", "Vmip1s-m-e", "Vmip2p-a-e", - "Vmip2p-m-e", "Vmip2s-a-e", "Vmip2s-m-e", "Vmip3p-a-e", "Vmip3p-a-p", "Vmip3p-m-e", - "Vmip3p-p-e", "Vmip3s-a-e", "Vmip3s-m-e", "Vmip3s-p-e", "Vmis---a-e", "Vmis---a-p", - "Vmis---m-e", "Vmis--nm-e", "Vmis-p-a-e", "Vmis-p-a-p", "Vmis-p-m-e", "Vmis-p-m-p", - "Vmis-p-p-e", "Vmis-s-a-e", "Vmis-s-a-p", "Vmis-sfa-e", "Vmis-sfa-p", "Vmis-sfm-e", - "Vmis-sfm-p", "Vmis-sfp-e", "Vmis-sma-e", "Vmis-sma-p", "Vmis-smm-e", "Vmis-smm-p", - "Vmis-smp-e", "Vmis-smp-p", "Vmis-sna-e", "Vmis-sna-p", "Vmis-snm-e", "Vmis-snm-p", - "Vmis-snp-e", "Vmm--s-a-e", "Vmm-1p-a-e", "Vmm-1p-a-p", "Vmm-1p-m-p", "Vmm-1s-a-e", - "Vmm-1s-a-p", "Vmm-1s-m-p", "Vmm-2--a-e", "Vmm-2--a-p", "Vmm-2p-a-e", "Vmm-2p-a-p", - "Vmm-2p-m-e", "Vmm-2p-m-p", "Vmm-2s-a-e", "Vmm-2s-a-p", "Vmm-2s-m-e", "Vmm-2s-m-p", - "Vmn----a-e", "Vmn----a-p", "Vmn----m-e", "Vmn----m-p", "Vmn----p-e", - "Vmpp-p-a-ea", "Vmpp-p-a-ed", "Vmpp-p-a-eg", "Vmpp-p-a-ei", "Vmpp-p-a-el", - "Vmpp-p-a-en", "Vmpp-p-afea", "Vmpp-p-afed", "Vmpp-p-afeg", "Vmpp-p-afei", - "Vmpp-p-afel", "Vmpp-p-afen", "Vmpp-p-m-ea", "Vmpp-p-m-ed", "Vmpp-p-m-eg", - "Vmpp-p-m-ei", "Vmpp-p-m-el", "Vmpp-p-m-en", "Vmpp-p-mfea", "Vmpp-p-mfed", - "Vmpp-p-mfeg", "Vmpp-p-mfei", "Vmpp-p-mfel", "Vmpp-p-mfen", "Vmpp-p-p-ea", - "Vmpp-p-p-ed", "Vmpp-p-p-eg", "Vmpp-p-p-en", "Vmpp-p-pfea", "Vmpp-p-pfed", - "Vmpp-p-pfeg", "Vmpp-p-pfei", "Vmpp-p-pfel", "Vmpp-p-pfen", "Vmpp-p-pse", - "Vmpp-pma-eg", "Vmpp-s-a-ei", "Vmpp-s-afei", "Vmpp-sfa-ea", "Vmpp-sfa-ed", - "Vmpp-sfa-eg", "Vmpp-sfa-ei", "Vmpp-sfa-el", "Vmpp-sfa-en", "Vmpp-sfafea", - "Vmpp-sfafed", "Vmpp-sfafeg", "Vmpp-sfafei", "Vmpp-sfafel", "Vmpp-sfafen", - "Vmpp-sfm-ea", "Vmpp-sfm-ed", "Vmpp-sfm-eg", "Vmpp-sfm-ei", "Vmpp-sfm-el", - "Vmpp-sfm-en", "Vmpp-sfmfea", "Vmpp-sfmfed", "Vmpp-sfmfeg", "Vmpp-sfmfei", - "Vmpp-sfmfel", "Vmpp-sfmfen", "Vmpp-sfp-ea", "Vmpp-sfp-eg", "Vmpp-sfp-ei", - "Vmpp-sfp-el", "Vmpp-sfp-en", "Vmpp-sfpfea", "Vmpp-sfpfed", "Vmpp-sfpfeg", - "Vmpp-sfpfei", "Vmpp-sfpfel", "Vmpp-sfpfen", "Vmpp-sfpse", "Vmpp-sma-ea", - "Vmpp-sma-ed", "Vmpp-sma-eg", "Vmpp-sma-ei", "Vmpp-sma-el", "Vmpp-sma-en", - "Vmpp-smafea", "Vmpp-smafed", "Vmpp-smafeg", "Vmpp-smafei", "Vmpp-smafel", - "Vmpp-smafen", "Vmpp-smase", "Vmpp-smm-ea", "Vmpp-smm-ed", "Vmpp-smm-eg", - "Vmpp-smm-ei", "Vmpp-smm-el", "Vmpp-smm-en", "Vmpp-smmfea", "Vmpp-smmfed", - "Vmpp-smmfeg", "Vmpp-smmfei", "Vmpp-smmfel", "Vmpp-smmfen", "Vmpp-smp-ea", - "Vmpp-smp-eg", "Vmpp-smp-ei", "Vmpp-smp-el", "Vmpp-smp-en", "Vmpp-smpfea", - "Vmpp-smpfed", "Vmpp-smpfeg", "Vmpp-smpfei", "Vmpp-smpfel", "Vmpp-smpfen", - "Vmpp-smpse", "Vmpp-sna-ea", "Vmpp-sna-ed", "Vmpp-sna-eg", "Vmpp-sna-ei", - "Vmpp-sna-el", "Vmpp-sna-en", "Vmpp-snafea", "Vmpp-snafed", "Vmpp-snafeg", - "Vmpp-snafei", "Vmpp-snafel", "Vmpp-snafen", "Vmpp-snm-ea", "Vmpp-snm-ed", - "Vmpp-snm-eg", "Vmpp-snm-ei", "Vmpp-snm-en", "Vmpp-snmfea", "Vmpp-snmfed", - "Vmpp-snmfeg", "Vmpp-snmfei", "Vmpp-snmfel", "Vmpp-snmfen", "Vmpp-snp-ea", - "Vmpp-snp-ed", "Vmpp-snp-eg", "Vmpp-snp-ei", "Vmpp-snp-en", "Vmpp-snpfea", - "Vmpp-snpfed", "Vmpp-snpfeg", "Vmpp-snpfei", "Vmpp-snpfel", "Vmpp-snpfen", - "Vmpp-snpse", "Vmps-p-a-ea", "Vmps-p-a-ed", "Vmps-p-a-eg", "Vmps-p-a-ei", - "Vmps-p-a-el", "Vmps-p-a-en", "Vmps-p-a-pa", "Vmps-p-a-pd", "Vmps-p-a-pg", - "Vmps-p-a-pi", "Vmps-p-a-pl", "Vmps-p-a-pn", "Vmps-p-afea", "Vmps-p-afed", - "Vmps-p-afeg", "Vmps-p-afei", "Vmps-p-afel", "Vmps-p-afen", "Vmps-p-afpa", - "Vmps-p-afpd", "Vmps-p-afpg", "Vmps-p-afpi", "Vmps-p-afpl", "Vmps-p-afpn", - "Vmps-p-m-ea", "Vmps-p-m-eg", "Vmps-p-m-ei", "Vmps-p-m-el", "Vmps-p-m-en", - "Vmps-p-m-pa", "Vmps-p-m-pd", "Vmps-p-m-pg", "Vmps-p-m-pi", "Vmps-p-m-pl", - "Vmps-p-m-pn", "Vmps-p-mfea", "Vmps-p-mfed", "Vmps-p-mfeg", "Vmps-p-mfei", - "Vmps-p-mfel", "Vmps-p-mfen", "Vmps-p-mfpa", "Vmps-p-mfpd", "Vmps-p-mfpg", - "Vmps-p-mfpi", "Vmps-p-mfpl", "Vmps-p-mfpn", "Vmps-p-p-ed", "Vmps-p-p-eg", - "Vmps-p-p-ei", "Vmps-p-p-en", "Vmps-p-p-pa", "Vmps-p-p-pd", "Vmps-p-p-pg", - "Vmps-p-p-pi", "Vmps-p-p-pl", "Vmps-p-p-pn", "Vmps-p-pfea", "Vmps-p-pfed", - "Vmps-p-pfeg", "Vmps-p-pfei", "Vmps-p-pfel", "Vmps-p-pfen", "Vmps-p-pfpa", - "Vmps-p-pfpd", "Vmps-p-pfpg", "Vmps-p-pfpi", "Vmps-p-pfpl", "Vmps-p-pfpn", - "Vmps-p-pse", "Vmps-p-psp", "Vmps-s-pfpa", "Vmps-s-pfpn", "Vmps-sfa-ea", - "Vmps-sfa-ed", "Vmps-sfa-eg", "Vmps-sfa-ei", "Vmps-sfa-el", "Vmps-sfa-en", - "Vmps-sfa-pa", "Vmps-sfa-pd", "Vmps-sfa-pg", "Vmps-sfa-pi", "Vmps-sfa-pl", - "Vmps-sfa-pn", "Vmps-sfafea", "Vmps-sfafed", "Vmps-sfafeg", "Vmps-sfafei", - "Vmps-sfafel", "Vmps-sfafen", "Vmps-sfafpa", "Vmps-sfafpd", "Vmps-sfafpg", - "Vmps-sfafpi", "Vmps-sfafpl", "Vmps-sfafpn", "Vmps-sfm-ea", "Vmps-sfm-eg", - "Vmps-sfm-el", "Vmps-sfm-en", "Vmps-sfm-pa", "Vmps-sfm-pd", "Vmps-sfm-pg", - "Vmps-sfm-pi", "Vmps-sfm-pl", "Vmps-sfm-pn", "Vmps-sfmfea", "Vmps-sfmfed", - "Vmps-sfmfeg", "Vmps-sfmfei", "Vmps-sfmfel", "Vmps-sfmfen", "Vmps-sfmfpa", - "Vmps-sfmfpd", "Vmps-sfmfpg", "Vmps-sfmfpi", "Vmps-sfmfpl", "Vmps-sfmfpn", - "Vmps-sfp-ea", "Vmps-sfp-ed", "Vmps-sfp-eg", "Vmps-sfp-ei", "Vmps-sfp-en", - "Vmps-sfp-pa", "Vmps-sfp-pd", "Vmps-sfp-pg", "Vmps-sfp-pi", "Vmps-sfp-pl", - "Vmps-sfp-pn", "Vmps-sfpfea", "Vmps-sfpfed", "Vmps-sfpfeg", "Vmps-sfpfei", - "Vmps-sfpfel", "Vmps-sfpfen", "Vmps-sfpfpa", "Vmps-sfpfpd", "Vmps-sfpfpg", - "Vmps-sfpfpi", "Vmps-sfpfpl", "Vmps-sfpfpn", "Vmps-sfpse", "Vmps-sfpsp", - "Vmps-sma-ea", "Vmps-sma-ed", "Vmps-sma-eg", "Vmps-sma-ei", "Vmps-sma-el", - "Vmps-sma-en", "Vmps-sma-pa", "Vmps-sma-pd", "Vmps-sma-pg", "Vmps-sma-pi", - "Vmps-sma-pl", "Vmps-sma-pn", "Vmps-smafea", "Vmps-smafed", "Vmps-smafeg", - "Vmps-smafei", "Vmps-smafel", "Vmps-smafen", "Vmps-smafpa", "Vmps-smafpd", - "Vmps-smafpg", "Vmps-smafpi", "Vmps-smafpl", "Vmps-smafpn", "Vmps-smm-ea", - "Vmps-smm-ed", "Vmps-smm-eg", "Vmps-smm-ei", "Vmps-smm-en", "Vmps-smm-pa", - "Vmps-smm-pd", "Vmps-smm-pg", "Vmps-smm-pi", "Vmps-smm-pl", "Vmps-smm-pn", - "Vmps-smmfea", "Vmps-smmfeg", "Vmps-smmfei", "Vmps-smmfel", "Vmps-smmfen", - "Vmps-smmfpa", "Vmps-smmfpd", "Vmps-smmfpg", "Vmps-smmfpi", "Vmps-smmfpl", - "Vmps-smmfpn", "Vmps-smp-ea", "Vmps-smp-eg", "Vmps-smp-ei", "Vmps-smp-en", - "Vmps-smp-pa", "Vmps-smp-pd", "Vmps-smp-pg", "Vmps-smp-pi", "Vmps-smp-pl", - "Vmps-smp-pn", "Vmps-smpfea", "Vmps-smpfed", "Vmps-smpfeg", "Vmps-smpfei", - "Vmps-smpfel", "Vmps-smpfen", "Vmps-smpfpa", "Vmps-smpfpd", "Vmps-smpfpg", - "Vmps-smpfpi", "Vmps-smpfpl", "Vmps-smpfpn", "Vmps-smpse", "Vmps-smpsp", - "Vmps-sna-ea", "Vmps-sna-eg", "Vmps-sna-ei", "Vmps-sna-el", "Vmps-sna-en", - "Vmps-sna-p", "Vmps-sna-pa", "Vmps-sna-pd", "Vmps-sna-pg", "Vmps-sna-pi", - "Vmps-sna-pl", "Vmps-sna-pn", "Vmps-snafea", "Vmps-snafed", "Vmps-snafeg", - "Vmps-snafei", "Vmps-snafel", "Vmps-snafen", "Vmps-snafpa", "Vmps-snafpd", - "Vmps-snafpg", "Vmps-snafpi", "Vmps-snafpl", "Vmps-snafpn", "Vmps-snm-ea", - "Vmps-snm-eg", "Vmps-snm-en", "Vmps-snm-pa", "Vmps-snm-pg", "Vmps-snm-pi", - "Vmps-snm-pl", "Vmps-snm-pn", "Vmps-snmfea", "Vmps-snmfed", "Vmps-snmfeg", - "Vmps-snmfei", "Vmps-snmfel", "Vmps-snmfen", "Vmps-snmfpa", "Vmps-snmfpd", - "Vmps-snmfpg", "Vmps-snmfpi", "Vmps-snmfpl", "Vmps-snmfpn", "Vmps-snp-el", - "Vmps-snp-p", "Vmps-snp-pa", "Vmps-snp-pd", "Vmps-snp-pg", "Vmps-snp-pi", - "Vmps-snp-pl", "Vmps-snp-pn", "Vmps-snpfea", "Vmps-snpfeg", "Vmps-snpfen", - "Vmps-snpfpa", "Vmps-snpfpd", "Vmps-snpfpg", "Vmps-snpfpi", "Vmps-snpfpl", - "Vmps-snpfpn", "Vmps-snpse", "Vmps-snpsp" }; - - runTest("ru", "msd", tagset, "Это тест .", - new String[] { "это", "тест", "." }, - new String[] { "P--nsnn", "Ncmsnn", "SENT" }, - new String[] { "POS_PRON", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - @Ignore("Slovene model currently not in Artifactory because we do not know tagset yet") - public void testSlovene() - throws Exception - { - String[] tagset = { }; - - runTest("sl", null, tagset, "To je test .", - new String[] { "ta", "biti", "test", "." }, - new String[] { "zk-sei----s", "gvpste--n", "somei", "SENT" }, - new String[] { "POS", "POS", "POS", "POS" }); - - runTest("sl", null, tagset, "Gremo na Češko za kosilo .", - new String[] { "iti", "na", "Češko", "za", "kosilo", "." }, - new String[] { "gppspm--n-----d", "dpet", "slmei", "dpet", "soset", "SENT" }, - new String[] { "POS", "POS", "POS", "POS", "POS", "POS" }); - } - - @Test - public void testSlovak() - throws Exception - { - String[] tagset = { "!", "\"", "#", "%", "(", ")", ",", ".", "0", ":", ";", "?", "Apx", - "Apy", "Apz", "Asx", "Asy", "Asz", "Dx", "Dy", "Dz", "E", "Gpx", "Gpy", "Gpz", - "Gsx", "Gsy", "Gsz", "J", "ND", "Np", "Ns", "O", "OY", "PD", "Pp", "Ps", "Q", "R", - "Sp", "Ss", "T", "TY", "VBpa", "VBpb", "VBpc", "VBsa", "VBsb", "VBsc", "VH", "VI", - "VKpa", "VKpb", "VKpc", "VKsa", "VKsb", "VKsc", "VLpa", "VLpb", "VLpc", "VLsa", - "VLsb", "VLsc", "VMpa", "VMpb", "VMsb", "W", "Y", "Z", "par" }; - - runTest("sk", "smt-reduced", tagset, "To je test .", - new String[] { "to", "byť", "test", "." }, - new String[] { "Ps", "VKsc", "Ss", "." }, - new String[] { "POS_PRON", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } - - @Test - public - void testChinese() - throws Exception - { - String[] tagset = { "a", "ad", "ag", "an", "b", "bg", "c", "d", "dg", "e", "ew", "f", "g", - "h", "i", "j", "k", "l", "m", "mg", "n", "nd", "ng", "nh", "ni", "nl", "nr", "ns", - "nt", "nx", "nz", "o", "p", "q", "r", "rg", "s", "t", "tg", "u", "v", "vd", "vg", - "vn", "w", "wp", "ws", "x", "y", "z" }; - - // The rudder often in the wake of the wind round the back of the area. - runTest("zh", "lcmc", tagset, "尾 舵 常 处于 风轮 后面 的 尾流 区里 。", - new String[] { "_", "_", "_", "_", "风轮", "_", "_", "_", "_", "_" }, - new String[] { "ng", "n", "d", "v", "n", "f", "u", "n", "nl", "ew" }, - new String[] { "POS_NOUN", "POS_NOUN", "POS_ADV", "POS_VERB", "POS_NOUN", "POS_X", "POS_AUX", "POS_NOUN", "POS_X", "POS_PUNCT" }); - - // The service sector has become an important engine of Guangdong's economic transformation - // and upgrading. - runTest("zh", "lcmc", tagset, "服务业 成为 广东 经济 转型 升级 的 重要 引擎 。", - new String[] { "_", "_", "_", "_", "_", "_", "_", "_", "_", "_" }, - new String[] { "n", "v", "ns", "n", "v", "v", "u", "a", "n", "ew" }, - new String[] { "POS_NOUN", "POS_VERB", "POS_PROPN", "POS_NOUN", "POS_VERB", "POS_VERB", "POS_AUX", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - - // How far is China from the world brand? - runTest("zh", "lcmc", tagset, "中国 离 世界 技术 品牌 有 多远 ?", - new String[] { "_", "_", "_", "_", "_", "_", "多远", "_" }, - new String[] { "ns", "v", "n", "n", "n", "v", "n", "ew" }, - new String[] { "POS_PROPN", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } - - @Test -// @Ignore("Platform specific") - public void testOddCharacters() - throws Exception - { - runTest("en", null, null, "² § ¶ § °", - new String[] { "²", "§", "¶", "§", "°" }, - new String[] { "NN", "SYM", "NN", "SYM", "SYM" }, - new String[] { "POS_NOUN", "POS_SYM", "POS_NOUN", "POS_SYM", "POS_SYM" }); - } - - /** - * Generate a very large document and test it. - */ - @Test - @Ignore("Ignoring test to avoid memory errors (see issue #850 in GitHub") - public void hugeDocumentTest() - throws Exception - { - // Start Java with -Xmx512m - boolean run = Runtime.getRuntime().maxMemory() > (500000000); - if (!run) { - System.out.println("Test requires more heap than available, skipping"); - } - Assume.assumeTrue(run); - - // Disable trace as this significantly slows down the test - TreeTaggerWrapper.TRACE = false; - - String text = "This is a test ."; - int reps = 4000000 / text.length(); - String testString = repeat(text, " ", reps); - - JCas jcas = runTest("en", null, null, testString, null, null, null); - List<POS> actualTags = new ArrayList<POS>(select(jcas, POS.class)); - assertEquals(reps * 5, actualTags.size()); - - // test POS annotations - String[] expectedTags = new String[] { "DT", "VBZ", "DT", "NN", "SENT" }; - String[] expectedTagClasses = new String[] { "POS_ART", "POS_V", "POS_ART", "POS_NN", "POS_PUNC" }; - - for (int i = 0; i < actualTags.size(); i++) { - POS posAnnotation = actualTags.get(i); - assertEquals("In position "+i, expectedTagClasses[i%5], posAnnotation.getType().getShortName()); - assertEquals("In position "+i, expectedTags[i%5], posAnnotation.getPosValue()); - } - - System.out.println("Successfully tagged document with " + testString.length() + - " characters"); - } - - /** - * Test using the same AnalysisEngine multiple times. - */ - @Test - public void multiDocumentTest() throws Exception - { - checkModelsAndBinary("en"); - - String testDocument = "This is a test ."; - String[] lemmas = { "this", "be", "a", "test", "." }; - String[] tags = { "DT", "VBZ", "DT", "NN", "SENT" }; - String[] tagClasses = { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }; - - AnalysisEngine engine = createEngine(TreeTaggerPosTagger.class); - - HideOutput hideOut = new HideOutput(); - try { - - for (int n = 0; n < 100; n++) { - JCas aJCas = TestRunner.runTest(engine, "en", testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(aJCas, POS.class)); - AssertAnnotations.assertLemma(lemmas, select(aJCas, Lemma.class)); - } - } - finally { - engine.destroy(); - hideOut.restoreOutput(); - } - } - - /** - * Run the {@link #hugeDocumentTest()} 100 times. - */ - @Test - @Ignore("This test takes a very long time. Only include it if you need to "+ - "test the stability of the annotator") - public void loadTest() - throws Exception - { - for (int i = 0; i < 100; i++) { - System.out.println("Load test iteration " + i); - hugeDocumentTest(); - } - } - - private void checkModelsAndBinary(String lang) - { - Assume.assumeTrue(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/tagger-" + lang + "-le.bin") != null); - - Assume.assumeTrue(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/treetagger/bin/LICENSE.txt") != null || - System.getProperty("treetagger.home") != null); - } - - private JCas runTest(String language, String tagsetName, String[] tagset, String testDocument, - String[] lemmas, String[] tags, String[] tagClasses) - throws Exception - { - checkModelsAndBinary(language); - - AnalysisEngine engine = createEngine(TreeTaggerPosTagger.class, - TreeTaggerPosTagger.PARAM_PRINT_TAGSET, true); - - JCas aJCas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertLemma(lemmas, select(aJCas, Lemma.class)); - AssertAnnotations.assertPOS(tagClasses, tags, select(aJCas, POS.class)); - if (tagset != null) { - AssertAnnotations.assertTagset(POS.class, tagsetName, tagset, aJCas); - } - - return aJCas; - } - - /** - * Test using the same AnalysisEngine multiple times. - */ - @Test - public void longTokenTest() - throws Exception - { - checkModelsAndBinary("en"); - - AnalysisEngine engine = createEngine(TreeTaggerPosTagger.class); - JCas jcas = engine.newJCas(); - - try { - for (int n = 99990; n < 100000; n ++) { - System.out.println(n); - jcas.setDocumentLanguage("en"); - JCasBuilder builder = new JCasBuilder(jcas); - builder.add("Start", Token.class); - builder.add("with", Token.class); - builder.add("good", Token.class); - builder.add("tokens", Token.class); - builder.add(".", Token.class); - builder.add(StringUtils.repeat("b", n), Token.class); - builder.add("End", Token.class); - builder.add("with", Token.class); - builder.add("some", Token.class); - builder.add("good", Token.class); - builder.add("tokens", Token.class); - builder.add(".", Token.class); - builder.close(); - engine.process(jcas); - jcas.reset(); - } - } - finally { - engine.destroy(); - } - } - - /** - * Runs a small pipeline on a text containing quite odd characters such as - * Unicode LEFT-TO-RIGHT-MARKs. The BreakIteratorSegmenter creates tokens from these - * which are send to TreeTagger as tokens containing line breaks or only - * whitespace. TreeTaggerPosLemmaTT4J has to filter these tokens before - * they reach the TreeTaggerWrapper. - */ -// @Test -// public -// void testStrangeDocument() -// throws Exception -// { -// CollectionReader reader = createReader( -// FileSystemReader.class, -// createTypeSystemDescription(), -// FileSystemReader.PARAM_INPUTDIR, getTestResource( -// "test_files/annotator/TreeTaggerPosLemmaTT4J/strange")); -// -// AnalysisEngine sentenceSplitter = createEngine( -// BreakIteratorSegmenter.class, -// tsd); -// -// AnalysisEngine tt = createEngine(TreeTaggerPosLemmaTT4J.class, tsd, -// TreeTaggerTT4JBase.PARAM_LANGUAGE_CODE, "en"); -// -// runPipeline(reader, sentenceSplitter, tt); -// } - -// @Test -// @Ignore("This test should fail, however - due to fixes in the Tokenizer, " + -// "we can currently not provokate a failure with the given 'strange' " + -// "document.") -// public -// void testStrangeDocumentFail() -// throws Exception -// { -// CollectionReader reader = createReader( -// FileSystemReader.class, -// createTypeSystemDescription(), -// FileSystemReader.PARAM_INPUTDIR, getTestResource( -// "test_files/annotator/TreeTaggerPosLemmaTT4J/strange")); -// -// AnalysisEngine sentenceSplitter = createEngine( -// BreakIteratorSegmenter.class, -// tsd); -// -// AnalysisEngine tt = createEngine(TreeTaggerPosLemmaTT4J.class, tsd, -// TreeTaggerTT4JBase.PARAM_LANGUAGE_CODE, "en", -// TreeTaggerTT4JBase.PARAM_PERFORMANCE_MODE, true); -// -// runPipeline( -// reader, -// sentenceSplitter, -// tt); -// } - - /** - * When running this test, check manually if TreeTagger is restarted - * between the documents. If you jank up the log levels, that should be - * visible on the console. Unfortunately we cannot easily access the - * restartCount of the TreeTaggerWrapper. - */ -// @Test -// public -// void testRealMultiDocument() -// throws Exception -// { -// CollectionReader reader = createReader( -// FileSystemReader.class, -// createTypeSystemDescription(), -// FileSystemReader.PARAM_INPUTDIR, getTestResource( -// "test_files/annotator/TreeTaggerPosLemmaTT4J/multiDoc")); -// -// AnalysisEngine sentenceSplitter = createEngine( -// BreakIteratorSegmenter.class, -// tsd); -// -// AnalysisEngine tt = createEngine(TreeTaggerPosLemmaTT4J.class, tsd, -// TreeTaggerTT4JBase.PARAM_LANGUAGE_CODE, "en"); -// -// runPipeline( -// reader, -// sentenceSplitter, -// tt); -// } - - /* - * Uncomment to test explicitly setting model/binary locations - */ -// @Test -// public void testExplicitBinaryModel() throws Exception -// { -// AnalysisEngine tt = createEngine(TreeTaggerPosTagger.class, -// TreeTaggerPosTagger.PARAM_EXECUTABLE_PATH, -// "/Applications/tree-tagger-MacOSX-3.2-intel/bin/tree-tagger", -// TreeTaggerPosTagger.PARAM_MODEL_LOCATION, -// "/Applications/tree-tagger-MacOSX-3.2-intel/models/german-par-linux-3.2-utf8.bin", -// TreeTaggerPosTagger.PARAM_MODEL_ENCODING, "UTF-8"); -// -// JCas jcas = JCasFactory.createJCas(); -// jcas.setDocumentLanguage("de"); -// -// TokenBuilder<Token, Sentence> tb = new TokenBuilder<Token, Sentence>(Token.class, -// Sentence.class); -// tb.buildTokens(jcas, "Dies ist ein test ."); -// -// tt.process(jcas); -// } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/SegmenterCompatibilityTest.java b/dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/SegmenterCompatibilityTest.java similarity index 78% rename from dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/SegmenterCompatibilityTest.java rename to dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/SegmenterCompatibilityTest.java index 5c7809c1a0..80980ab6e8 100644 --- a/dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/SegmenterCompatibilityTest.java +++ b/dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/SegmenterCompatibilityTest.java @@ -15,10 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.treetagger; +package org.dkpro.core.treetagger; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; import static org.junit.Assert.assertEquals; @@ -27,21 +27,21 @@ import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.jcas.JCas; +import org.dkpro.core.tokit.BreakIteratorSegmenter; +import org.dkpro.core.treetagger.TreeTaggerPosTagger; import org.junit.Assume; import org.junit.Before; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter; -public -class SegmenterCompatibilityTest +public class SegmenterCompatibilityTest { - @Before - public void initTrace() - { - // TreeTaggerWrapper.TRACE = true; - } + @Before + public void initTrace() + { + // TreeTaggerWrapper.TRACE = true; + } @Test public void segmenterCompatibilityTest() throws Exception @@ -62,19 +62,20 @@ public void segmenterCompatibilityTest() throws Exception select(aJCas, Lemma.class)); } - private void checkLemma(String[] expected, Collection<Lemma> actual) - { + private void checkLemma(String[] expected, Collection<Lemma> actual) + { int i = 0; for (Lemma lemmaAnnotation : actual) { - assertEquals("In position "+i, expected[i], lemmaAnnotation.getValue()); + assertEquals("In position " + i, expected[i], lemmaAnnotation.getValue()); i++; } - } + } private void checkModelsAndBinary(String lang) { - Assume.assumeTrue(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/tagger-" + lang + "-le.bin") != null); + Assume.assumeTrue( + getClass().getResource("/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/tagger-" + + lang + "-le.bin") != null); Assume.assumeTrue(getClass().getResource( "/de/tudarmstadt/ukp/dkpro/core/treetagger/bin/LICENSE.txt") != null); diff --git a/dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerChunkerTest.java b/dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/TreeTaggerChunkerTest.java similarity index 83% rename from dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerChunkerTest.java rename to dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/TreeTaggerChunkerTest.java index ed36b9f146..27f328979b 100644 --- a/dkpro-core-treetagger-asl/src/test/java/de/tudarmstadt/ukp/dkpro/core/treetagger/TreeTaggerChunkerTest.java +++ b/dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/TreeTaggerChunkerTest.java @@ -15,13 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package de.tudarmstadt.ukp.dkpro.core.treetagger; +package org.dkpro.core.treetagger; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertChunks; -import static de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations.assertTagset; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.testing.AssertAnnotations.assertChunks; +import static org.dkpro.core.testing.AssertAnnotations.assertTagset; import static org.junit.Assert.assertEquals; import org.apache.uima.analysis_engine.AnalysisEngine; @@ -30,14 +30,16 @@ import org.apache.uima.fit.testing.factory.TokenBuilder; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.dkpro.core.treetagger.TreeTaggerChunker; +import org.dkpro.core.treetagger.TreeTaggerPosTagger; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class TreeTaggerChunkerTest { @@ -149,13 +151,13 @@ private JCas runTest(String aLanguage, String aVariant, String aText) return TestRunner.runTest(aggregate, aLanguage, aText); } - private JCas runTest(String aLanguage, String aText, String[] aLemmas, String[] aTags, - String[] aTagClasses) - throws Exception - { - AnalysisEngine tagger = createEngine(TreeTaggerPosTagger.class); + private JCas runTest(String aLanguage, String aText, String[] aLemmas, String[] aTags, + String[] aTagClasses) + throws Exception + { + AnalysisEngine tagger = createEngine(TreeTaggerPosTagger.class); AnalysisEngine chunker = createEngine(TreeTaggerChunker.class, - TreeTaggerPosTagger.PARAM_PRINT_TAGSET, true); + TreeTaggerPosTagger.PARAM_PRINT_TAGSET, true); JCas aJCas = JCasFactory.createJCas(); aJCas.setDocumentLanguage(aLanguage); @@ -168,14 +170,16 @@ private JCas runTest(String aLanguage, String aText, String[] aLemmas, String[] // test Chunk annotations if (aTagClasses != null && aTags != null) { - int i = 0; - for (Chunk posAnnotation : select(aJCas, Chunk.class)) { - System.out.println(posAnnotation.getChunkValue()+": ["+posAnnotation.getCoveredText()+"]"); - assertEquals("In position "+i, aTagClasses[i], posAnnotation.getType().getShortName()); - assertEquals("In position "+i, aTags[i], posAnnotation.getChunkValue()); - i++; - } - assertEquals(aTags.length, i); + int i = 0; + for (Chunk posAnnotation : select(aJCas, Chunk.class)) { + System.out.println(posAnnotation.getChunkValue() + ": [" + + posAnnotation.getCoveredText() + "]"); + assertEquals("In position " + i, aTagClasses[i], + posAnnotation.getType().getShortName()); + assertEquals("In position " + i, aTags[i], posAnnotation.getChunkValue()); + i++; + } + assertEquals(aTags.length, i); } return aJCas; @@ -183,8 +187,9 @@ private JCas runTest(String aLanguage, String aText, String[] aLemmas, String[] private void checkModelsAndBinary(String lang) { - Assume.assumeTrue(getClass().getResource( - "/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/chunker-" + lang + "-le.bin") != null); + Assume.assumeTrue( + getClass().getResource("/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/chunker-" + + lang + "-le.bin") != null); Assume.assumeTrue(getClass().getResource( "/de/tudarmstadt/ukp/dkpro/core/treetagger/bin/LICENSE.txt") != null); diff --git a/dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/TreeTaggerPosTaggerTest.java b/dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/TreeTaggerPosTaggerTest.java new file mode 100644 index 0000000000..a6328c5731 --- /dev/null +++ b/dkpro-core-treetagger-asl/src/test/java/org/dkpro/core/treetagger/TreeTaggerPosTaggerTest.java @@ -0,0 +1,1547 @@ +/* + * Copyright 2017 + * Ubiquitous Knowledge Processing (UKP) Lab + * Technische Universität Darmstadt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.treetagger; + +import static org.apache.commons.lang3.StringUtils.repeat; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.apache.uima.fit.util.JCasUtil.select; +import static org.junit.Assert.assertEquals; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import org.annolab.tt4j.TreeTaggerWrapper; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.factory.JCasBuilder; +import org.apache.uima.fit.testing.util.HideOutput; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; +import org.dkpro.core.treetagger.TreeTaggerPosTagger; +import org.junit.Assume; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; + +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +public class TreeTaggerPosTaggerTest +{ + @Before + public void initTrace() + { + // TreeTaggerWrapper.TRACE = true; + } + + @Test + public void testEnglishAutoDownload() throws Exception + { + Assume.assumeTrue(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/treetagger/bin/LICENSE.txt") != null || + System.getProperty("treetagger.home") != null); + + URL aUrl = new URL("https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/english.par.gz"); + File targetFile = File.createTempFile("model", ".bin"); + + try (InputStream input = new CompressorStreamFactory() + .createCompressorInputStream(new BufferedInputStream(aUrl.openStream())); + OutputStream target = new FileOutputStream(targetFile);) { + IOUtils.copy(input, target); + } + + AnalysisEngineDescription engine = createEngineDescription(TreeTaggerPosTagger.class, + TreeTaggerPosTagger.PARAM_MODEL_LOCATION, targetFile, + TreeTaggerPosTagger.PARAM_MODEL_ENCODING, "utf-8"); + + JCas jcas = TestRunner.runTest(engine, "en", "This is a test ."); + + String[] lemmas = { "this", "be", "a", "test", "." }; + String[] tags = { "DT", "VBZ", "DT", "NN", "SENT" }; + String[] tagClasses = { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }; + + AssertAnnotations.assertLemma(lemmas, select(jcas, Lemma.class)); + AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); + } + + @Test + public void testEnglish() throws Exception + { + String[] tagset = { "#", "$", "''", "(", ")", ",", ":", "CC", "CD", "DT", "EX", "FW", "IN", + "IN/that", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NP", "NPS", "PDT", "POS", + "PP", "PP$", "RB", "RBR", "RBS", "RP", "SENT", "SYM", "TO", "UH", "VB", "VBD", + "VBG", "VBN", "VBP", "VBZ", "VH", "VHD", "VHG", "VHN", "VHP", "VHZ", "VV", "VVD", + "VVG", "VVN", "VVP", "VVZ", "WDT", "WP", "WP$", "WRB", "``" }; + + runTest("en", "ptb-tt", tagset, "This is a test .", + new String[] { "this", "be", "a", "test", "." }, + new String[] { "DT", "VBZ", "DT", "NN", "SENT" }, + new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", "ptb-tt", tagset, "A neural net .", + new String[] { "a", "neural", "net", "." }, + new String[] { "DT", "JJ", "NN", "SENT" }, + new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + + runTest("en", "ptb-tt", tagset, "John is purchasing oranges .", + new String[] { "John", "be", "purchase", "orange", "." }, + new String[] { "NP", "VBZ", "VVG", "NNS", "SENT" }, + new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + + // TT4J per default runs TreeTagger with the -sgml option, so XML tags are not tagged + runTest("en", "ptb-tt", tagset, "My homepage is <url> http://null.dummy </url> .", + new String[] { "my", "homepage", "be", "http://null.dummy", "." }, + new String[] { "PP$", "NN", "VBZ", "JJ", "SENT" }, + new String[] { "POS_PRON", "POS_NOUN", "POS_VERB", "POS_ADJ", "POS_PUNCT" }); + } + + @Test + public void testFrench() + throws Exception + { + String[] tagset = { "ABR", "ADJ", "ADV", "DET:ART", "DET:POS", "INT", "KON", "NAM", "NOM", + "NUM", "PRO", "PRO:DEM", "PRO:IND", "PRO:PER", "PRO:POS", "PRO:REL", "PRP", + "PRP:det", "PUN", "PUN:cit", "SENT", "SYM", "VER:cond", "VER:futu", "VER:impe", + "VER:impf", "VER:infi", "VER:pper", "VER:ppre", "VER:pres", "VER:simp", "VER:subi", + "VER:subp" }; + + runTest("fr", "stein", tagset, "Ceci est un test .", + new String[] { "ceci", "être", "un", "test", "." }, + new String[] { "PRO:DEM", "VER:pres", "DET:ART", "NOM", "SENT" }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testGerman() + throws Exception + { + String[] tagset = { "$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR", + "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS", + "NE", "NN", "PAV", "PDAT", "PDS", "PIAT", "PIS", "PPER", "PPOSAT", "PPOSS", + "PRELAT", "PRELS", "PRF", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT", + "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", + "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY" }; + + runTest("de", "stts", tagset, "10 Minuten sind das Mikro an und die Bühne frei .", + new String[] { "@card@", "Minute", "sein", "die", "Mikro", "an", "und", "die", "Bühne", + "frei", "." }, + new String[] { "CARD", "NN", "VAFIN", "ART", "NN", "PTKVZ", "KON", "ART", "NN", + "PTKVZ", "$." }, + new String[] { "POS_NUM", "POS_NOUN", "POS_VERB", "POS_DET", "POS_NOUN", "POS_VERB", + "POS_CONJ", "POS_DET", "POS_NOUN", "POS_VERB", "POS_PUNCT" }); + + runTest("de", "stts", tagset, "Das ist ein Test .", + new String[] { "die", "sein", "eine", "Test", "." }, + new String[] { "PDS", "VAFIN", "ART", "NN", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testDutch() + throws Exception + { + String[] tagset = { "$.", "adj", "adj*kop", "adjabbr", "adv", "advabbr", "conjcoord", + "conjsubo", "det__art", "det__demo", "det__excl", "det__indef", "det__poss", + "det__quest", "det__rel", "int", "noun*kop", "nounabbr", "nounpl", "nounprop", + "nounsg", "num__card", "num__ord", "partte", "prep", "prepabbr", "pronadv", + "prondemo", "pronindef", "pronpers", "pronposs", "pronquest", "pronrefl", + "pronrel", "punc", "verbinf", "verbpapa", "verbpastpl", "verbpastsg", "verbpresp", + "verbprespl", "verbpressg" }; + + runTest("nl", "tt", tagset, "Dit is een test .", + new String[] { "dit", "zijn", "een", "test", "." }, + new String[] { "prondemo", "verbpressg", "det__art", "nounsg", "$." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + + runTest("nl", "tt", tagset, "10 minuten op de microfoon en vrij podium .", + new String[] { "@card@", "minuut", "op", "de", "microfoon", "en", "vrij", "podium", + "." }, + new String[] { "num__ord", "nounpl", "prep", "det__art", "nounsg", "conjcoord", + "adj", "nounsg", "$." }, + new String[] { "POS_NUM", "POS_NOUN", "POS_ADP", "POS_DET", "POS_NOUN", "POS_CONJ", + "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testMongolian() + throws Exception + { + String[] tagset = { "\"", "(", ")", ",", "-", ".", ":", "?", "@", "CC", "CD", "DC", "FR", + "IN", "JJ", "NN", "NNP", "PR", "RB", "SX", "VB", "|" }; + + runTest("mn", "tt", tagset, "Энэ нь тест юм .", + new String[] { "-", "-", "тест", "-", "-" }, + new String[] { "PR", "SX", "NN", "DC", "." }, + new String[] { "POS", "POS", "POS", "POS", "POS" }); + } + + @Test + public void testGalician() + throws Exception + { + String[] tagset = { "A0aa", "A0ap", "A0ap+P+Idfs", "A0as", "A0fa", "A0fp", "A0fs", + "A0fs+A0fs", "A0fs+Lp0+Ddms", "A0fs+P+Sp00", "A0fs+Q(", "A0fs+Scfs", + "A0fs+Vii30s+Rao3aa+Q_+P", "A0ma", "A0mp", "A0mp+Ddfs", "A0mp+Q\"", "A0ms", + "A0ms+Q_+Vei10s+Raa3fs", "A0ms+Spms", "A0ms+V0f000+Raa3ms", "A0ms+Vpi30p", + "A0ms+Wn", "A0xp", "A0xs", "Acap", "Acas", "Acfp", "Acfs", "Acmp", "Acms", "Asap", + "Asas", "Asfp", "Asfs", "Asmp", "Asms", "Asxp", "Asxs", "Cc", "Cc+A0ms", "Cc+Ddfp", + "Cc+Ddms+P", "Cc+Edfs+Infs", "Cc+La0", "Cc+La0+V0f000+Raa3ms", + "Cc+Rtn3fs+Vpi30s+Cs+Ddfs+Md3sfs+Scfs+Ddfs+Spfs+Rao3aa+Vii30s+V0f000+P+Enns", "Cs", + "Cs+Ddfp", "Cs+Ddfp+Scfp", "Cs+Ddfs", "Cs+Ddmp", "Cs+Ddms", "Cs+Difs", "Cs+La0", + "Cs+La0+P", "Cs+P", "Cs+Raa3ms", "Cs+Rad1as+Raa3ms", "Cs+Vci20s+Rad3ap", + "Cs+Ves10p", "Cs+Wn", "Ddfp", "Ddfp+Scfp+Ddms+Spms", "Ddfp+Scfp+P+Scms", + "Ddfp+Scfp+P+Scms+A0ms", "Ddfp+Spfp", "Ddfs", "Ddfs+Scfs+P+Sp00", "Ddfs+Spfs", + "Ddfs+Spfs+Lcs", "Ddfs+Spfs+Lp0", "Ddfs+Spfs+Lp0+Ddfs", "Ddfs+Spfs+P+Sp00", + "Ddfs+Spfs+P+Spm0", "Ddfs+Spfs+Q_+Ddfs", "Ddfs+Spfs+Vpi30s+Raa1ap", + "Ddfs+Spfs+Vpi30s+Rad3ms", "Ddmp", "Ddmp+P+Ncdfp", "Ddmp+Scmp+A0mp", + "Ddmp+Scmp+P+Ddfs", "Ddmp+Spmp", "Ddms", "Ddms+A0ms", "Ddms+Scms", "Ddms+Scms+P", + "Ddms+Spms", "Ddms+Spms+Ddfs", "Ddms+Spms+Lp0+Ddfp", "Ddms+Spms+P+Ddfs+Spfs", + "Ddms+Spms+P+Idmp+Ddmp", "Ddms+Spms+P+Sp00", "Ddms+Spms+P+Spm0", "Ddms+V0f000", + "Ddxp", "Ddxs", "Difp", "Difs", "Difs+Scfs", "Difs+Spfs", "Dimp", "Dims", + "Dims+Spms", "Dixp", "Dixs", "Edfp", "Edfp+Infp", "Edfs", "Edfs+Idfs", "Edfs+Infs", + "Edmp", "Edmp+Inmp", "Edms", "Edms+Idms", "Edms+Inms", "Edms+Scms", "Edms+Spms", + "Edxp", "Edxs", "Enfp", "Enfs", "Enfs+Vpi20s+Raa3fs", "Enmp", "Enms", "Enns", + "Enns+Vei10s+Raa3ms", "Enns+Vei30s+Rad1as", "Enns+Vfi30s+Rad3ap+La0", + "Enns+Vii30s+Rad1as", "Enns+Vpi30s+Raa1as", "Enns+Vpi30s+Raa2as", + "Enns+Vpi30s+Raa3ms", "Enns+Vpi30s+Rad1as", "Enns+Vpi30s+Rad3ms", "Enxp", "Enxs", + "Etiqueta", "Gdaa", "Gdap", "Gdas", "Gdfp", "Gdfs", "Gdmp", "Gdms", "Gdxp", "Gdxs", + "Gnaa", "Gnap", "Gnas", "Gnfp", "Gnfs", "Gnmp", "Gnms", "Gnxp", "Gnxs", "Iafp", + "Iafs", "Iamp", "Iams", "Idap", "Idas", "Idfp", "Idfp+Ddfp", "Idfs", "Idmp", + "Idmp+Ddmp", "Idmp+Ddmp+Scmp", "Idms", "Idms+Ddms+P", "Idxp", "Idxs", "In00", + "Inaa", "Inap", "Inas", "Infp", "Infp+Viia0s+Raa3ms", "Infs", + "Infs+P+Rtp3fp+Vei30s+Rad3as", "Inmp", "Inmp+Ddmp+P+Sp00", "Inmp+Lp0+Ddms", + "Inmp+Vci30p+Raa3ms", "Inmp+Vei30p+Rad3ap", "Inms", "Inms+Lp0+Ddms", "Inms+P+Edmp", + "Inms+P+Enmp", "Inms+P+Rtp3mp+Vei30s+Rad1as", "Inms+P+Rtp3mp+Vei30s+Rao3aa+Rad3as", + "Inxp", "Inxs", "La0", "La0+Ddfs+P", "La0+La0+Q_+P+Ddms+V0f000+Rao3aa", + "La0+Lp0+Ddfp", "La0+Lp0+Ddms", "La0+Ncdmp+Zs00", "La0+P+Edfs", "La0+P+Idmp", + "La0+Q,+V0x000+Raa1as", "La0+Q?", "La0+Q_", "La0+Vei10s+Raf1as", + "La0+Vei30s+Raa3ms", "La0+Vei30s+Rad3ms", "La0+Vii20s+Raa3mp", "La0+Vii30p+Raa3ms", + "La0+Vii30s+Rad1as", "La0+Viia0s+Rad3as", "La0+Vli30p+Rad3as", "La0+Vpi10s+Rad1as", + "La0+Vpi10s+Raf1as", "La0+Vpi30s+Raa1as", "La0+Vpi30s+Rad1as", "Lcc", "Lcc+Ddfp", + "Lcc+Ddfs", "Lcc+Ddmp", "Lcc+Ddms", "Lcc+Lp0", "Lcc+Sp00", "Lcs", "Lcs+Ddmp+P", + "Lcs+Ddms+P", "Lcs+La0+A0fs+P", "Lp0", "Lp0+Ddfp", "Lp0+Ddfp+Ncdfp", "Lp0+Ddfs", + "Lp0+Ddfs+Scfs", "Lp0+Ddfs+Spfs", "Lp0+Ddmp", "Lp0+Ddmp+Scmp+A0mp", "Lp0+Ddms", + "Lp0+Ddms+Spms", "Lp0+Ddms+Zams+Ncnms", "Lp0+Difp", "Lp0+Difs", "Lp0+Dimp", + "Lp0+Dims", "Lp0+Edfp", "Lp0+Edfs", "Lp0+Edfs+Q_+Vpi30s", "Lp0+Edmp", "Lp0+Enfs", + "Lp0+Enmp", "Lp0+Enns", "Lp0+Idfp", "Lp0+Idfs", "Lp0+Idmp", "Lp0+Infp", "Lp0+Inms", + "Lp0+Ncdfs", "Lp0+Ncdms", "Lp0+Rtp3fs", "Lp0+Rtp3mp", "Lp0+Rtp3ms", "Lp0+Rtp3ms+P", + "Lp0+Scfp", "Lp0+Scfs", "Lp0+Scms", "Lp0+V0f000", "Md1pfp", "Md1pfs", "Md1pmp", + "Md1pms", "Md1pxp", "Md1pxs", "Md1sfp", "Md1sfs", "Md1smp", "Md1sms", "Md1sxp", + "Md1sxs", "Md2pfp", "Md2pfs", "Md2pmp", "Md2pms", "Md2pxp", "Md2pxs", "Md2sfp", + "Md2sfs", "Md2smp", "Md2sms", "Md2sxp", "Md2sxs", "Md3afp", "Md3afs", "Md3amp", + "Md3ams", "Md3pfp", "Md3pfs", "Md3pmp", "Md3pms", "Md3pxp", "Md3pxs", "Md3sfp", + "Md3sfs", "Md3smp", "Md3sms", "Md3sxp", "Md3sxs", "Mn1pfp", "Mn1pfs", "Mn1pmp", + "Mn1pms", "Mn1pxp", "Mn1pxs", "Mn1sfp", "Mn1sfs", "Mn1smp", "Mn1sms", "Mn1sxp", + "Mn1sxs", "Mn2pfp", "Mn2pfs", "Mn2pmp", "Mn2pms", "Mn2pxp", "Mn2pxs", "Mn2sfp", + "Mn2sfs", "Mn2smp", "Mn2sms", "Mn2sxp", "Mn2sxs", "Mn3afp", "Mn3afs", "Mn3amp", + "Mn3ams", "Mn3pfp", "Mn3pfs", "Mn3pmp", "Mn3pms", "Mn3pxp", "Mn3pxs", "Mn3sfp", + "Mn3sfs", "Mn3smp", "Mn3sms", "Mn3sxp", "Mn3sxs", "Ncamp", "Ncaxp", "Ncdap", + "Ncdfp", "Ncdfs", "Ncdmp", "Ncdmp+Zs00", "Ncdms", "Ncdxp", "Ncnaa", "Ncnap", + "Ncnap+Zs00+Ncnmp", "Ncnas", "Ncnfp", "Ncnfs", "Ncnmp", "Ncnmp+Q:+Ncnmp", + "Ncnmp+Zs00+Ncnmp", "Ncnms", "Ncnms+La0", "Ncnms+La0+Q.", "Ncnms+Zs00", "Ncnxp", + "Nodfp", "Nodfs", "Nodmp", "Nodms", "Nodxp", "Nodxs", "Nonfp", "Nonfs", "Nonmp", + "Nonms", "Nonms+Nonfs", "Nonxp", "Nonxs", "P", "P+Cs", "P+Ddfp", "P+Ddfp+Scfp", + "P+Ddfp+Scfp+A0fp", "P+Ddfs", "P+Ddmp", "P+Ddmp+Md2smp+Scmp+Q.", "P+Ddmp+Scmp", + "P+Ddms", "P+Ddxp", "P+Ddxs", "P+Difp", "P+Difs", "P+Difs+Scfs", "P+Dimp", "P+Dims", + "P+Dims+Ncnms", "P+Dims+Ncnms+Zs00", "P+Edfp", "P+Edfs", "P+Edfs+Infs", "P+Edmp", + "P+Edmp+Inmp", "P+Edms", "P+Edms+Idms", "P+Edxp", "P+Enfp", "P+Enfs", "P+Enmp", + "P+Enmp+P+Ddmp", "P+Enmp+Q_", "P+Enms", "P+Enns", "P+Enns+V0f000", "P+Enxs", + "P+Gnaa", "P+Iafp", "P+Iamp", "P+Idfp", "P+Idfp+Ddfp", "P+Idfs", "P+Idmp", + "P+Idmp+Ddmp", "P+Idms", "P+Infp", "P+Infs", "P+Infs+P+Enfp", "P+Infs+P+Infs", + "P+Inmp", "P+Inms", "P+Inms+P+Edmp", "P+Inms+P+Enmp", "P+Ncdfs", "P+Ncdms", + "P+Ncnfs", "P+Ncnms", "P+P", "P+Q\"+A0fs", "P+Q\"+Sp00", "P+Q'+Sp00", "P+Rtn3fs", + "P+Rtp3fp", "P+Rtp3fs", "P+Rtp3fs+Lp0+Ddms", "P+Rtp3fs+Vei30p+Rad3as", "P+Rtp3mp", + "P+Rtp3mp+Rad2as+Raa3fs", "P+Rtp3ms", "P+Rtp3ms+Lcc+Ddms", "P+Sp00", "P+Sp00+Ddfs", + "P+Sp00+Difs", "P+Sp00+Dims", "P+Sp00+Lcc+Ddms", "P+Sp00+Lp0", "P+Sp00+Q_+Scmp", + "P+Sp00+Scmp", "P+Sp00+V0p0ms", "P+Sp00+Vei30s+Rad3as", "P+Sp00+Vii30p+Raa3ms", + "P+Sp00+Vpi30s+Rad3as", "P+Sp00+Wn", "P+Spfp", "P+Spfs", "P+Spm0+Q_+Q¿", + "P+Spm0+Vei30s+Rad3ms", "P+Spm0+Vli30s+Rad3ms", "P+Spm0+Vlia0s+Raa3ms", "P+Spmp", + "P+Spms", "P+V0f000+Raa3ms", "P+V0f000+Rad3fp", "P+Wn", "Q!", "Q\"", "Q'", "Q(", + "Q(+Zg00", "Q)", "Q,", "Q,+Vii30s", "Q-", "Q.", "Q...", "Q/", "Q:", "Q;", "Q?", + "Q[", "Q]", "Q_", "Q_+A0fp", "Q_+A0fs", "Q_+A0fs+Q_+Vei30s+Rad3ms", "Q_+A0mp", + "Q_+A0mp+Q_+P+Idfp", "Q_+A0ms", "Q_+Acms", "Q_+Cc", "Q_+Cc+Vei30s", + "Q_+Cc+Vpi30s+Rad2as", "Q_+Cc+Vps30s+Rad1as", "Q_+Cc+Vps30s+Rad1fs", "Q_+Cs", + "Q_+Cs+Rad1as+Raa3ms", "Q_+Cs+Rad2as+Raa3ms", "Q_+Cs+Vei30s", "Q_+Cs+Vpi30p", + "Q_+Ddfp", "Q_+Ddfs", "Q_+Ddfs+A0fs", "Q_+Ddmp", "Q_+Ddms", + "Q_+Ddms+Scms+Q_+Vei30s", "Q_+Difp", "Q_+Difs", "Q_+Dims", "Q_+Dims+Ncnms+Zs00", + "Q_+Dixp", "Q_+Edfp", "Q_+Edfs", "Q_+Edms", "Q_+Edms+Spms+P", "Q_+Enfs", "Q_+Enmp", + "Q_+Enms", "Q_+Enns", "Q_+Enns+Vpi30s", "Q_+Gdaa", "Q_+Gdfs", "Q_+Gdmp", "Q_+Gdms", + "Q_+Gnaa", "Q_+Gnas", "Q_+Gnms", "Q_+Idfp", "Q_+Idfs", "Q_+Idmp", "Q_+Idmp+Scmp", + "Q_+Idms", "Q_+In00", "Q_+In00+P+Enns", "Q_+Inas", "Q_+Infp+A0fp", "Q_+Infs", + "Q_+Inmp", "Q_+Inmp+A0mp", "Q_+Inmp+Q_", "Q_+Inms", "Q_+La0", "Q_+Lcc", "Q_+Lcs", + "Q_+Lcs+Rad1as+Raa3ms", "Q_+Lp0", "Q_+Lp0+Ddfp", "Q_+Lp0+Ddms", "Q_+Md1smp", + "Q_+Md1sms", "Q_+Ncdfp", "Q_+Ncdfp+Scfp", "Q_+Ncdfp+Scfp+Q_", + "Q_+Ncdfp+Scfp+Q_+P+Ddms", "Q_+Ncdfs", "Q_+Ncdmp", "Q_+Ncdmp+Scmp", "Q_+Ncdms", + "Q_+Ncnaa", "Q_+Ncnap", "Q_+Ncnfp", "Q_+Ncnfs", "Q_+Ncnmp", "Q_+Ncnms", + "Q_+Ncnms+Zs00", "Q_+P", "Q_+P+Ddfp", "Q_+P+Ddfs", "Q_+P+Ddmp", "Q_+P+Ddms", + "Q_+P+Difs", "Q_+P+Dims", "Q_+P+Edms", "Q_+P+Enns", "Q_+P+Idfp+Ddfp", "Q_+P+V0f000", + "Q_+P+V0f000+Raa3ms", "Q_+Q\"+Ddfs", "Q_+Q\"+Scmp", "Q_+Q\"+Sp00", "Q_+Q\"+Vpi30s", + "Q_+Q\"+Wn", "Q_+Q¡", "Q_+Q¡+A0fs", "Q_+Q¡+A0ms", "Q_+Q¡+Cc", "Q_+Q¡+Cs", + "Q_+Q¡+Cs+La0", "Q_+Q¡+Ddfs", "Q_+Q¡+Ddmp", "Q_+Q¡+Ddms", "Q_+Q¡+Enms", + "Q_+Q¡+Enns", "Q_+Q¡+Gdfp", "Q_+Q¡+Gdfs", "Q_+Q¡+Gdms", "Q_+Q¡+Idfs", "Q_+Q¡+Inas", + "Q_+Q¡+Inms", "Q_+Q¡+La0", "Q_+Q¡+Ncnfp", "Q_+Q¡+P", "Q_+Q¡+Q¿+Cs", "Q_+Q¡+Raf1as", + "Q_+Q¡+Rtn1as", "Q_+Q¡+Rtn1ms", "Q_+Q¡+Scfs", "Q_+Q¡+Scms", "Q_+Q¡+Sp00", + "Q_+Q¡+Spf0", "Q_+Q¡+Spfs+Asfs", "Q_+Q¡+Spm0", "Q_+Q¡+V0m20p", "Q_+Q¡+V0m20s", + "Q_+Q¡+V0m20s+Rad3as", "Q_+Q¡+Vei10s+Rad2as", "Q_+Q¡+Vei30s", "Q_+Q¡+Vfi30s", + "Q_+Q¡+Vpi10p", "Q_+Q¡+Vpi10s", "Q_+Q¡+Vpi10s+Raa3ms", "Q_+Q¡+Vpi20s", + "Q_+Q¡+Vpi30p", "Q_+Q¡+Vpi30s", "Q_+Q¡+Vps30s", "Q_+Q¡+Wg", "Q_+Q¡+Wm", "Q_+Q¡+Wn", + "Q_+Q¡+Y", "Q_+Q¿+A0ap", "Q_+Q¿+A0fs", "Q_+Q¿+A0ms", "Q_+Q¿+Cc", "Q_+Q¿+Cs", + "Q_+Q¿+Ddfp", "Q_+Q¿+Ddfs", "Q_+Q¿+Ddmp", "Q_+Q¿+Ddms", "Q_+Q¿+Dims", "Q_+Q¿+Gdaa", + "Q_+Q¿+Gdfs", "Q_+Q¿+Gdmp", "Q_+Q¿+Gdms", "Q_+Q¿+Gnaa", "Q_+Q¿+Gnas", "Q_+Q¿+Gnmp", + "Q_+Q¿+Gnms", "Q_+Q¿+Idms", "Q_+Q¿+Infs", "Q_+Q¿+La0", "Q_+Q¿+Ncnap", "Q_+Q¿+P", + "Q_+Q¿+P+Ddfs", "Q_+Q¿+Q¡+Ddfs", "Q_+Q¿+Q¡+Gnaa", "Q_+Q¿+Rtn1as", "Q_+Q¿+Rtn2as", + "Q_+Q¿+Rtn2as+Vei20s+Raa3ms", "Q_+Q¿+Rtn2ms", "Q_+Q¿+Rtn3as", "Q_+Q¿+Rtn3ms", + "Q_+Q¿+Scfp", "Q_+Q¿+Scfs", + "Q_+Q¿+Scfs+Cc+Enns+P+Ddms+Tnms+Rad1as+Vpi30s+P+V0f000+Gnaa+Vpi30s", "Q_+Q¿+Scmp", + "Q_+Q¿+Scms", "Q_+Q¿+Sp00", "Q_+Q¿+Spm0", "Q_+Q¿+V0f000", "Q_+Q¿+Vci20s", + "Q_+Q¿+Vcia0s", "Q_+Q¿+Vcia0s+V0f000", "Q_+Q¿+Vei20s", "Q_+Q¿+Vei20s+Raa2as", + "Q_+Q¿+Vei20s+Rad3ms", "Q_+Q¿+Vei30p", "Q_+Q¿+Vei30s", "Q_+Q¿+Vei30s+Rad2as", + "Q_+Q¿+Vfi10s", "Q_+Q¿+Vii30s", "Q_+Q¿+Viia0s", "Q_+Q¿+Vli30s", "Q_+Q¿+Vpi10p", + "Q_+Q¿+Vpi10p+Raa1ap", "Q_+Q¿+Vpi10s", "Q_+Q¿+Vpi10s+Rad2as", + "Q_+Q¿+Vpi10s+V0f000+Rad2as", "Q_+Q¿+Vpi20p", "Q_+Q¿+Vpi20s", "Q_+Q¿+Vpi20s+Raa1as", + "Q_+Q¿+Vpi20s+Rad1as", "Q_+Q¿+Vpi20s+V0f000+Rad1as", "Q_+Q¿+Vpi30p+Rad3as", + "Q_+Q¿+Vpi30s", "Q_+Q¿+Vpi30s+Raa1as", "Q_+Q¿+Vpi30s+Raa1ms", "Q_+Q¿+Vpi30s+Rad2as", + "Q_+Q¿+Vpi30s+Rad2fs", "Q_+Q¿+Vpi30s+Rad3as", "Q_+Q¿+Vpi30s+Rao3aa", "Q_+Q¿+Wg", + "Q_+Q¿+Wm", "Q_+Q¿+Wn", "Q_+Q¿+Wn+Rad1as+Raa3ms", "Q_+Raf1as", "Q_+Rtn1ap", + "Q_+Rtn1as", "Q_+Rtn1as+Vei10s+Rad2as", "Q_+Rtn1as+Vpi10s+Raa3fs", "Q_+Rtn1fs", + "Q_+Rtn1fs+Vpi10s+Raa3ms+Q_+Vii30s", "Q_+Rtn1ms", + "Q_+Rtn1ms+Vpi10s+Raf1ms+Spm0+Q_+Vei30s+Rad3as", "Q_+Rtn2as", "Q_+Rtn2ms", + "Q_+Rtn2ms+Vei20s+Rad3fs", "Q_+Rtn3ap", "Q_+Rtn3as", "Q_+Rtn3as+Vps30s+Rad3as", + "Q_+Rtn3fs", "Q_+Rtn3fs+Vfi30s+Rad2as", "Q_+Rtn3ms", "Q_+Scfp", "Q_+Scfs", + "Q_+Scfs+A0fs", "Q_+Scma", "Q_+Scmp", "Q_+Scms", "Q_+Scms+Ncnfp+Q_+Vei30s+Rad3as", + "Q_+Scms+Q_+Vei30s+Rtn3fs", "Q_+Scms+Q_+Vpi10s", "Q_+Scms+Sp00+Q_+P+Ddfs+Scfs", + "Q_+Sp00", "Q_+Spf0", "Q_+Spf0+Vei30s+Rad3ms", "Q_+Spfs", "Q_+Spfs+Q_+Vei30s", + "Q_+Spm0", "Q_+Tnas", "Q_+Tnfp", "Q_+Tnfs", "Q_+Tnmp+P+Idmp", "Q_+Tnms", + "Q_+V0f000", "Q_+V0f000+V0f000", "Q_+V0m10p+V0f000+Rad3ms", "Q_+V0m20p", + "Q_+V0m20s", "Q_+V0m20s+Q_+Vpi30s", "Q_+V0m20s+Raa1ap+A0mp", "Q_+V0m20s+Raa1as", + "Q_+V0m20s+Raa2as", "Q_+V0m20s+Rad1as", "Q_+V0m20s+Rad1ms", "Q_+V0m20s+Raf2as", + "Q_+V0m20s+Raf2as+La0+Q_+Vei10s", "Q_+V0m20s+Raf2as+Q_+Vlia0s+Rad3as", "Q_+V0p0fp", + "Q_+V0p0fs", "Q_+V0p0mp", "Q_+V0p0ms", "Q_+V0p0ms+P+Sp00", "Q_+V0x000", "Q_+V0x10p", + "Q_+Vci20s", "Q_+Vci30p+Raa3fs", "Q_+Vci30s", "Q_+Vci30s+Rad1as", + "Q_+Vci30s+Rao3aa", "Q_+Vei10p", "Q_+Vei10p+Raa3ms", "Q_+Vei10s", + "Q_+Vei10s+Raa3ms", "Q_+Vei10s+Rad3as", "Q_+Vei10s+Rad3as+V0x000+Rad3as", + "Q_+Vei10s+Rad3ms", "Q_+Vei10s+Raf1as", "Q_+Vei10s+Wn+Q_+Vpi10s+Raf1as", + "Q_+Vei20s", "Q_+Vei30p", "Q_+Vei30p+Raa2as", "Q_+Vei30p+Rad1as", + "Q_+Vei30p+Rad3as", "Q_+Vei30p+Rad3ms", "Q_+Vei30s", "Q_+Vei30s+A0fs", + "Q_+Vei30s+Inms+P+Ddmp+Scmp+P", "Q_+Vei30s+La0", "Q_+Vei30s+Q_+V0f000", + "Q_+Vei30s+Raa1ap", "Q_+Vei30s+Raa1as", "Q_+Vei30s+Raa1mp", "Q_+Vei30s+Raa3fs", + "Q_+Vei30s+Raa3ms", "Q_+Vei30s+Rad1ap", "Q_+Vei30s+Rad1as", + "Q_+Vei30s+Rad1as+Raa3ms", "Q_+Vei30s+Rad1ms", "Q_+Vei30s+Rad2as", + "Q_+Vei30s+Rad3as", "Q_+Vei30s+Rad3as+Spf0+Q_+Vpi10s", "Q_+Vei30s+Rad3fs", + "Q_+Vei30s+Rad3ms", "Q_+Vei30s+Rao3aa", "Q_+Vei30s+Rao3aa+Rad1as", + "Q_+Vei30s+Rao3aa+Rad1ms", "Q_+Vei30s+Rtn3fs", "Q_+Vei30s+V0x000+Rad3ms", + "Q_+Vfi10p", "Q_+Vfi10p+Q_+Vei30s", "Q_+Vfi10s", "Q_+Vfi10s+Rad3as", "Q_+Vfi20s", + "Q_+Vfi30s", "Q_+Vfi30s+Raa3fp", "Q_+Vii10s", "Q_+Vii10s+Rad2as", + "Q_+Vii10s+Raf1as", "Q_+Vii10s+V0f000+Raa3ms", "Q_+Vii20s", "Q_+Vii20s+Rad1as", + "Q_+Vii30p", "Q_+Vii30s", "Q_+Vii30s+Raa3ms", "Q_+Vii30s+Rad1as", + "Q_+Vii30s+Rao3aa", "Q_+Viia0s", "Q_+Vli30s", "Q_+Vli30s+Raa3ms", + "Q_+Vli30s+Rad1ms", "Q_+Vli30s+Rad3fs", "Q_+Vlia0s+Rad3as", "Q_+Vlia0s+Rad3fs", + "Q_+Vpi10p", "Q_+Vpi10p+Raa1ap", "Q_+Vpi10p+Raa3ms", "Q_+Vpi10s", + "Q_+Vpi10s+Raa2ms", "Q_+Vpi10s+Raa3fp", "Q_+Vpi10s+Raa3ms", "Q_+Vpi10s+Rad2as", + "Q_+Vpi10s+Rad2as+Raa3ms", "Q_+Vpi10s+Rad3as", "Q_+Vpi10s+Rad3as+Raa3ms", + "Q_+Vpi10s+Rad3ms", "Q_+Vpi10s+Raf1as", "Q_+Vpi10s+Raf1fs", "Q_+Vpi10s+Raf1ms", + "Q_+Vpi10s+Raf1ms+Spm0+Q_+Q¿+Vpi30s", "Q_+Vpi10s+V0f000+Rad2as", + "Q_+Vpi10s+V0f000+Rad3as", "Q_+Vpi20p", "Q_+Vpi20s", "Q_+Vpi20s+Raa1as", + "Q_+Vpi20s+Raa2ms", "Q_+Vpi20s+Raf2ms", "Q_+Vpi30p", "Q_+Vpi30p+Rad1ms", + "Q_+Vpi30p+Rad2as", "Q_+Vpi30p+Rad3as", "Q_+Vpi30p+Rao3aa", "Q_+Vpi30p+V0f000", + "Q_+Vpi30s", "Q_+Vpi30s+Lp0+Difs", "Q_+Vpi30s+P+Enfp", "Q_+Vpi30s+Raa1as", + "Q_+Vpi30s+Raa3fp", "Q_+Vpi30s+Raa3ms", "Q_+Vpi30s+Rad1as", "Q_+Vpi30s+Rad1fs", + "Q_+Vpi30s+Rad2as", "Q_+Vpi30s+Rad2fs", "Q_+Vpi30s+Rad2ms", "Q_+Vpi30s+Rad3as", + "Q_+Vpi30s+Rad3ms", "Q_+Vpi30s+Rao3aa", "Q_+Vpi30s+V0f000", "Q_+Vps10p", + "Q_+Vps10p+Raf1ap", "Q_+Vps20s", "Q_+Vps30p", "Q_+Vps30s", + "Q_+Vps30s+Q_+Vei30s+Raa3ms", "Q_+Vps30s+Raa1as", "Q_+Vps30s+Raa1fs", + "Q_+Vps30s+Raa3ms", "Q_+Vps30s+Rad1as", "Q_+Vps30s+Rad1as+Raa3ms", + "Q_+Vps30s+Rao3aa", "Q_+Wg", "Q_+Wg+Rad2as+Raa3ms", "Q_+Wm", "Q_+Wn", "Q_+Wn+A0fs", + "Q_+Wn+Q_+Vei10s", "Q_+Wn+Q_+Vei30s", "Q_+Wn+Rad2mp+Raa3ms", "Q_+Wn+Scmp", + "Q_+Wn+V0m20s+Raa3ms", "Q_+Wn+V0p0fp", "Q_+Wn+Vii30s+Rad3as", "Q_+Wn+Vpi10s+Raa3fs", + "Q_+Wn+Vpi20s", "Q_+Wn+Vps20s", "Q_+Wn+Wn+Q_+P+Edfs", + "Q_+Wn+Wn+Wn+Wn+Wn+Q...+Q_+Cs", "Q_+Wr", "Q_+Wr+Rad3as+Raa3ms", "Q_+Y", + "Q_+Y+Q_+Vei30s", "Q_+Y+Q_+Vpi30s", "Q_+Y+Scfs", "Q_+Za00+Idfs", "Q_+Zg00", + "Q_+Zo00", "Q{", "Q}", "Q¡", "Q¿", "Raa1ap", "Raa1as", "Raa1fp", "Raa1fs", "Raa1mp", + "Raa1ms", "Raa2ap", "Raa2as", "Raa2fp", "Raa2fs", "Raa2mp", "Raa2ms", "Raa3fp", + "Raa3fs", "Raa3mp", "Raa3ms", "Raa3xp", "Raa3xs", "Rad1ap", "Rad1ap+Raa3fs", + "Rad1ap+Raa3ms", "Rad1as", "Rad1as+Raa3fp", "Rad1as+Raa3fs", "Rad1as+Raa3mp", + "Rad1as+Raa3ms", "Rad1fp", "Rad1fs", "Rad1fs+Raa3fp", "Rad1mp", "Rad1mp+Raa3fs", + "Rad1ms", "Rad1ms+Raa3fs", "Rad1ms+Raa3ms+Vii30s", "Rad2ap", "Rad2ap+Raa3ms", + "Rad2as", "Rad2as+Raa3fp", "Rad2as+Raa3fs", "Rad2as+Raa3ms", "Rad2fp", "Rad2fs", + "Rad2fs+Raa3ms", "Rad2mp", "Rad2ms", "Rad3ap", "Rad3ap+Raa3ms", "Rad3as", + "Rad3as+Raa3fs", "Rad3as+Raa3mp", "Rad3as+Raa3ms", "Rad3fp", "Rad3fp+Raa3fs", + "Rad3fs", "Rad3fs+Raa3fp", "Rad3fs+Raa3fs", "Rad3fs+Raa3mp", "Rad3fs+Raa3ms", + "Rad3mp", "Rad3mp+Raa3fs", "Rad3mp+Raa3mp", "Rad3mp+Raa3ms", "Rad3ms", + "Rad3ms+Raa3fp", "Rad3ms+Raa3fs", "Rad3ms+Raa3ms", "Rad3ms+Raa3ms+Ves30s", "Raf1ap", + "Raf1as", "Raf1fp", "Raf1fs", "Raf1mp", "Raf1ms", "Raf2ap", "Raf2as", "Raf2fp", + "Raf2fs", "Raf2mp", "Raf2ms", "Rao3aa", "Rtn1ap", "Rtn1ap+Vei10p+Raa3ms", + "Rtn1ap+Vii10p+Raa1ap", "Rtn1ap+Vii10p+Raa3ms", "Rtn1as", "Rtn1as+Vei10s+Raa3fs", + "Rtn1as+Vei10s+Raa3ms", "Rtn1as+Vei10s+Rad2as", "Rtn1as+Vei10s+Rad3as", + "Rtn1as+Vei10s+Rad3as+Raa3ms", "Rtn1as+Vei10s+Raf1as", + "Rtn1as+Vfi10s+Rad3ap+P+Idfp", "Rtn1as+Vii10s+Raa3ms", "Rtn1as+Vii10s+Rad3as", + "Rtn1as+Vii10s+Rad3as+Raa3mp", "Rtn1as+Vii10s+Rad3fp", "Rtn1as+Vii10s+Raf1as", + "Rtn1as+Vli10s+Raa3mp", "Rtn1as+Vli10s+Raf1as", "Rtn1as+Vpi10s+Raa2fs", + "Rtn1as+Vpi10s+Raa3mp", "Rtn1as+Vpi10s+Rad2as", "Rtn1as+Vpi10s+Rad3ms", + "Rtn1as+Vpi10s+Raf1as", "Rtn1fp", "Rtn1fs", "Rtn1fs+Vii10s+Rad2as", "Rtn1mp", + "Rtn1ms", "Rtn1ms+Vei10s+Raa1ms", "Rtn1ms+Vei10s+Raf1ms", "Rtn1ms+Vii10s+Raa3ms", + "Rtn1ms+Vii10s+Raf1ms", "Rtn1xp", "Rtn1xp+Vpi10p+Raf1ap", "Rtn2ap", "Rtn2as", + "Rtn2as+V0m20s+Rad1as", "Rtn2as+Vei20s+Raa1as", "Rtn2as+Vpi20s+Raa1as", + "Rtn2as+Vpi20s+Rad3ms", "Rtn2fp", "Rtn2fs", "Rtn2mp", "Rtn2ms", "Rtn2xp", "Rtn3ap", + "Rtn3as", "Rtn3fp", "Rtn3fs", "Rtn3fs+Vei30s+Raa1as", "Rtn3fs+Vei30s+Raa3fs", + "Rtn3fs+Vei30s+Raa3ms", "Rtn3fs+Vei30s+Rad3as+Raa3ms", + "Rtn3fs+Vei30s+Rao3aa+Lp0+Rtp3ms", "Rtn3fs+Vfi30s+Raa3ms", "Rtn3fs+Vii30s+Raa1ms", + "Rtn3fs+Vii30s+Raa3ms", "Rtn3fs+Vli30s+Raa3ms", "Rtn3fs+Vli30s+Rad1as", + "Rtn3fs+Vli30s+Rad3ms", "Rtn3fs+Vpi30s+Raa1as", "Rtn3fs+Vpi30s+Raa3ms", + "Rtn3fs+Vpi30s+Rad1as", "Rtn3fs+Vpi30s+Rad3mp", "Rtn3mp", "Rtn3ms", + "Rtn3ms+Vpi30s+Raa3fs", "Rtn3ns", "Rtn3xp", "Rtn3xs", "Rtp1ap", "Rtp1as", "Rtp1fp", + "Rtp1fs", "Rtp1mp", "Rtp1ms", "Rtp1xp", "Rtp2ap", "Rtp2as", "Rtp2fp", "Rtp2fs", + "Rtp2mp", "Rtp2ms", "Rtp2xp", "Rtp3aa", "Rtp3ap", "Rtp3as", "Rtp3as+Q_+Vii30s", + "Rtp3fp", "Rtp3fp+Lcs", "Rtp3fs", "Rtp3fs+Vfi30p+Rao3aa+La0", "Rtp3mp", + "Rtp3mp+P+Ddfs+Spfs", "Rtp3ms", "Rtp3ns", "Rtp3xp", "Rtp3xs", "Scaa", "Scap", + "Scas", "Scfa", "Scfp", "Scfp+A0fp", "Scfp+A0fs", "Scfp+A0ms", "Scfp+Cs", + "Scfp+Ddfs", "Scfp+Ddfs+Spfs+Vei30s+Rad3ms", "Scfp+La0", "Scfp+Q,+La0", + "Scfp+Q_+V0f000+Rad3fs", "Scfp+Scms", "Scfp+V0x000", "Scfp+Vpi10s", + "Scfp+Vpi30s+Rad3ms", "Scfs", "Scfs+Lp0", "Scfs+Lp0+Difs", "Scfs+Q\"", + "Scfs+Q_+P+Edfs", "Scma", "Scmp", "Scmp+A0mp", "Scmp+Ddfs", "Scmp+Dims", "Scmp+Lcs", + "Scmp+Lp0+Ddfs", "Scmp+P+Edmp", "Scmp+P+Idfs", "Scmp+P+Scfp", + "Scmp+Q_+Vpi30s+Rad3mp", "Scmp+Scms", "Scmp+V0x000+Raa3fs", + "Scmp+Vpi10s+Raa3mp+Idmp+Ddmp", "Scmp+Vpi30s", "Scmp+Vpi30s+Raa3mp", "Scmp+Zo00", + "Scms", "Scms+Lp0+Ddfp", "Scms+P+Ddxs", "Scms+Q\"", "Scms+Q\"+Vli30s", "Scms+Q.+Q_", + "Scms+Spms+Vei30s+Raa3ms", "Scxp", "Scxs", "Sp00", "Sp00+Cc+Sp00+Q.", "Sp00+Ddfs", + "Sp00+Ddfs+Scfs", "Sp00+Lp0+Ddmp", "Sp00+P+Ddmp+Scmp", "Sp00+P+Ddms+Scms", + "Sp00+P+Ddms+Spms", "Sp00+Q,+Lp0", "Sp00+Q,+P", "Sp00+Q.", "Sp00+Q.+P", "Sp00+Q_", + "Sp00+Scms", "Sp00+Sp00", "Sp00+Sp00+Sp00+Sp00", "Sp00+Vfi30s+Rad3mp", + "Sp00+Vpi30s", "Spf0", "Spf0+Q_", "Spf0+Q_+Vei30s+Raa3fs", "Spfp", + "Spfp+P+Spm0+Scms", "Spfp+Vpi30p+Raa1ap", "Spfs", "Spfs+Ddms+Spms", "Spfs+Lp0+Ddms", + "Spfs+P", "Spfs+P+Ddfs+Scfs", "Spfs+P+Ddms+Scms", "Spfs+P+Sp00", "Spfs+Q(+Zg00", + "Spfs+Q,+P", "Spfs+Q,+P+Inms", "Spfs+Q.", "Spfs+Q.+Wn", "Spfs+Q_+Infs", + "Spfs+Q_+Q.", "Spfs+Vei30p+Rad3as+P+Sp00", "Spm0", "Spm0+Q,+Spf0", "Spm0+Q_", + "Spmp", "Spms", "Spms+Cc", "Spms+P", "Spms+P+Ddfs+Scfs", "Spms+Q,", + "Spms+Q,+P+Infs", "Spms+Q,+Sp00", "Spms+Q.", "Spms+Q_", "Spms+Vei30s", + "Spms+Vpi30s+Rao3aa", "Spms+Wn", "Tdfp", "Tdfs", "Tdmp", "Tdms", "Tdxp", "Tdxs", + "Tnaa", "Tnap", "Tnas", "Tnfa", "Tnfp", "Tnfs", "Tnmp", "Tnms", "Tnxp", "Tnxs", + "V0f000", "V0f000+Ddfp", "V0f000+Ddfs", "V0f000+Ddms+Scms", "V0f000+P+Spm0", + "V0f000+Raa1ap", "V0f000+Raa1ap+Vei10s+Rad3as+Raa3ms", "V0f000+Raa1as", + "V0f000+Raa1as+La0+P", "V0f000+Raa1as+Lp0+Ddfs", "V0f000+Raa1as+P", + "V0f000+Raa1as+Q_+Vii30s", "V0f000+Raa1fp", "V0f000+Raa1fs", "V0f000+Raa1mp", + "V0f000+Raa1ms", "V0f000+Raa2ap", "V0f000+Raa2as", "V0f000+Raa2as+Lp0+Difs", + "V0f000+Raa2fs", "V0f000+Raa2ms", "V0f000+Raa3fp", "V0f000+Raa3fp+Lp0", + "V0f000+Raa3fp+P", "V0f000+Raa3fp+P+Idfs", "V0f000+Raa3fp+Q_+Vei30s", + "V0f000+Raa3fp+Vpi30s+Raa3fp", "V0f000+Raa3fs", "V0f000+Raa3fs+Lp0+Ddfs", + "V0f000+Raa3fs+Q_+Vei10s", "V0f000+Raa3fs+Q_+Vei30s", "V0f000+Raa3mp", + "V0f000+Raa3mp+Lp0+Dims", "V0f000+Raa3mp+P+Idmp", "V0f000+Raa3ms", + "V0f000+Raa3ms+Lp0+Ddms", "V0f000+Raa3ms+P", "V0f000+Raa3ms+P+Enns", + "V0f000+Raa3ms+P+Idfs", "V0f000+Raa3ms+Q_+P", "V0f000+Raa3ms+Q_+Vei30s", + "V0f000+Raa3ms+Rtn1fp", "V0f000+Raa3ms+V0f000", "V0f000+Raa3ms+V0x000+Rad3as", + "V0f000+Raa3ms+V0x000+Raf1as", "V0f000+Raa3ms+Vei30s+Rad1ap", "V0f000+Rad1ap", + "V0f000+Rad1as", "V0f000+Rad1as+Enns", "V0f000+Rad1as+Raa3fs", + "V0f000+Rad1as+Raa3ms", "V0f000+Rad1as+V0f000", "V0f000+Rad1fs", "V0f000+Rad1mp", + "V0f000+Rad1mp+Raa3mp", "V0f000+Rad1ms", "V0f000+Rad2ap", "V0f000+Rad2ap+Raa3fs", + "V0f000+Rad2as", "V0f000+Rad2as+Vpi30s+Raa1as", "V0f000+Rad2mp", "V0f000+Rad2ms", + "V0f000+Rad3ap", "V0f000+Rad3as", "V0f000+Rad3as+La0", "V0f000+Rad3as+P", + "V0f000+Rad3as+Raa3fp", "V0f000+Rad3as+Raa3fs", "V0f000+Rad3as+Raa3mp", + "V0f000+Rad3as+Raa3ms", "V0f000+Rad3fp", "V0f000+Rad3fs", "V0f000+Rad3fs+Idfp+Ddfp", + "V0f000+Rad3fs+Raa3ms", "V0f000+Rad3mp", "V0f000+Rad3mp+Raa3mp", + "V0f000+Rad3mp+Raa3ms", "V0f000+Rad3mp+V0f000", "V0f000+Rad3ms", + "V0f000+Rad3ms+Ddms+P", "V0f000+Rad3ms+Idfp+Ddfp", "V0f000+Rad3ms+Raa1as", + "V0f000+Rad3ms+Raa3fp", "V0f000+Rad3ms+Raa3fs", "V0f000+Rad3ms+Raa3ms", + "V0f000+Raf1ap", "V0f000+Raf1as", "V0f000+Raf1as+P+Edfs", "V0f000+Raf1as+P+Rtp3ms", + "V0f000+Raf1fp", "V0f000+Raf1fs", "V0f000+Raf1mp", "V0f000+Raf1ms", + "V0f000+Raf1ms+P+Inms+P+Edmp", "V0f000+Raf2ap", "V0f000+Raf2as", "V0f000+Raf2fp", + "V0f000+Raf2ms", "V0f000+Rao3aa", "V0f000+Rao3aa+La0+P", + "V0f000+Rao3aa+P+Ddms+Scms+Q_+Vei30s", "V0f000+Rao3aa+P+V0f000+Rad3ms", + "V0f000+Rao3aa+Rad1ap", "V0f000+Rao3aa+Rad1as", "V0f000+Rao3aa+Rad1as+La0", + "V0f000+Rao3aa+Rad3fs", "V0f000+Rao3aa+Rad3ms", "V0f000+Sp00", "V0f10p", + "V0f10p+Raa1ap", "V0f10p+Raa3fs", "V0f10p+Raf1ap", "V0f20p", "V0f20p+Raa3ms", + "V0f20s", "V0f30p", "V0f30p+Rad3fs", "V0f30p+Rao3aa", "V0f30p+Rao3aa+Rad1as", + "V0m10p", "V0m10p+Raf1ap", "V0m10p+V0f000+Raa1ap", "V0m20p", "V0m20p+Raa1as", + "V0m20p+Raa3ms", "V0m20p+Rad1as", "V0m20p+Raf2ap", "V0m20p+Raf2ap+P+Rtp3ms", + "V0m20s", "V0m20s+Raa1ap", "V0m20s+Raa1as", "V0m20s+Raa1fs", "V0m20s+Raa2as", + "V0m20s+Raa2ms", "V0m20s+Raa3fp", "V0m20s+Raa3fs", "V0m20s+Raa3mp", "V0m20s+Raa3ms", + "V0m20s+Rad1as", "V0m20s+Rad1as+Raa3ms", "V0m20s+Rad3ap", "V0m20s+Rad3as", + "V0m20s+Rad3as+Raa3mp", "V0m20s+Rad3mp", "V0m20s+Rad3ms", "V0m20s+Raf2as", + "V0m20s+Raf2as+P+Rtp3fp", "V0m20s+Raf2as+Rad1as", "V0m20s+Raf2fs", "V0m20s+Raf2ms", + "V0m20s+V0f000", "V0p0fp", "V0p0fs", "V0p0fs+P+Ddfp+Spfp", "V0p0fs+P+Sp00", + "V0p0mp", "V0p0mp+La0+P+Ddms+Scms", "V0p0ms", "V0p0ms+P", "V0p0ms+P+Sp00", "V0p0xp", + "V0p0xs", "V0x000", "V0x000+P+Edfp", "V0x000+P+Enns", "V0x000+Raa1ap", + "V0x000+Raa1as", "V0x000+Raa1as+P+Rtp3fs", "V0x000+Raa1mp", "V0x000+Raa1ms", + "V0x000+Raa2as", "V0x000+Raa2ms", "V0x000+Raa3fp", "V0x000+Raa3fp+Lp0+Ddmp", + "V0x000+Raa3fs", "V0x000+Raa3mp", "V0x000+Raa3ms", "V0x000+Raa3ms+La0", + "V0x000+Raa3ms+Lp0", "V0x000+Rad1ap", "V0x000+Rad1as", "V0x000+Rad1as+P+Idfs", + "V0x000+Rad1as+Raa3fs", "V0x000+Rad1as+Raa3mp", "V0x000+Rad1as+Raa3ms", + "V0x000+Rad1mp", "V0x000+Rad3ap", "V0x000+Rad3as", "V0x000+Rad3as+Raa3fs", + "V0x000+Rad3as+Raa3ms", "V0x000+Rad3as+V0f000+Raa3mp", "V0x000+Rad3fp", + "V0x000+Rad3fs", "V0x000+Rad3mp", "V0x000+Rad3ms", "V0x000+Rad3ms+Raa3ms", + "V0x000+Raf1as", "V0x000+Raf1as+La0", "V0x000+Raf1mp", "V0x000+Raf1ms", + "V0x000+Raf1ms+Lp0+Ddfs", "V0x000+Rao3aa", "V0x10p", "V0x20p", "Vci10p", + "Vci10p+Raa1ap", "Vci10p+V0f000", "Vci10s", "Vci10s+Raa1ms", "Vci10s+Raa3mp", + "Vci10s+Raa3ms", "Vci10s+Rad1as+V0f000+Raf1as", "Vci10s+Rad3as", + "Vci10s+Rad3as+Raa3ms", "Vci10s+Raf1ms", "Vci10s+V0f000+Raf1as", "Vci20p", + "Vci20p+Raa3mp", "Vci20s", "Vci20s+Raa2as", "Vci30p", "Vci30p+Raa3fp", + "Vci30p+Raa3mp", "Vci30p+Rad1ap", "Vci30p+Rad3as+Raa3ms", "Vci30p+Rao3aa", "Vci30s", + "Vci30s+Raa1as", "Vci30s+Raa3fs", "Vci30s+Raa3ms", "Vci30s+Rad1ap", "Vci30s+Rad1as", + "Vci30s+Rad1ms", "Vci30s+Rad2as", "Vci30s+Rad3ap", "Vci30s+Rad3as", "Vci30s+Rad3fs", + "Vci30s+Rad3ms", "Vci30s+Rad3ms+V0f000", "Vci30s+Raf1as", "Vci30s+Rao3aa", "Vcia0s", + "Vcia0s+Raa1as", "Vcia0s+Raa3ms", "Vcia0s+Rad3ap", "Vcia0s+Rad3as", "Vcia0s+V0f000", + "Vcia0s+V0f000+Raa3ms", "Vei10p", "Vei10p+Lp0+Rtp3ms", "Vei10p+Raa1ap", + "Vei10p+Raa1mp", "Vei10p+Raa3fp", "Vei10p+Raa3fs", "Vei10p+Raa3mp", "Vei10p+Raa3ms", + "Vei10p+Raa3ms+Lp0+V0f10p+Raa3ms", "Vei10p+Rad3as", "Vei10p+Rad3fs", + "Vei10p+Rad3ms", "Vei10p+Rad3ms+V0f000", "Vei10p+Raf1ap", "Vei10p+Raf1as", + "Vei10p+Raf1mp", "Vei10s", "Vei10s+La0", "Vei10s+Lp0+Ddms", "Vei10s+P", + "Vei10s+P+Rtp3ms", "Vei10s+P+Sp00", "Vei10s+P+V0f000+Raf1as", "Vei10s+Q_+Vpi20s", + "Vei10s+Raa1as", "Vei10s+Raa1as+La0", "Vei10s+Raa1ms", "Vei10s+Raa2as", + "Vei10s+Raa2fp", "Vei10s+Raa3fp", "Vei10s+Raa3fp+La0", "Vei10s+Raa3fs", + "Vei10s+Raa3fs+P+Infp", "Vei10s+Raa3mp", "Vei10s+Raa3mp+La0+Cc+P", "Vei10s+Raa3ms", + "Vei10s+Rad1as", "Vei10s+Rad1fs", "Vei10s+Rad2as", "Vei10s+Rad3ap", "Vei10s+Rad3as", + "Vei10s+Rad3as+La0", "Vei10s+Rad3as+Raa3ms", "Vei10s+Rad3fp", "Vei10s+Rad3fs", + "Vei10s+Rad3fs+Raa3ms", "Vei10s+Rad3fs+Raf1as", "Vei10s+Rad3mp", + "Vei10s+Rad3mp+Raa3fs", "Vei10s+Rad3ms", "Vei10s+Rad3ms+Raa3fp", + "Vei10s+Rad3ms+Raf1as", "Vei10s+Raf1as", "Vei10s+Raf1as+La0", + "Vei10s+Raf1as+Lp0+Ddfs", "Vei10s+Raf1as+P+Enns", "Vei10s+Raf1fs", "Vei10s+Raf1ms", + "Vei10s+Raf1ms+Lp0+Ddms", "Vei10s+V0f000", "Vei10s+V0f000+Raa1as", + "Vei10s+V0f000+Raa3fs", "Vei10s+V0f000+Raa3mp", "Vei10s+V0f000+Raa3ms", + "Vei10s+V0f000+Rad3as", "Vei10s+V0f000+Rad3as+Raa3ms", "Vei10s+V0f000+Rad3fs", + "Vei10s+V0f000+Raf1as", "Vei20p", "Vei20s", "Vei20s+Raa1ap", "Vei20s+Raa1as", + "Vei20s+Raa1fs", "Vei20s+Raa3fs", "Vei20s+Raa3ms", "Vei20s+Rad1as", "Vei20s+Raf2as", + "Vei30p", "Vei30p+Raa1ap", "Vei30p+Raa1as", "Vei30p+Raa1fs", "Vei30p+Raa1ms", + "Vei30p+Raa2as", "Vei30p+Raa3fp", "Vei30p+Raa3fs", "Vei30p+Raa3mp", "Vei30p+Raa3ms", + "Vei30p+Raa3ms+Q_+Vei30s", "Vei30p+Rad1ap", "Vei30p+Rad1as", "Vei30p+Rad1as+Raa3ms", + "Vei30p+Rad1as+Raa3ms+Q_+Vei30s", "Vei30p+Rad1ms", "Vei30p+Rad3as", "Vei30p+Rad3fp", + "Vei30p+Rad3fs", "Vei30p+Rad3fs+La0", "Vei30p+Rad3fs+Raa3ms", "Vei30p+Rad3mp", + "Vei30p+Rad3ms", "Vei30p+Rao3aa", "Vei30p+Rao3aa+La0+P", "Vei30p+Rao3aa+Rad1as", + "Vei30p+Rao3aa+Rad3as", "Vei30p+Rao3aa+Rad3fp", "Vei30p+Rao3aa+Rad3fs", + "Vei30p+Rao3aa+Rad3mp", "Vei30p+Rao3aa+Rad3ms", "Vei30s", + "Vei30s+La0+Q,+V0x000+Rao3aa", "Vei30s+Lp0+Ddfs", "Vei30s+P", "Vei30s+P+Enns", + "Vei30s+P+Inms+P+Edmp", "Vei30s+P+Rtp3mp", "Vei30s+Q_", "Vei30s+Raa1ap", + "Vei30s+Raa1as", "Vei30s+Raa1as+Lp0+Ddms", "Vei30s+Raa1as+P+Edfs", "Vei30s+Raa1mp", + "Vei30s+Raa1ms", "Vei30s+Raa2as", "Vei30s+Raa2fs", "Vei30s+Raa3fp", "Vei30s+Raa3fs", + "Vei30s+Raa3fs+La0", "Vei30s+Raa3fs+La0+Lp0+Ddfs", "Vei30s+Raa3fs+Spm0+La0", + "Vei30s+Raa3mp", "Vei30s+Raa3mp+Ddfs+Spfs+P", "Vei30s+Raa3ms", "Vei30s+Raa3ms+La0", + "Vei30s+Raa3ms+P+Cs", "Vei30s+Raa3ms+V0x000+Rad3fs", "Vei30s+Rad1ap", + "Vei30s+Rad1ap+P+Idfs", "Vei30s+Rad1ap+Raa3ms", "Vei30s+Rad1as", + "Vei30s+Rad1as+Raa3fs", "Vei30s+Rad1as+Raa3ms", "Vei30s+Rad1as+V0f000+Raa3fs", + "Vei30s+Rad1fs", "Vei30s+Rad1mp", "Vei30s+Rad1ms", "Vei30s+Rad1ms+Raa3fp", + "Vei30s+Rad2ap+Rad1as", "Vei30s+Rad2as", "Vei30s+Rad2as+Raa3fs", + "Vei30s+Rad2as+Rad1as", "Vei30s+Rad3ap", "Vei30s+Rad3as", "Vei30s+Rad3as+La0", + "Vei30s+Rad3as+Raa3fp", "Vei30s+Rad3as+Raa3fs", "Vei30s+Rad3as+Raa3ms", + "Vei30s+Rad3as+V0m20s+Rad1as", "Vei30s+Rad3fp", "Vei30s+Rad3fs", + "Vei30s+Rad3fs+Raa3ms", "Vei30s+Rad3mp", "Vei30s+Rad3mp+Raa3ms", "Vei30s+Rad3ms", + "Vei30s+Rad3ms+Raa3fs", "Vei30s+Rad3ms+Raa3mp", "Vei30s+Rad3ms+Raa3ms", + "Vei30s+Rao3aa", "Vei30s+Rao3aa+La0+P", "Vei30s+Rao3aa+La0+P+Ddfs+Scfs", + "Vei30s+Rao3aa+La0+P+Ddms+Scms", "Vei30s+Rao3aa+La0+P+Ddms+Scms+P", + "Vei30s+Rao3aa+La0+Q.", "Vei30s+Rao3aa+P+V0f000+Raa3fs", + "Vei30s+Rao3aa+P+V0f000+Raa3fs+P", "Vei30s+Rao3aa+Rad1ap", "Vei30s+Rao3aa+Rad1as", + "Vei30s+Rao3aa+Rad1as+Lp0+Ddms", "Vei30s+Rao3aa+Rad1as+V0f000+Raa1as", + "Vei30s+Rao3aa+Rad1ms", "Vei30s+Rao3aa+Rad3as", "Vei30s+Rao3aa+Rad3fs", + "Vei30s+Rao3aa+Rad3ms", "Vei30s+Scms+A0ms", "Vei30s+V0f000", "Vei30s+V0f000+Raa3fs", + "Vei30s+V0f000+Raa3ms", "Vei30s+V0f000+Rad1as", "Vei30s+V0x000+Raa1as", "Ves10p", + "Ves10s", "Ves20p", "Ves20s", "Ves30p", "Ves30s", "Ves30s+Rao3aa", "Vesa0s", + "Vfi10p", "Vfi10p+Ddms", "Vfi10p+Raa1ap", "Vfi10p+Raa3fs", "Vfi10p+Raa3ms", + "Vfi10p+Rad3fp", "Vfi10p+Rad3mp", "Vfi10p+Raf1ap", "Vfi10s", "Vfi10s+Raa2as", + "Vfi10s+Raa3fs", "Vfi10s+Raa3fs+Q_+Vei30s+Rad3ms", "Vfi10s+Raa3ms", "Vfi10s+Rad1as", + "Vfi10s+Rad2as", "Vfi10s+Rad2as+Raa3mp", "Vfi10s+Rad2mp", "Vfi10s+Rad3ap", + "Vfi10s+Rad3as", "Vfi10s+Rad3as+Raa3ms", "Vfi10s+Rad3mp", "Vfi10s+Raf1as", "Vfi20p", + "Vfi20p+Raa3ms", "Vfi20s", "Vfi20s+Raa3ms", "Vfi20s+Rad1as", + "Vfi20s+V0f000+Raa3ms+P+Ddxs", "Vfi30p", "Vfi30p+Raa3ms", "Vfi30p+Rad1as+Raa3ms", + "Vfi30p+Rad3as", "Vfi30p+Rad3fs", "Vfi30p+Rad3mp", "Vfi30p+Rad3ms", + "Vfi30p+Rad3ms+Raa3ms", "Vfi30p+Rao3aa", "Vfi30p+Rao3aa+Rad3as", "Vfi30p+V0f000", + "Vfi30s", "Vfi30s+Lp0+Ddms", "Vfi30s+Raa1ap", "Vfi30s+Raa3fs", "Vfi30s+Raa3mp", + "Vfi30s+Raa3ms", "Vfi30s+Rad1as", "Vfi30s+Rad1as+Raa3fp", "Vfi30s+Rad2ap", + "Vfi30s+Rad3ap", "Vfi30s+Rad3as", "Vfi30s+Rad3fp", "Vfi30s+Rad3fs", "Vfi30s+Rad3mp", + "Vfi30s+Rad3ms", "Vfi30s+Rad3ms+Ddms+Scms+P", "Vfi30s+Rao3aa", + "Vfi30s+Rao3aa+Rad3as", "Vfi30s+Rao3aa+Rad3fs", "Vfs10p", "Vfs10s", "Vfs20p", + "Vfs20s", "Vfs30p", "Vfs30s", "Vfsa0s", "Vii10p", "Vii10p+Idmp+Ddmp", + "Vii10p+Raa1ap", "Vii10p+Raa3fp", "Vii10p+Raa3fs", "Vii10p+Raa3mp", "Vii10p+Raa3ms", + "Vii10p+Rad1ap", "Vii10p+Rad3as", "Vii10p+Rad3ms", "Vii10p+Raf1ap", + "Vii10p+V0f000+Raa3ms", "Vii10s", "Vii10s+Raa1as", "Vii10s+Raa1ms", "Vii10s+Raa3fp", + "Vii10s+Raa3fs", "Vii10s+Raa3mp", "Vii10s+Raa3mp+P+Ncnms", "Vii10s+Raa3ms", + "Vii10s+Rad1ms", "Vii10s+Rad2as+Raa3ms", "Vii10s+Rad3ap", "Vii10s+Rad3as", + "Vii10s+Rad3as+Raa3ms", "Vii10s+Rad3mp+V0f000", "Vii10s+Rad3ms", "Vii10s+Rad3ms+Q_", + "Vii10s+Raf1as", "Vii10s+Raf1ms", "Vii10s+V0f000+Raa3fs", "Vii10s+V0f000+Rad3fs", + "Vii20p", "Vii20s", "Vii20s+Raa3mp", "Vii20s+Raa3ms", "Vii30p", "Vii30p+Raa1ap", + "Vii30p+Raa1as", "Vii30p+Raa1ms", "Vii30p+Raa1ms+P+Edfs", "Vii30p+Raa3fp", + "Vii30p+Raa3fs", "Vii30p+Raa3mp", "Vii30p+Raa3mp+P+Idfp", "Vii30p+Raa3ms", + "Vii30p+Rad1ap", "Vii30p+Rad1ap+Raa3ms", "Vii30p+Rad1as", "Vii30p+Rad1as+Raa3ms", + "Vii30p+Rad1ms", "Vii30p+Rad2as", "Vii30p+Rad3as", "Vii30p+Rad3fp", "Vii30p+Rad3fs", + "Vii30p+Rad3fs+Raa3ms", "Vii30p+Rad3fs+V0f000", "Vii30p+Rad3mp", "Vii30p+Rad3ms", + "Vii30p+Rad3ms+Raa3ms", "Vii30p+Rao3aa", "Vii30p+Rao3aa+Rad1as", + "Vii30p+Rao3aa+Rad1ms", "Vii30p+Rao3aa+Rad3as", "Vii30p+V0f000+Raa1as", "Vii30s", + "Vii30s+La0", "Vii30s+P+Enfp", "Vii30s+Raa1ap", "Vii30s+Raa1as", + "Vii30s+Raa1as+La0", "Vii30s+Raa1as+Lp0+Ddmp", "Vii30s+Raa1ms", "Vii30s+Raa3fp", + "Vii30s+Raa3fs", "Vii30s+Raa3mp", "Vii30s+Raa3ms", "Vii30s+Rad1ap", "Vii30s+Rad1as", + "Vii30s+Rad1as+Raa3ms", "Vii30s+Rad1as+V0f000+Raa3fs", + "Vii30s+Rad1as+V0f000+Raa3ms", "Vii30s+Rad1fs", "Vii30s+Rad1ms", "Vii30s+Rad2ap", + "Vii30s+Rad2ap+Rad1as", "Vii30s+Rad2as", "Vii30s+Rad2ms", "Vii30s+Rad3ap", + "Vii30s+Rad3as", "Vii30s+Rad3as+La0+P+Ddfs+Scfs", "Vii30s+Rad3as+Raa3ms", + "Vii30s+Rad3as+V0f000+Raa3fs", "Vii30s+Rad3fp", "Vii30s+Rad3fs", "Vii30s+Rad3mp", + "Vii30s+Rad3mp+Raa3ms", "Vii30s+Rad3ms", "Vii30s+Rad3ms+V0f000+Raa3fs", + "Vii30s+Rad3ms+V0f000+Raa3ms", "Vii30s+Raf1ms", "Vii30s+Rao3aa", + "Vii30s+Rao3aa+Rad1as", "Vii30s+Rao3aa+Rad3as", "Vii30s+Rao3aa+Rad3as+V0f000", + "Vii30s+Rao3aa+Rad3fs", "Vii30s+Rao3aa+Rad3mp", "Vii30s+Rao3aa+Rad3ms", + "Vii30s+V0f000+Raa3ms", "Vii30s+V0f000+Rad3ms", "Viia0s", "Viia0s+Idfp+Ddfp", + "Viia0s+Idmp+Ddmp", "Viia0s+La0", "Viia0s+Raa3fp", "Viia0s+Raa3fs", + "Viia0s+Raa3fs+La0", "Viia0s+Raa3mp", "Viia0s+Raa3mp+La0", "Viia0s+Raa3ms", + "Viia0s+Rad2as", "Viia0s+Rad3ap", "Viia0s+Rad3as", "Viia0s+Rad3fs+Raa3ms", + "Viia0s+Rad3mp", "Viia0s+Rad3ms", "Viia0s+V0f000", "Viia0s+V0f000+Raa3fs", + "Viia0s+V0f000+Rad3as", "Viia0s+V0f000+Rad3fs", "Vip30p", "Vip30s", "Vli10p", + "Vli10p+Raa1ap", "Vli10p+Rad3fp", "Vli10s", "Vli10s+Raa3ms", "Vli10s+Raf1as", + "Vli10s+Raf1ms", "Vli10s+V0f000+Rad2as", "Vli10s+V0f000+Rad3as", "Vli20p", + "Vli20p+Rad1ms", "Vli20s", "Vli20s+Rad1as+Raa3ms", "Vli20s+V0f000+Raa2as", "Vli30p", + "Vli30p+Raa1as", "Vli30p+Raa3fs", "Vli30p+Raa3ms", "Vli30p+Rad1as", + "Vli30p+Rad1as+Raa3ms", "Vli30p+Rad3as", "Vli30p+Rad3fs", "Vli30p+Rad3ms", + "Vli30p+Rao3aa", "Vli30p+Rao3aa+Rad1as", "Vli30s", "Vli30s+P+Enns", + "Vli30s+P+Rtp3ms", "Vli30s+Raa1ap", "Vli30s+Raa1as", "Vli30s+Raa3fp", + "Vli30s+Raa3fs", "Vli30s+Raa3mp", "Vli30s+Raa3ms", "Vli30s+Rad1as", "Vli30s+Rad3as", + "Vli30s+Rad3as+Raa3ms", "Vli30s+Rad3fs", "Vli30s+Rad3fs+Raa3ms", "Vli30s+Rad3ms", + "Vli30s+Rad3ms+Raa3ms", "Vli30s+Rao3aa", "Vli30s+Rao3aa+Rad3as", + "Vli30s+Rao3aa+Rad3fs", "Vli30s+V0f000+Raa1as", "Vlia0s", "Vlia0s+Raa3fs", + "Vlia0s+Raa3mp", "Vlia0s+Raa3ms", "Vlia0s+Rad3as", "Vlia0s+Rad3as+Raa3fs", "Vpi10p", + "Vpi10p+Ddfp", "Vpi10p+La0+Q.+V0x000", "Vpi10p+P+Enns", "Vpi10p+Raa1ap", + "Vpi10p+Raa2ap", "Vpi10p+Raa2as", "Vpi10p+Raa3fs", "Vpi10p+Raa3mp", "Vpi10p+Raa3ms", + "Vpi10p+Rad1ap", "Vpi10p+Rad2as+Raa3fs", "Vpi10p+Rad2as+Raa3ms", "Vpi10p+Rad2fs", + "Vpi10p+Rad3as", "Vpi10p+Rad3mp", "Vpi10p+Rad3ms", "Vpi10p+Raf1ap", + "Vpi10p+Raf1ap+Lp0+Ddfp", "Vpi10p+Raf1fp", "Vpi10p+Raf1mp", "Vpi10p+Raf1mp+P+Idmp", + "Vpi10p+V0f000+Raa3ms", "Vpi10p+V0f000+Raf1ap", "Vpi10s", "Vpi10s+Lp0+Ddfs", + "Vpi10s+P+V0f000+Raa1ms+P", "Vpi10s+P+V0f000+Raf1as", "Vpi10s+P+V0f000+Raf1ms+P", + "Vpi10s+Raa1as", "Vpi10s+Raa1as+La0", "Vpi10s+Raa1fs", "Vpi10s+Raa1ms", + "Vpi10s+Raa2as", "Vpi10s+Raa2fs", "Vpi10s+Raa2ms", "Vpi10s+Raa3fp", "Vpi10s+Raa3fs", + "Vpi10s+Raa3mp", "Vpi10s+Raa3ms", "Vpi10s+Rad1as", "Vpi10s+Rad2ap", + "Vpi10s+Rad2ap+Raa3ms", "Vpi10s+Rad2as", "Vpi10s+Rad2as+Raa3ms", + "Vpi10s+Rad2as+Rad3fs+Raa3ms", "Vpi10s+Rad2fp", "Vpi10s+Rad2fs", "Vpi10s+Rad2ms", + "Vpi10s+Rad3ap", "Vpi10s+Rad3as", "Vpi10s+Rad3as+Raa3ms", "Vpi10s+Rad3fp", + "Vpi10s+Rad3fs", "Vpi10s+Rad3mp", "Vpi10s+Rad3ms", "Vpi10s+Rad3ms+Raa3fs", + "Vpi10s+Rad3ms+Raa3ms", "Vpi10s+Raf1as", "Vpi10s+Raf1as+P+V0f000+Raa3ms+P", + "Vpi10s+Raf1fs", "Vpi10s+Raf1ms", "Vpi10s+V0f000", "Vpi10s+V0f000+Raa1as", + "Vpi10s+V0f000+Raa3ms", "Vpi10s+V0f000+Rad3as", "Vpi10s+V0f000+Raf1as", + "Vpi10s+V0f000+Rao3aa+Rad1as", "Vpi10s+Zamp+Scmp", "Vpi20p", "Vpi20p+Raa3ms", + "Vpi20p+Raf2ap", "Vpi20s", "Vpi20s+Raa1as", "Vpi20s+Raa2as", "Vpi20s+Raa3fs", + "Vpi20s+Raa3ms", "Vpi20s+Rad1as", "Vpi20s+Rad1as+V0f000", "Vpi20s+Rad1fs", + "Vpi20s+Rad1ms", "Vpi20s+Rad3as", "Vpi20s+Rad3fs", "Vpi20s+Raf2as", "Vpi20s+Raf2ms", + "Vpi20s+V0f000+Raa3ms", "Vpi20s+V0f000+Raf2as", "Vpi30p", "Vpi30p+Cs", + "Vpi30p+P+Sp00", "Vpi30p+Q\"+Cc", "Vpi30p+Raa1ap", "Vpi30p+Raa1as", "Vpi30p+Raa1ms", + "Vpi30p+Raa2as", "Vpi30p+Raa3fp", "Vpi30p+Raa3fs", "Vpi30p+Raa3mp", "Vpi30p+Raa3ms", + "Vpi30p+Rad1ap", "Vpi30p+Rad1as", "Vpi30p+Rad1as+Raa3fp", "Vpi30p+Rad1as+Raa3ms", + "Vpi30p+Rad1fs", "Vpi30p+Rad1mp", "Vpi30p+Rad1ms", "Vpi30p+Rad2as", "Vpi30p+Rad3ap", + "Vpi30p+Rad3as", "Vpi30p+Rad3as+La0", "Vpi30p+Rad3as+Raa3ms", "Vpi30p+Rad3fp", + "Vpi30p+Rad3fs", "Vpi30p+Rad3mp", "Vpi30p+Rad3mp+Raa3ms", "Vpi30p+Rad3ms", + "Vpi30p+Rao3aa", "Vpi30p+Rao3aa+La0+P+Ddfp+Scfp", "Vpi30p+Rao3aa+Rad1ap", + "Vpi30p+Rao3aa+Rad1as", "Vpi30p+Rao3aa+Rad3as", "Vpi30p+Rao3aa+Rad3fs", + "Vpi30p+V0f000+Raa3fs", "Vpi30p+V0f000+Raa3ms", "Vpi30p+V0f000+Rad1ap", "Vpi30s", + "Vpi30s+A0ms", "Vpi30s+A0ms+La0", "Vpi30s+Idfp+Ddfp", "Vpi30s+La0", + "Vpi30s+Lp0+Ddms", "Vpi30s+P+Infs", "Vpi30s+Raa1ap", "Vpi30s+Raa1ap+Lcs+La0", + "Vpi30s+Raa1as", "Vpi30s+Raa1as+V0x000+Raa1as", "Vpi30s+Raa1fs", "Vpi30s+Raa1mp", + "Vpi30s+Raa1ms", "Vpi30s+Raa1ms+P+Idmp", "Vpi30s+Raa2as", "Vpi30s+Raa3fp", + "Vpi30s+Raa3fs", "Vpi30s+Raa3mp", "Vpi30s+Raa3ms", "Vpi30s+Rad1ap", + "Vpi30s+Rad1ap+Raa3fs", "Vpi30s+Rad1as", "Vpi30s+Rad1as+Idfp+Ddfp", + "Vpi30s+Rad1as+V0f000+Raa3fs", "Vpi30s+Rad1fp+V0f000+Rad1fp", "Vpi30s+Rad1fs", + "Vpi30s+Rad1ms", "Vpi30s+Rad1ms+V0f000+Raa3fs", "Vpi30s+Rad2as", "Vpi30s+Rad2fp", + "Vpi30s+Rad2fs", "Vpi30s+Rad2ms", "Vpi30s+Rad3ap", "Vpi30s+Rad3as", + "Vpi30s+Rad3as+Raa3ms", "Vpi30s+Rad3as+V0f000", "Vpi30s+Rad3fp", "Vpi30s+Rad3fs", + "Vpi30s+Rad3fs+Raa3ms", "Vpi30s+Rad3mp", "Vpi30s+Rad3ms", + "Vpi30s+Rad3ms+Ddms+Scms+P", "Vpi30s+Rad3ms+Raa3fs", "Vpi30s+Rad3ms+Raa3ms", + "Vpi30s+Rao3aa", "Vpi30s+Rao3aa+La0+P+Ddms+Scms", "Vpi30s+Rao3aa+Q_", + "Vpi30s+Rao3aa+Rad1ap", "Vpi30s+Rao3aa+Rad1as", "Vpi30s+Rao3aa+Rad2as", + "Vpi30s+Rao3aa+Rad3ap", "Vpi30s+Rao3aa+Rad3as", "Vpi30s+Rao3aa+Rad3fs", + "Vpi30s+Rao3aa+Rad3mp", "Vpi30s+Rao3aa+Rad3ms", "Vpi30s+Sp00", "Vpi30s+V0f000", + "Vpi30s+V0f000+Raa1ap", "Vpi30s+V0f000+Raa1as", "Vpi30s+V0f000+Raa3fp", + "Vpi30s+V0f000+Raa3fs", "Vpi30s+V0f000+Raa3ms", "Vpi30s+V0f000+Rad1ap", + "Vpi30s+V0f000+Rad3ap", "Vps10p", "Vps10p+La0", "Vps10p+Q_+Vpi30s+Rad3mp", + "Vps10p+Raa1ap", "Vps10p+Raa3ms", "Vps10p+Raf1ap", "Vps10p+V0f000+Raa3fs", "Vps10s", + "Vps20p", "Vps20s", "Vps30p", "Vps30p+P+Ddxp", "Vps30p+P+Ddxs", "Vps30p+Raa3ms", + "Vps30p+Rad1as", "Vps30p+Rad3fs", "Vps30p+Rao3aa", "Vps30s", "Vps30s+P+Ddxp", + "Vps30s+P+Ddxs", "Vps30s+Raa1ap", "Vps30s+Raa1as", "Vps30s+Raa3fp", "Vps30s+Raa3fs", + "Vps30s+Raa3ms", "Vps30s+Rad1ap", "Vps30s+Rad1as", "Vps30s+Rad1as+Ddms+P", + "Vps30s+Rad1ms", "Vps30s+Rad3as", "Vps30s+Rad3fs", "Vps30s+Rad3mp+Raa3ms", + "Vps30s+Rao3aa", "Vps30s+V0f000", "Vpsa0s", "Wg", "Wm", "Wm+Sp00", "Wn", "Wn+Lp0", + "Wn+Lp0+Ddmp+Scmp+P", "Wn+P", "Wn+P+Sp00", "Wn+P+V0f000+Rao3aa", "Wn+Q_+Vei30s", + "Wn+Rad1ap+Raa3ms", "Wn+Rad1as+Raa3ms", "Wn+Rad2as+Raa3ms", "Wn+Rad3as+Raa3ms", + "Wn+Rad3fs+Raa3ms", "Wn+Rad3ms+Raa3ms", "Wn+Scms", "Wn+V0f000+Raa3ms", + "Wn+Vei30s+Raa3fs", "Wn+Vii30p+Rad3mp", "Wn+Vpi20s", "Wn+Vpi30p", "Wr", "Y", "Za00", + "Za00+Enmp", "Za00+Sp00", "Zaap", "Zaas", "Zafp", "Zafp+Scfp", "Zafp+Scfs", + "Zafp+Spfs+Vpi30s", "Zafs", "Zamp", "Zams", "Zams+Ncnms", "Zams+Spms", "Zaxp", + "Zaxs", "Zf00", "Zg00", "Zgaa", "Zgap", "Zgas", "Zgfa", "Zgfp", "Zgfs", "Zgma", + "Zgmp", "Zgms", "Zo00", "Zo00+Dims", "Zo00+Scfp", "Zo00+Scmp", "Zo00+Scms", + "Zo00+Wn", "Zs00", "Zs00+A0fs+Zs00+A0as+Zs00+A0as+Zs00+A0as+Zs00+A0as+Scms", + "Zs00+Ncdmp", "Zs00+Ncnmp", "Zs00+Ncnms" }; + + runTest("gl", "xiada", tagset, "Este é un exame .", + new String[] { "este", "ser", "un", "exame", "." }, + new String[] { "Enms", "Vpi30s", "Dims", "Scms", "Q." }, + new String[] { "POS_PRON", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testPolish() + throws Exception + { + String[] tagset = { "SENT", "adj:pl:acc:f:com", "adj:pl:acc:f:pos", "adj:pl:acc:f:sup", + "adj:pl:acc:m1:com", "adj:pl:acc:m1:pos", "adj:pl:acc:m1:sup", "adj:pl:acc:m2:com", + "adj:pl:acc:m2:pos", "adj:pl:acc:m2:sup", "adj:pl:acc:m3:com", "adj:pl:acc:m3:pos", + "adj:pl:acc:m3:sup", "adj:pl:acc:n:com", "adj:pl:acc:n:pos", "adj:pl:acc:n:sup", + "adj:pl:dat:f:com", "adj:pl:dat:f:pos", "adj:pl:dat:f:sup", "adj:pl:dat:m1:com", + "adj:pl:dat:m1:pos", "adj:pl:dat:m1:sup", "adj:pl:dat:m2:pos", "adj:pl:dat:m3:com", + "adj:pl:dat:m3:pos", "adj:pl:dat:n:pos", "adj:pl:dat:n:sup", "adj:pl:gen:f:com", + "adj:pl:gen:f:pos", "adj:pl:gen:f:sup", "adj:pl:gen:m1:com", "adj:pl:gen:m1:pos", + "adj:pl:gen:m1:sup", "adj:pl:gen:m2:com", "adj:pl:gen:m2:pos", "adj:pl:gen:m2:sup", + "adj:pl:gen:m3:com", "adj:pl:gen:m3:pos", "adj:pl:gen:m3:sup", "adj:pl:gen:n:com", + "adj:pl:gen:n:pos", "adj:pl:gen:n:sup", "adj:pl:inst:f:com", "adj:pl:inst:f:pos", + "adj:pl:inst:f:sup", "adj:pl:inst:m1:com", "adj:pl:inst:m1:pos", + "adj:pl:inst:m1:sup", "adj:pl:inst:m2:pos", "adj:pl:inst:m3:com", + "adj:pl:inst:m3:pos", "adj:pl:inst:m3:sup", "adj:pl:inst:n:com", + "adj:pl:inst:n:pos", "adj:pl:inst:n:sup", "adj:pl:loc:f:com", "adj:pl:loc:f:pos", + "adj:pl:loc:f:sup", "adj:pl:loc:m1:com", "adj:pl:loc:m1:pos", "adj:pl:loc:m1:sup", + "adj:pl:loc:m2:pos", "adj:pl:loc:m3:com", "adj:pl:loc:m3:pos", "adj:pl:loc:m3:sup", + "adj:pl:loc:n:com", "adj:pl:loc:n:pos", "adj:pl:loc:n:sup", "adj:pl:nom:f:com", + "adj:pl:nom:f:pos", "adj:pl:nom:f:sup", "adj:pl:nom:m1:com", "adj:pl:nom:m1:pos", + "adj:pl:nom:m1:sup", "adj:pl:nom:m2:com", "adj:pl:nom:m2:pos", "adj:pl:nom:m2:sup", + "adj:pl:nom:m3:com", "adj:pl:nom:m3:pos", "adj:pl:nom:m3:sup", "adj:pl:nom:n:com", + "adj:pl:nom:n:pos", "adj:pl:nom:n:sup", "adj:sg:acc:f:com", "adj:sg:acc:f:pos", + "adj:sg:acc:f:sup", "adj:sg:acc:m1:com", "adj:sg:acc:m1:pos", "adj:sg:acc:m1:sup", + "adj:sg:acc:m2:com", "adj:sg:acc:m2:pos", "adj:sg:acc:m2:sup", "adj:sg:acc:m3:com", + "adj:sg:acc:m3:pos", "adj:sg:acc:m3:sup", "adj:sg:acc:n:com", "adj:sg:acc:n:pos", + "adj:sg:acc:n:sup", "adj:sg:dat:f:com", "adj:sg:dat:f:pos", "adj:sg:dat:f:sup", + "adj:sg:dat:m1:com", "adj:sg:dat:m1:pos", "adj:sg:dat:m1:sup", "adj:sg:dat:m2:pos", + "adj:sg:dat:m3:com", "adj:sg:dat:m3:pos", "adj:sg:dat:m3:sup", "adj:sg:dat:n:com", + "adj:sg:dat:n:pos", "adj:sg:dat:n:sup", "adj:sg:gen:f:com", "adj:sg:gen:f:pos", + "adj:sg:gen:f:sup", "adj:sg:gen:m1:com", "adj:sg:gen:m1:pos", "adj:sg:gen:m1:sup", + "adj:sg:gen:m2:pos", "adj:sg:gen:m2:sup", "adj:sg:gen:m3:com", "adj:sg:gen:m3:pos", + "adj:sg:gen:m3:sup", "adj:sg:gen:n:com", "adj:sg:gen:n:pos", "adj:sg:gen:n:sup", + "adj:sg:inst:f:com", "adj:sg:inst:f:pos", "adj:sg:inst:f:sup", + "adj:sg:inst:m1:com", "adj:sg:inst:m1:pos", "adj:sg:inst:m1:sup", + "adj:sg:inst:m2:com", "adj:sg:inst:m2:pos", "adj:sg:inst:m2:sup", + "adj:sg:inst:m3:com", "adj:sg:inst:m3:pos", "adj:sg:inst:m3:sup", + "adj:sg:inst:n:com", "adj:sg:inst:n:pos", "adj:sg:inst:n:sup", "adj:sg:loc:f:com", + "adj:sg:loc:f:pos", "adj:sg:loc:f:sup", "adj:sg:loc:m1:com", "adj:sg:loc:m1:pos", + "adj:sg:loc:m1:sup", "adj:sg:loc:m2:com", "adj:sg:loc:m2:pos", "adj:sg:loc:m3:com", + "adj:sg:loc:m3:pos", "adj:sg:loc:m3:sup", "adj:sg:loc:n:com", "adj:sg:loc:n:pos", + "adj:sg:loc:n:sup", "adj:sg:nom:f:com", "adj:sg:nom:f:pos", "adj:sg:nom:f:sup", + "adj:sg:nom:m1:com", "adj:sg:nom:m1:pos", "adj:sg:nom:m1:sup", "adj:sg:nom:m2:com", + "adj:sg:nom:m2:pos", "adj:sg:nom:m2:sup", "adj:sg:nom:m3:com", "adj:sg:nom:m3:pos", + "adj:sg:nom:m3:sup", "adj:sg:nom:n:com", "adj:sg:nom:n:pos", "adj:sg:nom:n:sup", + "adj:sg:voc:f:pos", "adj:sg:voc:f:sup", "adj:sg:voc:m1:pos", "adj:sg:voc:m1:sup", + "adj:sg:voc:m2:pos", "adj:sg:voc:m3:pos", "adj:sg:voc:n:pos", "adja", "adjc", + "adjp", "adv", "adv:com", "adv:pos", "adv:sup", "aglt:pl:pri:imperf:nwok", + "aglt:pl:pri:imperf:wok", "aglt:pl:sec:imperf:nwok", "aglt:sg:pri:imperf:nwok", + "aglt:sg:pri:imperf:wok", "aglt:sg:sec:imperf:nwok", "aglt:sg:sec:imperf:wok", + "aglt:sg:ter:imperf:nwok", "bedzie:pl:pri:imperf", "bedzie:pl:sec:imperf", + "bedzie:pl:ter:imperf", "bedzie:sg:pri:imperf", "bedzie:sg:sec:imperf", + "bedzie:sg:ter:imperf", "brev:npun", "brev:pun", "burk", "comp", "conj", + "depr:pl:acc:m2", "depr:pl:nom:m2", "fin:pl:pri:imperf", "fin:pl:pri:perf", + "fin:pl:sec:imperf", "fin:pl:sec:perf", "fin:pl:ter:imperf", "fin:pl:ter:perf", + "fin:sg:pri:imperf", "fin:sg:pri:perf", "fin:sg:sec:imperf", "fin:sg:sec:perf", + "fin:sg:ter:imperf", "fin:sg:ter:perf", "ger:pl:dat:n:perf:aff", + "ger:pl:gen:n:imperf:aff", "ger:pl:gen:n:perf:aff", "ger:pl:inst:n:imperf:aff", + "ger:pl:inst:n:perf:aff", "ger:pl:loc:n:imperf:aff", "ger:pl:nom:n:imperf:aff", + "ger:pl:nom:n:perf:aff", "ger:sg:acc:n:imperf:aff", "ger:sg:acc:n:imperf:neg", + "ger:sg:acc:n:perf:aff", "ger:sg:acc:n:perf:neg", "ger:sg:dat:n:imperf:aff", + "ger:sg:dat:n:perf:aff", "ger:sg:gen:n:imperf:aff", "ger:sg:gen:n:imperf:neg", + "ger:sg:gen:n:perf:aff", "ger:sg:gen:n:perf:neg", "ger:sg:inst:n:imperf:aff", + "ger:sg:inst:n:imperf:neg", "ger:sg:inst:n:perf:aff", "ger:sg:inst:n:perf:neg", + "ger:sg:loc:n:imperf:aff", "ger:sg:loc:n:imperf:neg", "ger:sg:loc:n:perf:aff", + "ger:sg:loc:n:perf:neg", "ger:sg:nom:n:imperf:aff", "ger:sg:nom:n:imperf:neg", + "ger:sg:nom:n:perf:aff", "ger:sg:nom:n:perf:neg", "imps:imperf", "imps:perf", + "impt:pl:pri:imperf", "impt:pl:pri:perf", "impt:pl:sec:imperf", "impt:pl:sec:perf", + "impt:sg:sec:imperf", "impt:sg:sec:perf", "inf:imperf", "inf:perf", "interj", + "interp", "num:pl:acc:f:congr", "num:pl:acc:f:rec", "num:pl:acc:m1:congr", + "num:pl:acc:m1:rec", "num:pl:acc:m2:congr", "num:pl:acc:m2:rec", + "num:pl:acc:m3:congr", "num:pl:acc:m3:rec", "num:pl:acc:n:congr", + "num:pl:acc:n:rec", "num:pl:dat:f:congr", "num:pl:dat:m1:congr", + "num:pl:dat:m2:congr", "num:pl:dat:m3:congr", "num:pl:dat:m3:rec", + "num:pl:dat:n:congr", "num:pl:gen:f:congr", "num:pl:gen:f:rec", + "num:pl:gen:m1:congr", "num:pl:gen:m1:rec", "num:pl:gen:m2:congr", + "num:pl:gen:m2:rec", "num:pl:gen:m3:congr", "num:pl:gen:m3:rec", + "num:pl:gen:n:congr", "num:pl:gen:n:rec", "num:pl:inst:f:congr", + "num:pl:inst:m1:congr", "num:pl:inst:m2:congr", "num:pl:inst:m3:congr", + "num:pl:inst:m3:rec", "num:pl:inst:n:congr", "num:pl:loc:f:congr", + "num:pl:loc:f:rec", "num:pl:loc:m1:congr", "num:pl:loc:m2:congr", + "num:pl:loc:m2:rec", "num:pl:loc:m3:congr", "num:pl:loc:m3:rec", + "num:pl:loc:n:congr", "num:pl:nom:f:congr", "num:pl:nom:f:rec", + "num:pl:nom:m1:congr", "num:pl:nom:m1:rec", "num:pl:nom:m2:congr", + "num:pl:nom:m2:rec", "num:pl:nom:m3:congr", "num:pl:nom:m3:rec", + "num:pl:nom:n:congr", "num:pl:nom:n:rec", "num:sg:acc:m3:rec", + "num:sg:gen:m1:congr", "num:sg:gen:m3:congr", "num:sg:gen:m3:rec", + "num:sg:nom:f:rec", "num:sg:nom:m3:congr", "num:sg:nom:m3:rec", "num:sg:nom:n:rec", + "numcol:pl:acc:m1:rec", "numcol:pl:acc:n:rec", "numcol:pl:dat:m1:congr", + "numcol:pl:gen:m1:congr", "numcol:pl:gen:m1:rec", "numcol:pl:gen:n:congr", + "numcol:pl:gen:n:rec", "numcol:pl:inst:m1:rec", "numcol:pl:inst:n:rec", + "numcol:pl:nom:m1:rec", "numcol:pl:nom:n:rec", "pact:pl:acc:f:imperf:aff", + "pact:pl:acc:f:imperf:neg", "pact:pl:acc:m1:imperf:aff", + "pact:pl:acc:m2:imperf:aff", "pact:pl:acc:m3:imperf:aff", + "pact:pl:acc:m3:imperf:neg", "pact:pl:acc:n:imperf:aff", + "pact:pl:acc:n:imperf:neg", "pact:pl:dat:f:imperf:aff", + "pact:pl:dat:m1:imperf:aff", "pact:pl:dat:m2:imperf:aff", + "pact:pl:dat:m3:imperf:aff", "pact:pl:dat:n:imperf:aff", + "pact:pl:gen:f:imperf:aff", "pact:pl:gen:f:imperf:neg", + "pact:pl:gen:m1:imperf:aff", "pact:pl:gen:m1:imperf:neg", + "pact:pl:gen:m2:imperf:aff", "pact:pl:gen:m3:imperf:aff", + "pact:pl:gen:m3:imperf:neg", "pact:pl:gen:n:imperf:aff", + "pact:pl:inst:f:imperf:aff", "pact:pl:inst:m1:imperf:aff", + "pact:pl:inst:m2:imperf:aff", "pact:pl:inst:m3:imperf:aff", + "pact:pl:inst:m3:imperf:neg", "pact:pl:inst:n:imperf:aff", + "pact:pl:inst:n:imperf:neg", "pact:pl:loc:f:imperf:aff", + "pact:pl:loc:m1:imperf:aff", "pact:pl:loc:m3:imperf:aff", + "pact:pl:loc:m3:imperf:neg", "pact:pl:loc:n:imperf:aff", + "pact:pl:loc:n:imperf:neg", "pact:pl:nom:f:imperf:aff", "pact:pl:nom:f:imperf:neg", + "pact:pl:nom:m1:imperf:aff", "pact:pl:nom:m2:imperf:aff", + "pact:pl:nom:m3:imperf:aff", "pact:pl:nom:n:imperf:aff", + "pact:pl:nom:n:imperf:neg", "pact:sg:acc:f:imperf:aff", "pact:sg:acc:f:imperf:neg", + "pact:sg:acc:m1:imperf:aff", "pact:sg:acc:m2:imperf:aff", + "pact:sg:acc:m3:imperf:aff", "pact:sg:acc:n:imperf:aff", + "pact:sg:acc:n:imperf:neg", "pact:sg:dat:f:imperf:aff", + "pact:sg:dat:m1:imperf:aff", "pact:sg:dat:m2:imperf:aff", + "pact:sg:dat:m3:imperf:aff", "pact:sg:dat:n:imperf:aff", + "pact:sg:gen:f:imperf:aff", "pact:sg:gen:f:imperf:neg", + "pact:sg:gen:m1:imperf:aff", "pact:sg:gen:m1:imperf:neg", + "pact:sg:gen:m2:imperf:aff", "pact:sg:gen:m3:imperf:aff", + "pact:sg:gen:m3:imperf:neg", "pact:sg:gen:n:imperf:aff", + "pact:sg:gen:n:imperf:neg", "pact:sg:inst:f:imperf:aff", + "pact:sg:inst:f:imperf:neg", "pact:sg:inst:m1:imperf:aff", + "pact:sg:inst:m1:imperf:neg", "pact:sg:inst:m2:imperf:aff", + "pact:sg:inst:m2:imperf:neg", "pact:sg:inst:m3:imperf:aff", + "pact:sg:inst:m3:imperf:neg", "pact:sg:inst:n:imperf:aff", + "pact:sg:loc:f:imperf:aff", "pact:sg:loc:f:imperf:neg", + "pact:sg:loc:m1:imperf:aff", "pact:sg:loc:m2:imperf:aff", + "pact:sg:loc:m3:imperf:aff", "pact:sg:loc:m3:imperf:neg", + "pact:sg:loc:n:imperf:aff", "pact:sg:loc:n:imperf:neg", "pact:sg:nom:f:imperf:aff", + "pact:sg:nom:f:imperf:neg", "pact:sg:nom:m1:imperf:aff", + "pact:sg:nom:m1:imperf:neg", "pact:sg:nom:m2:imperf:aff", + "pact:sg:nom:m3:imperf:aff", "pact:sg:nom:m3:imperf:neg", + "pact:sg:nom:n:imperf:aff", "pact:sg:nom:n:imperf:neg", + "pact:sg:voc:m1:imperf:aff", "pant:perf", "pcon:imperf", + "ppas:pl:acc:f:imperf:aff", "ppas:pl:acc:f:perf:aff", "ppas:pl:acc:f:perf:neg", + "ppas:pl:acc:m1:imperf:aff", "ppas:pl:acc:m1:imperf:neg", + "ppas:pl:acc:m1:perf:aff", "ppas:pl:acc:m2:imperf:aff", "ppas:pl:acc:m2:perf:aff", + "ppas:pl:acc:m3:imperf:aff", "ppas:pl:acc:m3:perf:aff", "ppas:pl:acc:m3:perf:neg", + "ppas:pl:acc:n:imperf:aff", "ppas:pl:acc:n:imperf:neg", "ppas:pl:acc:n:perf:aff", + "ppas:pl:acc:n:perf:neg", "ppas:pl:dat:f:imperf:aff", "ppas:pl:dat:f:perf:aff", + "ppas:pl:dat:f:perf:neg", "ppas:pl:dat:m1:imperf:aff", "ppas:pl:dat:m1:perf:aff", + "ppas:pl:dat:m1:perf:neg", "ppas:pl:dat:m2:imperf:aff", + "ppas:pl:dat:m3:imperf:aff", "ppas:pl:dat:m3:perf:aff", "ppas:pl:dat:n:imperf:aff", + "ppas:pl:dat:n:perf:aff", "ppas:pl:gen:f:imperf:aff", "ppas:pl:gen:f:imperf:neg", + "ppas:pl:gen:f:perf:aff", "ppas:pl:gen:f:perf:neg", "ppas:pl:gen:m1:imperf:aff", + "ppas:pl:gen:m1:imperf:neg", "ppas:pl:gen:m1:perf:aff", "ppas:pl:gen:m1:perf:neg", + "ppas:pl:gen:m2:imperf:aff", "ppas:pl:gen:m2:perf:aff", + "ppas:pl:gen:m3:imperf:aff", "ppas:pl:gen:m3:imperf:neg", + "ppas:pl:gen:m3:perf:aff", "ppas:pl:gen:m3:perf:neg", "ppas:pl:gen:n:imperf:aff", + "ppas:pl:gen:n:perf:aff", "ppas:pl:gen:n:perf:neg", "ppas:pl:inst:f:imperf:aff", + "ppas:pl:inst:f:perf:aff", "ppas:pl:inst:m1:imperf:aff", + "ppas:pl:inst:m1:perf:aff", "ppas:pl:inst:m2:perf:aff", + "ppas:pl:inst:m3:imperf:aff", "ppas:pl:inst:m3:perf:aff", + "ppas:pl:inst:n:imperf:aff", "ppas:pl:inst:n:perf:aff", "ppas:pl:loc:f:imperf:aff", + "ppas:pl:loc:f:imperf:neg", "ppas:pl:loc:f:perf:aff", "ppas:pl:loc:f:perf:neg", + "ppas:pl:loc:m1:imperf:aff", "ppas:pl:loc:m1:perf:aff", + "ppas:pl:loc:m2:imperf:aff", "ppas:pl:loc:m3:imperf:aff", + "ppas:pl:loc:m3:perf:aff", "ppas:pl:loc:m3:perf:neg", "ppas:pl:loc:n:imperf:aff", + "ppas:pl:loc:n:perf:aff", "ppas:pl:loc:n:perf:neg", "ppas:pl:nom:f:imperf:aff", + "ppas:pl:nom:f:imperf:neg", "ppas:pl:nom:f:perf:aff", "ppas:pl:nom:f:perf:neg", + "ppas:pl:nom:m1:imperf:aff", "ppas:pl:nom:m1:imperf:neg", + "ppas:pl:nom:m1:perf:aff", "ppas:pl:nom:m1:perf:neg", "ppas:pl:nom:m2:imperf:aff", + "ppas:pl:nom:m2:perf:aff", "ppas:pl:nom:m3:imperf:aff", + "ppas:pl:nom:m3:imperf:neg", "ppas:pl:nom:m3:perf:aff", "ppas:pl:nom:m3:perf:neg", + "ppas:pl:nom:n:imperf:aff", "ppas:pl:nom:n:perf:aff", "ppas:pl:nom:n:perf:neg", + "ppas:sg:acc:f:imperf:aff", "ppas:sg:acc:f:imperf:neg", "ppas:sg:acc:f:perf:aff", + "ppas:sg:acc:f:perf:neg", "ppas:sg:acc:m1:imperf:aff", "ppas:sg:acc:m1:perf:aff", + "ppas:sg:acc:m2:imperf:aff", "ppas:sg:acc:m2:perf:aff", + "ppas:sg:acc:m3:imperf:aff", "ppas:sg:acc:m3:imperf:neg", + "ppas:sg:acc:m3:perf:aff", "ppas:sg:acc:m3:perf:neg", "ppas:sg:acc:n:imperf:aff", + "ppas:sg:acc:n:perf:aff", "ppas:sg:acc:n:perf:neg", "ppas:sg:dat:f:imperf:aff", + "ppas:sg:dat:f:imperf:neg", "ppas:sg:dat:f:perf:aff", "ppas:sg:dat:f:perf:neg", + "ppas:sg:dat:m1:imperf:aff", "ppas:sg:dat:m1:perf:aff", + "ppas:sg:dat:m3:imperf:aff", "ppas:sg:dat:m3:perf:aff", "ppas:sg:dat:n:perf:aff", + "ppas:sg:gen:f:imperf:aff", "ppas:sg:gen:f:imperf:neg", "ppas:sg:gen:f:perf:aff", + "ppas:sg:gen:f:perf:neg", "ppas:sg:gen:m1:imperf:aff", "ppas:sg:gen:m1:perf:aff", + "ppas:sg:gen:m1:perf:neg", "ppas:sg:gen:m2:imperf:aff", "ppas:sg:gen:m2:perf:aff", + "ppas:sg:gen:m3:imperf:aff", "ppas:sg:gen:m3:imperf:neg", + "ppas:sg:gen:m3:perf:aff", "ppas:sg:gen:m3:perf:neg", "ppas:sg:gen:n:imperf:aff", + "ppas:sg:gen:n:imperf:neg", "ppas:sg:gen:n:perf:aff", "ppas:sg:gen:n:perf:neg", + "ppas:sg:inst:f:imperf:aff", "ppas:sg:inst:f:imperf:neg", + "ppas:sg:inst:f:perf:aff", "ppas:sg:inst:f:perf:neg", "ppas:sg:inst:m1:imperf:aff", + "ppas:sg:inst:m1:imperf:neg", "ppas:sg:inst:m1:perf:aff", + "ppas:sg:inst:m1:perf:neg", "ppas:sg:inst:m2:imperf:aff", + "ppas:sg:inst:m2:perf:aff", "ppas:sg:inst:m3:imperf:aff", + "ppas:sg:inst:m3:imperf:neg", "ppas:sg:inst:m3:perf:aff", + "ppas:sg:inst:m3:perf:neg", "ppas:sg:inst:n:imperf:aff", + "ppas:sg:inst:n:imperf:neg", "ppas:sg:inst:n:perf:aff", "ppas:sg:inst:n:perf:neg", + "ppas:sg:loc:f:imperf:aff", "ppas:sg:loc:f:perf:aff", "ppas:sg:loc:f:perf:neg", + "ppas:sg:loc:m1:imperf:aff", "ppas:sg:loc:m1:perf:aff", + "ppas:sg:loc:m2:imperf:aff", "ppas:sg:loc:m3:imperf:aff", + "ppas:sg:loc:m3:imperf:neg", "ppas:sg:loc:m3:perf:aff", "ppas:sg:loc:m3:perf:neg", + "ppas:sg:loc:n:imperf:aff", "ppas:sg:loc:n:perf:aff", "ppas:sg:loc:n:perf:neg", + "ppas:sg:nom:f:imperf:aff", "ppas:sg:nom:f:imperf:neg", "ppas:sg:nom:f:perf:aff", + "ppas:sg:nom:f:perf:neg", "ppas:sg:nom:m1:imperf:aff", "ppas:sg:nom:m1:imperf:neg", + "ppas:sg:nom:m1:perf:aff", "ppas:sg:nom:m1:perf:neg", "ppas:sg:nom:m2:imperf:aff", + "ppas:sg:nom:m2:perf:aff", "ppas:sg:nom:m3:imperf:aff", + "ppas:sg:nom:m3:imperf:neg", "ppas:sg:nom:m3:perf:aff", "ppas:sg:nom:m3:perf:neg", + "ppas:sg:nom:n:imperf:aff", "ppas:sg:nom:n:imperf:neg", "ppas:sg:nom:n:perf:aff", + "ppas:sg:nom:n:perf:neg", "ppas:sg:voc:m2:imperf:aff", "ppron12:pl:acc:f:pri", + "ppron12:pl:acc:f:sec", "ppron12:pl:acc:m1:pri", "ppron12:pl:acc:m1:sec", + "ppron12:pl:acc:m2:sec", "ppron12:pl:acc:n:sec", "ppron12:pl:dat:f:pri", + "ppron12:pl:dat:f:sec", "ppron12:pl:dat:m1:pri", "ppron12:pl:dat:m1:sec", + "ppron12:pl:dat:m3:sec", "ppron12:pl:gen:f:pri", "ppron12:pl:gen:f:sec", + "ppron12:pl:gen:m1:pri", "ppron12:pl:gen:m1:sec", "ppron12:pl:gen:m2:pri", + "ppron12:pl:inst:f:pri", "ppron12:pl:inst:m1:pri", "ppron12:pl:inst:m1:sec", + "ppron12:pl:inst:n:pri", "ppron12:pl:loc:f:sec", "ppron12:pl:loc:m1:pri", + "ppron12:pl:loc:m1:sec", "ppron12:pl:loc:m3:sec", "ppron12:pl:nom:f:pri", + "ppron12:pl:nom:f:sec", "ppron12:pl:nom:m1:pri", "ppron12:pl:nom:m1:pri:akc", + "ppron12:pl:nom:m1:sec", "ppron12:pl:nom:m1:sec:akc", "ppron12:pl:nom:m2:pri", + "ppron12:pl:nom:m2:sec", "ppron12:pl:nom:n:sec", "ppron12:sg:acc:f:pri:akc", + "ppron12:sg:acc:f:sec:akc", "ppron12:sg:acc:f:sec:nakc", + "ppron12:sg:acc:m1:pri:akc", "ppron12:sg:acc:m1:pri:nakc", + "ppron12:sg:acc:m1:sec:akc", "ppron12:sg:acc:m1:sec:nakc", + "ppron12:sg:acc:m2:pri:akc", "ppron12:sg:acc:m2:sec:nakc", + "ppron12:sg:acc:m3:pri:akc", "ppron12:sg:acc:m3:sec:nakc", + "ppron12:sg:acc:n:pri:akc", "ppron12:sg:acc:n:sec:nakc", + "ppron12:sg:dat:f:pri:akc", "ppron12:sg:dat:f:pri:nakc", + "ppron12:sg:dat:f:sec:akc", "ppron12:sg:dat:f:sec:nakc", + "ppron12:sg:dat:m1:pri:akc", "ppron12:sg:dat:m1:pri:nakc", + "ppron12:sg:dat:m1:sec:akc", "ppron12:sg:dat:m1:sec:nakc", + "ppron12:sg:dat:m2:pri:nakc", "ppron12:sg:dat:m2:sec:akc", + "ppron12:sg:dat:m2:sec:nakc", "ppron12:sg:gen:f:pri:akc", + "ppron12:sg:gen:f:sec:akc", "ppron12:sg:gen:f:sec:nakc", + "ppron12:sg:gen:m1:pri:akc", "ppron12:sg:gen:m1:sec:akc", + "ppron12:sg:gen:m1:sec:nakc", "ppron12:sg:gen:m2:sec:akc", + "ppron12:sg:gen:m2:sec:nakc", "ppron12:sg:gen:n:pri:akc", "ppron12:sg:inst:f:pri", + "ppron12:sg:inst:f:sec", "ppron12:sg:inst:m1:pri", "ppron12:sg:inst:m1:pri:nakc", + "ppron12:sg:inst:m1:sec", "ppron12:sg:inst:n:sec", "ppron12:sg:loc:f:pri", + "ppron12:sg:loc:f:sec", "ppron12:sg:loc:m1:pri", "ppron12:sg:loc:m1:sec", + "ppron12:sg:loc:m3:pri", "ppron12:sg:nom:f:pri", + "ppron12:sg:nom:f:sec", "ppron12:sg:nom:m1:pri", "ppron12:sg:nom:m1:pri:akc", + "ppron12:sg:nom:m1:pri:nakc", "ppron12:sg:nom:m1:sec", "ppron12:sg:nom:m1:sec:akc", + "ppron12:sg:nom:m2:pri", "ppron12:sg:nom:m2:sec", "ppron12:sg:nom:m3:pri", + "ppron12:sg:nom:m3:sec", "ppron12:sg:nom:n:sec", "ppron12:sg:voc:n:sec", + "ppron3:pl:acc:f:ter:akc:npraep", "ppron3:pl:acc:f:ter:akc:praep", + "ppron3:pl:acc:m1:ter:akc:npraep", "ppron3:pl:acc:m1:ter:akc:praep", + "ppron3:pl:acc:m2:ter:akc:npraep", "ppron3:pl:acc:m2:ter:akc:praep", + "ppron3:pl:acc:m3:ter:akc:npraep", "ppron3:pl:acc:m3:ter:akc:praep", + "ppron3:pl:acc:n:ter:akc:npraep", "ppron3:pl:acc:n:ter:akc:praep", + "ppron3:pl:dat:f:ter:akc:npraep", "ppron3:pl:dat:f:ter:akc:praep", + "ppron3:pl:dat:m1:ter:akc:npraep", "ppron3:pl:dat:m1:ter:akc:praep", + "ppron3:pl:dat:m2:ter:akc:npraep", "ppron3:pl:dat:m3:ter:akc:npraep", + "ppron3:pl:dat:m3:ter:akc:praep", "ppron3:pl:dat:n:ter:akc:npraep", + "ppron3:pl:gen:f:ter:akc:npraep", "ppron3:pl:gen:f:ter:akc:praep", + "ppron3:pl:gen:m1:ter:akc:npraep", "ppron3:pl:gen:m1:ter:akc:praep", + "ppron3:pl:gen:m2:ter:akc:npraep", "ppron3:pl:gen:m2:ter:akc:praep", + "ppron3:pl:gen:m3:ter:akc:npraep", "ppron3:pl:gen:m3:ter:akc:praep", + "ppron3:pl:gen:n:ter:akc:npraep", "ppron3:pl:gen:n:ter:akc:praep", + "ppron3:pl:inst:f:ter:akc:npraep", "ppron3:pl:inst:f:ter:akc:praep", + "ppron3:pl:inst:m1:ter:akc:npraep", "ppron3:pl:inst:m1:ter:akc:praep", + "ppron3:pl:inst:m2:ter:akc:npraep", "ppron3:pl:inst:m2:ter:akc:praep", + "ppron3:pl:inst:m3:ter:akc:npraep", "ppron3:pl:inst:m3:ter:akc:praep", + "ppron3:pl:inst:n:ter:akc:npraep", "ppron3:pl:inst:n:ter:akc:praep", + "ppron3:pl:loc:f:ter:akc:praep", "ppron3:pl:loc:m1:ter:akc:praep", + "ppron3:pl:loc:m2:ter:akc:praep", "ppron3:pl:loc:m3:ter:akc:praep", + "ppron3:pl:loc:n:ter:akc:praep", "ppron3:pl:nom:f:ter:akc:npraep", + "ppron3:pl:nom:m1:ter:akc:npraep", "ppron3:pl:nom:m2:ter:akc:npraep", + "ppron3:pl:nom:m3:ter:akc:npraep", "ppron3:pl:nom:n:ter:akc:npraep", + "ppron3:sg:acc:f:ter:akc:npraep", "ppron3:sg:acc:f:ter:akc:praep", + "ppron3:sg:acc:m1:ter:akc:npraep", "ppron3:sg:acc:m1:ter:akc:praep", + "ppron3:sg:acc:m1:ter:nakc:npraep", "ppron3:sg:acc:m1:ter:nakc:praep", + "ppron3:sg:acc:m2:ter:akc:praep", "ppron3:sg:acc:m2:ter:nakc:npraep", + "ppron3:sg:acc:m2:ter:nakc:praep", "ppron3:sg:acc:m3:ter:akc:npraep", + "ppron3:sg:acc:m3:ter:akc:praep", "ppron3:sg:acc:m3:ter:nakc:npraep", + "ppron3:sg:acc:m3:ter:nakc:praep", "ppron3:sg:acc:n:ter:akc:npraep", + "ppron3:sg:acc:n:ter:akc:praep", "ppron3:sg:dat:f:ter:akc:npraep", + "ppron3:sg:dat:f:ter:akc:praep", "ppron3:sg:dat:m1:ter:akc:npraep", + "ppron3:sg:dat:m1:ter:akc:praep", "ppron3:sg:dat:m1:ter:nakc:npraep", + "ppron3:sg:dat:m2:ter:akc:npraep", "ppron3:sg:dat:m2:ter:nakc:npraep", + "ppron3:sg:dat:m3:ter:akc:npraep", "ppron3:sg:dat:m3:ter:akc:praep", + "ppron3:sg:dat:m3:ter:nakc:npraep", "ppron3:sg:dat:n:ter:akc:npraep", + "ppron3:sg:dat:n:ter:akc:praep", "ppron3:sg:dat:n:ter:nakc:npraep", + "ppron3:sg:gen:f:ter:akc:npraep", "ppron3:sg:gen:f:ter:akc:praep", + "ppron3:sg:gen:m1:ter:akc:npraep", "ppron3:sg:gen:m1:ter:akc:praep", + "ppron3:sg:gen:m1:ter:nakc:npraep", "ppron3:sg:gen:m1:ter:nakc:praep", + "ppron3:sg:gen:m2:ter:akc:npraep", "ppron3:sg:gen:m2:ter:akc:praep", + "ppron3:sg:gen:m2:ter:nakc:npraep", "ppron3:sg:gen:m3:ter:akc:npraep", + "ppron3:sg:gen:m3:ter:akc:praep", "ppron3:sg:gen:m3:ter:nakc:npraep", + "ppron3:sg:gen:m3:ter:nakc:praep", "ppron3:sg:gen:n:ter:akc:npraep", + "ppron3:sg:gen:n:ter:akc:praep", "ppron3:sg:gen:n:ter:nakc:npraep", + "ppron3:sg:inst:f:ter:akc:praep", "ppron3:sg:inst:m1:ter:akc:npraep", + "ppron3:sg:inst:m1:ter:akc:praep", "ppron3:sg:inst:m2:ter:akc:npraep", + "ppron3:sg:inst:m2:ter:akc:praep", "ppron3:sg:inst:m3:ter:akc:npraep", + "ppron3:sg:inst:m3:ter:akc:praep", "ppron3:sg:inst:n:ter:akc:npraep", + "ppron3:sg:inst:n:ter:akc:praep", "ppron3:sg:loc:f:ter:akc:praep", + "ppron3:sg:loc:m1:ter:akc:praep", "ppron3:sg:loc:m2:ter:akc:praep", + "ppron3:sg:loc:m3:ter:akc:praep", "ppron3:sg:loc:n:ter:akc:praep", + "ppron3:sg:nom:f:ter:akc:npraep", "ppron3:sg:nom:f:ter:akc:praep", + "ppron3:sg:nom:m1:ter:akc:npraep", "ppron3:sg:nom:m2:ter:akc:npraep", + "ppron3:sg:nom:m2:ter:akc:praep", "ppron3:sg:nom:m3:ter:akc:npraep", + "ppron3:sg:nom:n:ter:akc:npraep", "praet:pl:f:imperf", "praet:pl:f:perf", + "praet:pl:m1:imperf", "praet:pl:m1:imperf:agl", "praet:pl:m1:perf", + "praet:pl:m1:perf:nagl", "praet:pl:m2:imperf", "praet:pl:m2:perf", + "praet:pl:m3:imperf", "praet:pl:m3:perf", "praet:pl:n:imperf", "praet:pl:n:perf", + "praet:sg:f:imperf", "praet:sg:f:imperf:agl", "praet:sg:f:imperf:nagl", + "praet:sg:f:perf", "praet:sg:m1:imperf", "praet:sg:m1:imperf:agl", + "praet:sg:m1:imperf:nagl", "praet:sg:m1:perf", "praet:sg:m1:perf:agl", + "praet:sg:m1:perf:nagl", "praet:sg:m2:imperf", "praet:sg:m2:imperf:nagl", + "praet:sg:m2:perf", "praet:sg:m2:perf:nagl", "praet:sg:m3:imperf", + "praet:sg:m3:imperf:nagl", "praet:sg:m3:perf", "praet:sg:m3:perf:nagl", + "praet:sg:n:imperf", "praet:sg:n:perf", "pred", "prep:acc", "prep:acc:nwok", + "prep:acc:wok", "prep:dat", "prep:gen", "prep:gen:nwok", "prep:gen:wok", + "prep:inst", "prep:inst:nwok", "prep:inst:wok", "prep:loc", "prep:loc:nwok", + "prep:loc:wok", "prep:nom", "qub", "qub:nwok", "qub:wok", "siebie:acc", + "siebie:dat", "siebie:gen", "siebie:inst", "siebie:loc", "subst:pl:acc:f", + "subst:pl:acc:m1", "subst:pl:acc:m2", "subst:pl:acc:m3", "subst:pl:acc:n", + "subst:pl:dat:f", "subst:pl:dat:m1", "subst:pl:dat:m2", "subst:pl:dat:m3", + "subst:pl:dat:n", "subst:pl:gen:f", "subst:pl:gen:m1", "subst:pl:gen:m2", + "subst:pl:gen:m3", "subst:pl:gen:n", "subst:pl:inst:f", "subst:pl:inst:m1", + "subst:pl:inst:m2", "subst:pl:inst:m3", "subst:pl:inst:n", "subst:pl:loc:f", + "subst:pl:loc:m1", "subst:pl:loc:m2", "subst:pl:loc:m3", "subst:pl:loc:n", + "subst:pl:nom:f", "subst:pl:nom:m1", "subst:pl:nom:m2", "subst:pl:nom:m3", + "subst:pl:nom:n", "subst:sg:acc:f", "subst:sg:acc:m1", "subst:sg:acc:m2", + "subst:sg:acc:m3", "subst:sg:acc:n", "subst:sg:dat:f", "subst:sg:dat:m1", + "subst:sg:dat:m2", "subst:sg:dat:m3", "subst:sg:dat:n", "subst:sg:gen:f", + "subst:sg:gen:m1", "subst:sg:gen:m2", "subst:sg:gen:m3", "subst:sg:gen:n", + "subst:sg:inst:f", "subst:sg:inst:m1", "subst:sg:inst:m2", "subst:sg:inst:m3", + "subst:sg:inst:n", "subst:sg:loc:f", "subst:sg:loc:m1", "subst:sg:loc:m2", + "subst:sg:loc:m3", "subst:sg:loc:n", "subst:sg:nom:f", "subst:sg:nom:m1", + "subst:sg:nom:m2", "subst:sg:nom:m3", "subst:sg:nom:n", "subst:sg:voc:f", + "subst:sg:voc:m1", "subst:sg:voc:m2", "subst:sg:voc:m3", "subst:sg:voc:n", + "winien:pl:f:imperf", "winien:pl:m1:imperf", "winien:pl:m2:imperf", + "winien:pl:m3:imperf", "winien:pl:n:imperf", "winien:sg:f:imperf", + "winien:sg:m1:imperf", "winien:sg:m2:imperf", "winien:sg:m3:imperf", + "winien:sg:n:imperf", "xxx" }; + + runTest("pl", "ncp", tagset, "To badanie .", + new String[] { "ten", "badanie", "." }, + new String[] { "adj:sg:acc:n:pos", "subst:sg:acc:n", "SENT" }, + new String[] { "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testRussian() + throws Exception + { + String[] tagset = { ",", "-", "Afcmsnf", "Afpfpgf", "Afpfsaf", "Afpfsas", "Afpfsdf", + "Afpfsgf", "Afpfsif", "Afpfslf", "Afpfsnf", "Afpfsns", "Afpmpaf", "Afpmpdf", + "Afpmpgf", "Afpmpif", "Afpmplf", "Afpmpnf", "Afpmpns", "Afpmsaf", "Afpmsdf", + "Afpmsds", "Afpmsgf", "Afpmsgs", "Afpmsif", "Afpmslf", "Afpmsnf", "Afpmsns", + "Afpnpaf", "Afpnpnf", "Afpnsaf", "Afpnsdf", "Afpnsgf", "Afpnsif", "Afpnslf", + "Afpnsnf", "Afpnsns", "C", "I", "Mc", "Mc---d", "Mc--a", "Mc--ad", "Mc--d", + "Mc--dd", "Mc--g", "Mc--gd", "Mc--i", "Mc--id", "Mc--l", "Mc--n", "Mcf-a", "Mcf-d", + "Mcf-g", "Mcf-i", "Mcf-l", "Mcf-n", "Mcm-a", "Mcm-d", "Mcm-g", "Mcm-i", "Mcm-l", + "Mcm-n", "Mcn-a", "Mcn-d", "Mcn-g", "Mcn-i", "Mcn-l", "Mcn-n", "Mo---d", "Mo--g", + "Mo--i", "Mo-pa", "Mo-pad", "Mo-pd", "Mo-pdd", "Mo-pg", "Mo-pgd", "Mo-pi", + "Mo-pid", "Mo-pl", "Mo-pld", "Mo-pn", "Mo-pnd", "Mo-sad", "Mof", "Mof-a", "Mof-d", + "Mof-g", "Mof-i", "Mof-l", "Mof-n", "Mofsa", "Mofsad", "Mofsd", "Mofsdd", "Mofsg", + "Mofsgd", "Mofsi", "Mofsid", "Mofsl", "Mofsld", "Mofsn", "Mofsnd", "Mom-a", + "Mom-d", "Mom-g", "Mom-i", "Mom-l", "Mom-n", "Momsa", "Momsad", "Momsd", "Momsg", + "Momsgd", "Momsi", "Momsid", "Momsl", "Momsld", "Momsn", "Momsnd", "Mon-a", + "Mon-d", "Mon-g", "Mon-i", "Mon-l", "Mon-n", "Monsa", "Monsad", "Monsd", "Monsg", + "Monsgd", "Monsi", "Monsid", "Monsl", "Monsn", "Monsnd", "Nccpay", "Nccpdy", + "Nccpgy", "Nccpiy", "Nccply", "Nccpny", "Nccsay", "Nccsdy", "Nccsgn", "Nccsgy", + "Nccsiy", "Nccsly", "Nccsnn", "Nccsny", "Ncfpan", "Ncfpay", "Ncfpdn", "Ncfpdy", + "Ncfpgn", "Ncfpgy", "Ncfpin", "Ncfpiy", "Ncfpln", "Ncfply", "Ncfpnn", "Ncfpny", + "Ncfsan", "Ncfsay", "Ncfsdn", "Ncfsdy", "Ncfsgn", "Ncfsgy", "Ncfsin", "Ncfsiy", + "Ncfsln", "Ncfsly", "Ncfsnn", "Ncfsnnl", "Ncfsnnp", "Ncfsny", "Ncfsvy", "Ncmpan", + "Ncmpay", "Ncmpdn", "Ncmpdy", "Ncmpgn", "Ncmpgy", "Ncmpin", "Ncmpiy", "Ncmpln", + "Ncmply", "Ncmpnn", "Ncmpnnl", "Ncmpny", "Ncmsan", "Ncmsay", "Ncmsdn", "Ncmsdy", + "Ncmsgn", "Ncmsgy", "Ncmsin", "Ncmsiy", "Ncmsln", "Ncmsly", "Ncmsnn", "Ncmsnnl", + "Ncmsnnp", "Ncmsny", "Ncmsvn", "Ncmsvy", "Ncnpan", "Ncnpay", "Ncnpdn", "Ncnpdy", + "Ncnpgn", "Ncnpgy", "Ncnpin", "Ncnpiy", "Ncnpln", "Ncnply", "Ncnpnn", "Ncnpny", + "Ncnsan", "Ncnsay", "Ncnsdn", "Ncnsdy", "Ncnsgn", "Ncnsgy", "Ncnsin", "Ncnsiy", + "Ncnsln", "Ncnsly", "Ncnsnn", "Ncnsny", "Npcpay", "Npcsay", "Npcsdy", "Npcsgy", + "Npcsiy", "Npcsly", "Npcsnn", "Npcsny", "Npcsvy", "Npfpay", "Npfpdy", "Npfpgy", + "Npfpiy", "Npfpny", "Npfsay", "Npfsdy", "Npfsgn", "Npfsgy", "Npfsiy", "Npfsly", + "Npfsnn", "Npfsny", "Npfsvy", "Npmpay", "Npmpdy", "Npmpgy", "Npmpiy", "Npmpny", + "Npmpvy", "Npmsay", "Npmsdn", "Npmsdy", "Npmsgn", "Npmsgy", "Npmsiy", "Npmsly", + "Npmsnn", "Npmsny", "Npmsvy", "Npnsan", "Npnsnn", "P-----a", "P-----r", "P----an", + "P----ar", "P----dn", "P----dr", "P----gn", "P----gr", "P----in", "P----ir", + "P----ln", "P----nn", "P---p-a", "P---paa", "P---pan", "P---pda", "P---pdn", + "P---pga", "P---pgn", "P---pia", "P---pin", "P---pla", "P---pln", "P---pna", + "P---pnn", "P---san", "P---sar", "P---sdn", "P---sdr", "P---sga", "P---sgn", + "P---sgr", "P---sia", "P---sin", "P---sir", "P---sln", "P---snn", "P--f-aa", + "P--f-la", "P--fpaa", "P--fs-a", "P--fsaa", "P--fsan", "P--fsda", "P--fsdn", + "P--fsga", "P--fsgn", "P--fsia", "P--fsin", "P--fsla", "P--fsln", "P--fsna", + "P--fsnn", "P--m-aa", "P--m-ga", "P--m-ia", "P--m-la", "P--mpga", "P--ms-a", + "P--msaa", "P--msan", "P--msda", "P--msdn", "P--msga", "P--msgn", "P--msia", + "P--msin", "P--msla", "P--msln", "P--msna", "P--msnn", "P--n-an", "P--n-ga", + "P--n-la", "P--n-na", "P--npan", "P--npgn", "P--npnn", "P--ns-a", "P--nsaa", + "P--nsan", "P--nsda", "P--nsdn", "P--nsga", "P--nsgn", "P--nsia", "P--nsin", + "P--nsla", "P--nsln", "P--nsna", "P--nsnn", "P-1-pan", "P-1-pdn", "P-1-pgn", + "P-1-pin", "P-1-pln", "P-1-pnn", "P-1-san", "P-1-sdn", "P-1-sgn", "P-1-sin", + "P-1-sln", "P-1-snn", "P-1nsnn", "P-2-pan", "P-2-pdn", "P-2-pgn", "P-2-pin", + "P-2-pln", "P-2-pnn", "P-2-san", "P-2-sdn", "P-2-sgn", "P-2-sin", "P-2-sln", + "P-2-snn", "P-2msdn", "P-2nsan", "P-3-pan", "P-3-pdn", "P-3-pgn", "P-3-pin", + "P-3-pln", "P-3-pnn", "P-3-san", "P-3fsan", "P-3fsdn", "P-3fsgn", "P-3fsin", + "P-3fsln", "P-3fsnn", "P-3msan", "P-3msdn", "P-3msgn", "P-3msin", "P-3msln", + "P-3msnn", "P-3nsan", "P-3nsdn", "P-3nsgn", "P-3nsin", "P-3nsln", "P-3nsnn", "Q", + "R", "Rc", "SENT", "Sp-a", "Sp-d", "Sp-g", "Sp-i", "Sp-l", "Sp-n", "Vmg----a-p", + "Vmg----m-p", "Vmgp---a-e", "Vmgp---a-p", "Vmgp---m-e", "Vmgp---m-p", "Vmgs---a-e", + "Vmgs---a-p", "Vmgs---m-e", "Vmgs---m-p", "Vmi-1--a-e", "Vmif1p-a-e", "Vmif1p-a-p", + "Vmif1p-m-p", "Vmif1s-a-e", "Vmif1s-a-p", "Vmif1s-m-p", "Vmif2p-a-e", "Vmif2p-a-p", + "Vmif2p-m-p", "Vmif2s-a-e", "Vmif2s-a-p", "Vmif2s-m-p", "Vmif3p-a-e", "Vmif3p-a-p", + "Vmif3p-m-p", "Vmif3s-a-e", "Vmif3s-a-p", "Vmif3s-m-p", "Vmip---m-e", "Vmip1p-a-e", + "Vmip1p-a-p", "Vmip1p-m-e", "Vmip1s-a-e", "Vmip1s-a-p", "Vmip1s-m-e", "Vmip2p-a-e", + "Vmip2p-m-e", "Vmip2s-a-e", "Vmip2s-m-e", "Vmip3p-a-e", "Vmip3p-a-p", "Vmip3p-m-e", + "Vmip3p-p-e", "Vmip3s-a-e", "Vmip3s-m-e", "Vmip3s-p-e", "Vmis---a-e", "Vmis---a-p", + "Vmis---m-e", "Vmis--nm-e", "Vmis-p-a-e", "Vmis-p-a-p", "Vmis-p-m-e", "Vmis-p-m-p", + "Vmis-p-p-e", "Vmis-s-a-e", "Vmis-s-a-p", "Vmis-sfa-e", "Vmis-sfa-p", "Vmis-sfm-e", + "Vmis-sfm-p", "Vmis-sfp-e", "Vmis-sma-e", "Vmis-sma-p", "Vmis-smm-e", "Vmis-smm-p", + "Vmis-smp-e", "Vmis-smp-p", "Vmis-sna-e", "Vmis-sna-p", "Vmis-snm-e", "Vmis-snm-p", + "Vmis-snp-e", "Vmm--s-a-e", "Vmm-1p-a-e", "Vmm-1p-a-p", "Vmm-1p-m-p", "Vmm-1s-a-e", + "Vmm-1s-a-p", "Vmm-1s-m-p", "Vmm-2--a-e", "Vmm-2--a-p", "Vmm-2p-a-e", "Vmm-2p-a-p", + "Vmm-2p-m-e", "Vmm-2p-m-p", "Vmm-2s-a-e", "Vmm-2s-a-p", "Vmm-2s-m-e", "Vmm-2s-m-p", + "Vmn----a-e", "Vmn----a-p", "Vmn----m-e", "Vmn----m-p", "Vmn----p-e", + "Vmpp-p-a-ea", "Vmpp-p-a-ed", "Vmpp-p-a-eg", "Vmpp-p-a-ei", "Vmpp-p-a-el", + "Vmpp-p-a-en", "Vmpp-p-afea", "Vmpp-p-afed", "Vmpp-p-afeg", "Vmpp-p-afei", + "Vmpp-p-afel", "Vmpp-p-afen", "Vmpp-p-m-ea", "Vmpp-p-m-ed", "Vmpp-p-m-eg", + "Vmpp-p-m-ei", "Vmpp-p-m-el", "Vmpp-p-m-en", "Vmpp-p-mfea", "Vmpp-p-mfed", + "Vmpp-p-mfeg", "Vmpp-p-mfei", "Vmpp-p-mfel", "Vmpp-p-mfen", "Vmpp-p-p-ea", + "Vmpp-p-p-ed", "Vmpp-p-p-eg", "Vmpp-p-p-en", "Vmpp-p-pfea", "Vmpp-p-pfed", + "Vmpp-p-pfeg", "Vmpp-p-pfei", "Vmpp-p-pfel", "Vmpp-p-pfen", "Vmpp-p-pse", + "Vmpp-pma-eg", "Vmpp-s-a-ei", "Vmpp-s-afei", "Vmpp-sfa-ea", "Vmpp-sfa-ed", + "Vmpp-sfa-eg", "Vmpp-sfa-ei", "Vmpp-sfa-el", "Vmpp-sfa-en", "Vmpp-sfafea", + "Vmpp-sfafed", "Vmpp-sfafeg", "Vmpp-sfafei", "Vmpp-sfafel", "Vmpp-sfafen", + "Vmpp-sfm-ea", "Vmpp-sfm-ed", "Vmpp-sfm-eg", "Vmpp-sfm-ei", "Vmpp-sfm-el", + "Vmpp-sfm-en", "Vmpp-sfmfea", "Vmpp-sfmfed", "Vmpp-sfmfeg", "Vmpp-sfmfei", + "Vmpp-sfmfel", "Vmpp-sfmfen", "Vmpp-sfp-ea", "Vmpp-sfp-eg", "Vmpp-sfp-ei", + "Vmpp-sfp-el", "Vmpp-sfp-en", "Vmpp-sfpfea", "Vmpp-sfpfed", "Vmpp-sfpfeg", + "Vmpp-sfpfei", "Vmpp-sfpfel", "Vmpp-sfpfen", "Vmpp-sfpse", "Vmpp-sma-ea", + "Vmpp-sma-ed", "Vmpp-sma-eg", "Vmpp-sma-ei", "Vmpp-sma-el", "Vmpp-sma-en", + "Vmpp-smafea", "Vmpp-smafed", "Vmpp-smafeg", "Vmpp-smafei", "Vmpp-smafel", + "Vmpp-smafen", "Vmpp-smase", "Vmpp-smm-ea", "Vmpp-smm-ed", "Vmpp-smm-eg", + "Vmpp-smm-ei", "Vmpp-smm-el", "Vmpp-smm-en", "Vmpp-smmfea", "Vmpp-smmfed", + "Vmpp-smmfeg", "Vmpp-smmfei", "Vmpp-smmfel", "Vmpp-smmfen", "Vmpp-smp-ea", + "Vmpp-smp-eg", "Vmpp-smp-ei", "Vmpp-smp-el", "Vmpp-smp-en", "Vmpp-smpfea", + "Vmpp-smpfed", "Vmpp-smpfeg", "Vmpp-smpfei", "Vmpp-smpfel", "Vmpp-smpfen", + "Vmpp-smpse", "Vmpp-sna-ea", "Vmpp-sna-ed", "Vmpp-sna-eg", "Vmpp-sna-ei", + "Vmpp-sna-el", "Vmpp-sna-en", "Vmpp-snafea", "Vmpp-snafed", "Vmpp-snafeg", + "Vmpp-snafei", "Vmpp-snafel", "Vmpp-snafen", "Vmpp-snm-ea", "Vmpp-snm-ed", + "Vmpp-snm-eg", "Vmpp-snm-ei", "Vmpp-snm-en", "Vmpp-snmfea", "Vmpp-snmfed", + "Vmpp-snmfeg", "Vmpp-snmfei", "Vmpp-snmfel", "Vmpp-snmfen", "Vmpp-snp-ea", + "Vmpp-snp-ed", "Vmpp-snp-eg", "Vmpp-snp-ei", "Vmpp-snp-en", "Vmpp-snpfea", + "Vmpp-snpfed", "Vmpp-snpfeg", "Vmpp-snpfei", "Vmpp-snpfel", "Vmpp-snpfen", + "Vmpp-snpse", "Vmps-p-a-ea", "Vmps-p-a-ed", "Vmps-p-a-eg", "Vmps-p-a-ei", + "Vmps-p-a-el", "Vmps-p-a-en", "Vmps-p-a-pa", "Vmps-p-a-pd", "Vmps-p-a-pg", + "Vmps-p-a-pi", "Vmps-p-a-pl", "Vmps-p-a-pn", "Vmps-p-afea", "Vmps-p-afed", + "Vmps-p-afeg", "Vmps-p-afei", "Vmps-p-afel", "Vmps-p-afen", "Vmps-p-afpa", + "Vmps-p-afpd", "Vmps-p-afpg", "Vmps-p-afpi", "Vmps-p-afpl", "Vmps-p-afpn", + "Vmps-p-m-ea", "Vmps-p-m-eg", "Vmps-p-m-ei", "Vmps-p-m-el", "Vmps-p-m-en", + "Vmps-p-m-pa", "Vmps-p-m-pd", "Vmps-p-m-pg", "Vmps-p-m-pi", "Vmps-p-m-pl", + "Vmps-p-m-pn", "Vmps-p-mfea", "Vmps-p-mfed", "Vmps-p-mfeg", "Vmps-p-mfei", + "Vmps-p-mfel", "Vmps-p-mfen", "Vmps-p-mfpa", "Vmps-p-mfpd", "Vmps-p-mfpg", + "Vmps-p-mfpi", "Vmps-p-mfpl", "Vmps-p-mfpn", "Vmps-p-p-ed", "Vmps-p-p-eg", + "Vmps-p-p-ei", "Vmps-p-p-en", "Vmps-p-p-pa", "Vmps-p-p-pd", "Vmps-p-p-pg", + "Vmps-p-p-pi", "Vmps-p-p-pl", "Vmps-p-p-pn", "Vmps-p-pfea", "Vmps-p-pfed", + "Vmps-p-pfeg", "Vmps-p-pfei", "Vmps-p-pfel", "Vmps-p-pfen", "Vmps-p-pfpa", + "Vmps-p-pfpd", "Vmps-p-pfpg", "Vmps-p-pfpi", "Vmps-p-pfpl", "Vmps-p-pfpn", + "Vmps-p-pse", "Vmps-p-psp", "Vmps-s-pfpa", "Vmps-s-pfpn", "Vmps-sfa-ea", + "Vmps-sfa-ed", "Vmps-sfa-eg", "Vmps-sfa-ei", "Vmps-sfa-el", "Vmps-sfa-en", + "Vmps-sfa-pa", "Vmps-sfa-pd", "Vmps-sfa-pg", "Vmps-sfa-pi", "Vmps-sfa-pl", + "Vmps-sfa-pn", "Vmps-sfafea", "Vmps-sfafed", "Vmps-sfafeg", "Vmps-sfafei", + "Vmps-sfafel", "Vmps-sfafen", "Vmps-sfafpa", "Vmps-sfafpd", "Vmps-sfafpg", + "Vmps-sfafpi", "Vmps-sfafpl", "Vmps-sfafpn", "Vmps-sfm-ea", "Vmps-sfm-eg", + "Vmps-sfm-el", "Vmps-sfm-en", "Vmps-sfm-pa", "Vmps-sfm-pd", "Vmps-sfm-pg", + "Vmps-sfm-pi", "Vmps-sfm-pl", "Vmps-sfm-pn", "Vmps-sfmfea", "Vmps-sfmfed", + "Vmps-sfmfeg", "Vmps-sfmfei", "Vmps-sfmfel", "Vmps-sfmfen", "Vmps-sfmfpa", + "Vmps-sfmfpd", "Vmps-sfmfpg", "Vmps-sfmfpi", "Vmps-sfmfpl", "Vmps-sfmfpn", + "Vmps-sfp-ea", "Vmps-sfp-ed", "Vmps-sfp-eg", "Vmps-sfp-ei", "Vmps-sfp-en", + "Vmps-sfp-pa", "Vmps-sfp-pd", "Vmps-sfp-pg", "Vmps-sfp-pi", "Vmps-sfp-pl", + "Vmps-sfp-pn", "Vmps-sfpfea", "Vmps-sfpfed", "Vmps-sfpfeg", "Vmps-sfpfei", + "Vmps-sfpfel", "Vmps-sfpfen", "Vmps-sfpfpa", "Vmps-sfpfpd", "Vmps-sfpfpg", + "Vmps-sfpfpi", "Vmps-sfpfpl", "Vmps-sfpfpn", "Vmps-sfpse", "Vmps-sfpsp", + "Vmps-sma-ea", "Vmps-sma-ed", "Vmps-sma-eg", "Vmps-sma-ei", "Vmps-sma-el", + "Vmps-sma-en", "Vmps-sma-pa", "Vmps-sma-pd", "Vmps-sma-pg", "Vmps-sma-pi", + "Vmps-sma-pl", "Vmps-sma-pn", "Vmps-smafea", "Vmps-smafed", "Vmps-smafeg", + "Vmps-smafei", "Vmps-smafel", "Vmps-smafen", "Vmps-smafpa", "Vmps-smafpd", + "Vmps-smafpg", "Vmps-smafpi", "Vmps-smafpl", "Vmps-smafpn", "Vmps-smm-ea", + "Vmps-smm-ed", "Vmps-smm-eg", "Vmps-smm-ei", "Vmps-smm-en", "Vmps-smm-pa", + "Vmps-smm-pd", "Vmps-smm-pg", "Vmps-smm-pi", "Vmps-smm-pl", "Vmps-smm-pn", + "Vmps-smmfea", "Vmps-smmfeg", "Vmps-smmfei", "Vmps-smmfel", "Vmps-smmfen", + "Vmps-smmfpa", "Vmps-smmfpd", "Vmps-smmfpg", "Vmps-smmfpi", "Vmps-smmfpl", + "Vmps-smmfpn", "Vmps-smp-ea", "Vmps-smp-eg", "Vmps-smp-ei", "Vmps-smp-en", + "Vmps-smp-pa", "Vmps-smp-pd", "Vmps-smp-pg", "Vmps-smp-pi", "Vmps-smp-pl", + "Vmps-smp-pn", "Vmps-smpfea", "Vmps-smpfed", "Vmps-smpfeg", "Vmps-smpfei", + "Vmps-smpfel", "Vmps-smpfen", "Vmps-smpfpa", "Vmps-smpfpd", "Vmps-smpfpg", + "Vmps-smpfpi", "Vmps-smpfpl", "Vmps-smpfpn", "Vmps-smpse", "Vmps-smpsp", + "Vmps-sna-ea", "Vmps-sna-eg", "Vmps-sna-ei", "Vmps-sna-el", "Vmps-sna-en", + "Vmps-sna-p", "Vmps-sna-pa", "Vmps-sna-pd", "Vmps-sna-pg", "Vmps-sna-pi", + "Vmps-sna-pl", "Vmps-sna-pn", "Vmps-snafea", "Vmps-snafed", "Vmps-snafeg", + "Vmps-snafei", "Vmps-snafel", "Vmps-snafen", "Vmps-snafpa", "Vmps-snafpd", + "Vmps-snafpg", "Vmps-snafpi", "Vmps-snafpl", "Vmps-snafpn", "Vmps-snm-ea", + "Vmps-snm-eg", "Vmps-snm-en", "Vmps-snm-pa", "Vmps-snm-pg", "Vmps-snm-pi", + "Vmps-snm-pl", "Vmps-snm-pn", "Vmps-snmfea", "Vmps-snmfed", "Vmps-snmfeg", + "Vmps-snmfei", "Vmps-snmfel", "Vmps-snmfen", "Vmps-snmfpa", "Vmps-snmfpd", + "Vmps-snmfpg", "Vmps-snmfpi", "Vmps-snmfpl", "Vmps-snmfpn", "Vmps-snp-el", + "Vmps-snp-p", "Vmps-snp-pa", "Vmps-snp-pd", "Vmps-snp-pg", "Vmps-snp-pi", + "Vmps-snp-pl", "Vmps-snp-pn", "Vmps-snpfea", "Vmps-snpfeg", "Vmps-snpfen", + "Vmps-snpfpa", "Vmps-snpfpd", "Vmps-snpfpg", "Vmps-snpfpi", "Vmps-snpfpl", + "Vmps-snpfpn", "Vmps-snpse", "Vmps-snpsp" }; + + runTest("ru", "msd", tagset, "Это тест .", + new String[] { "это", "тест", "." }, + new String[] { "P--nsnn", "Ncmsnn", "SENT" }, + new String[] { "POS_PRON", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + @Ignore("Slovene model currently not in Artifactory because we do not know tagset yet") + public void testSlovene() + throws Exception + { + String[] tagset = { }; + + runTest("sl", null, tagset, "To je test .", + new String[] { "ta", "biti", "test", "." }, + new String[] { "zk-sei----s", "gvpste--n", "somei", "SENT" }, + new String[] { "POS", "POS", "POS", "POS" }); + + runTest("sl", null, tagset, "Gremo na Češko za kosilo .", + new String[] { "iti", "na", "Češko", "za", "kosilo", "." }, + new String[] { "gppspm--n-----d", "dpet", "slmei", "dpet", "soset", "SENT" }, + new String[] { "POS", "POS", "POS", "POS", "POS", "POS" }); + } + + @Test + public void testSlovak() + throws Exception + { + String[] tagset = { "!", "\"", "#", "%", "(", ")", ",", ".", "0", ":", ";", "?", "Apx", + "Apy", "Apz", "Asx", "Asy", "Asz", "Dx", "Dy", "Dz", "E", "Gpx", "Gpy", "Gpz", + "Gsx", "Gsy", "Gsz", "J", "ND", "Np", "Ns", "O", "OY", "PD", "Pp", "Ps", "Q", "R", + "Sp", "Ss", "T", "TY", "VBpa", "VBpb", "VBpc", "VBsa", "VBsb", "VBsc", "VH", "VI", + "VKpa", "VKpb", "VKpc", "VKsa", "VKsb", "VKsc", "VLpa", "VLpb", "VLpc", "VLsa", + "VLsb", "VLsc", "VMpa", "VMpb", "VMsb", "W", "Y", "Z", "par" }; + + runTest("sk", "smt-reduced", tagset, "To je test .", + new String[] { "to", "byť", "test", "." }, + new String[] { "Ps", "VKsc", "Ss", "." }, + new String[] { "POS_PRON", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); + } + + @Test + public void testChinese() throws Exception + { + String[] tagset = { "a", "ad", "ag", "an", "b", "bg", "c", "d", "dg", "e", "ew", "f", "g", + "h", "i", "j", "k", "l", "m", "mg", "n", "nd", "ng", "nh", "ni", "nl", "nr", "ns", + "nt", "nx", "nz", "o", "p", "q", "r", "rg", "s", "t", "tg", "u", "v", "vd", "vg", + "vn", "w", "wp", "ws", "x", "y", "z" }; + + // The rudder often in the wake of the wind round the back of the area. + runTest("zh", "lcmc", tagset, "尾 舵 常 处于 风轮 后面 的 尾流 区里 。", + new String[] { "_", "_", "_", "_", "风轮", "_", "_", "_", "_", "_" }, + new String[] { "ng", "n", "d", "v", "n", "f", "u", "n", "nl", "ew" }, + new String[] { "POS_NOUN", "POS_NOUN", "POS_ADV", "POS_VERB", "POS_NOUN", "POS", + "POS_AUX", "POS_NOUN", "POS", "POS_PUNCT" }); + + // The service sector has become an important engine of Guangdong's economic transformation + // and upgrading. + runTest("zh", "lcmc", tagset, "服务业 成为 广东 经济 转型 升级 的 重要 引擎 。", + new String[] { "_", "_", "_", "_", "_", "_", "_", "_", "_", "_" }, + new String[] { "n", "v", "ns", "n", "v", "v", "u", "a", "n", "ew" }, + new String[] { "POS_NOUN", "POS_VERB", "POS_PROPN", "POS_NOUN", "POS_VERB", "POS_VERB", + "POS_AUX", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); + + // How far is China from the world brand? + runTest("zh", "lcmc", tagset, "中国 离 世界 技术 品牌 有 多远 ?", + new String[] { "_", "_", "_", "_", "_", "_", "多远", "_" }, + new String[] { "ns", "v", "n", "n", "n", "v", "n", "ew" }, + new String[] { "POS_PROPN", "POS_VERB", "POS_NOUN", "POS_NOUN", "POS_NOUN", "POS_VERB", + "POS_NOUN", "POS_PUNCT" }); + } + + @Test +// @Ignore("Platform specific") + public void testOddCharacters() + throws Exception + { + runTest("en", null, null, "² § ¶ § °", + new String[] { "²", "§", "¶", "§", "°" }, + new String[] { "NN", "SYM", "NN", "SYM", "SYM" }, + new String[] { "POS_NOUN", "POS_SYM", "POS_NOUN", "POS_SYM", "POS_SYM" }); + } + + /** + * Generate a very large document and test it. + */ + @Test + @Ignore("Ignoring test to avoid memory errors (see issue #850 in GitHub") + public void hugeDocumentTest() + throws Exception + { + // Start Java with -Xmx512m + boolean run = Runtime.getRuntime().maxMemory() > (500000000); + if (!run) { + System.out.println("Test requires more heap than available, skipping"); + } + Assume.assumeTrue(run); + + // Disable trace as this significantly slows down the test + TreeTaggerWrapper.TRACE = false; + + String text = "This is a test ."; + int reps = 4000000 / text.length(); + String testString = repeat(text, " ", reps); + + JCas jcas = runTest("en", null, null, testString, null, null, null); + List<POS> actualTags = new ArrayList<POS>(select(jcas, POS.class)); + assertEquals(reps * 5, actualTags.size()); + + // test POS annotations + String[] expectedTags = { "DT", "VBZ", "DT", "NN", "SENT" }; + String[] expectedTagClasses = { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }; + + for (int i = 0; i < actualTags.size(); i++) { + POS posAnnotation = actualTags.get(i); + assertEquals("In position " + i, expectedTagClasses[i % 5], + posAnnotation.getType().getShortName()); + assertEquals("In position " + i, expectedTags[i % 5], posAnnotation.getPosValue()); + } + + System.out.println("Successfully tagged document with " + testString.length() + + " characters and " + actualTags.size() + " tokens"); + } + + /** + * Test using the same AnalysisEngine multiple times. + */ + @Test + public void multiDocumentTest() throws Exception + { + checkModelsAndBinary("en"); + + String testDocument = "This is a test ."; + String[] lemmas = { "this", "be", "a", "test", "." }; + String[] tags = { "DT", "VBZ", "DT", "NN", "SENT" }; + String[] tagClasses = { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }; + + AnalysisEngine engine = createEngine(TreeTaggerPosTagger.class); + + HideOutput hideOut = new HideOutput(); + try { + + for (int n = 0; n < 100; n++) { + JCas aJCas = TestRunner.runTest(engine, "en", testDocument); + + AssertAnnotations.assertPOS(tagClasses, tags, select(aJCas, POS.class)); + AssertAnnotations.assertLemma(lemmas, select(aJCas, Lemma.class)); + } + } + finally { + engine.destroy(); + hideOut.restoreOutput(); + } + } + + /** + * Run the {@link #hugeDocumentTest()} 100 times. + */ + @Test + @Ignore("This test takes a very long time. Only include it if you need to " + + "test the stability of the annotator") + public void loadTest() + throws Exception + { + for (int i = 0; i < 100; i++) { + System.out.println("Load test iteration " + i); + hugeDocumentTest(); + } + } + + private void checkModelsAndBinary(String lang) + { + Assume.assumeTrue( + getClass().getResource("/de/tudarmstadt/ukp/dkpro/core/treetagger/lib/tagger-" + + lang + "-le.bin") != null); + + Assume.assumeTrue(getClass().getResource( + "/de/tudarmstadt/ukp/dkpro/core/treetagger/bin/LICENSE.txt") != null || + System.getProperty("treetagger.home") != null); + } + + private JCas runTest(String language, String tagsetName, String[] tagset, String testDocument, + String[] lemmas, String[] tags, String[] tagClasses) + throws Exception + { + checkModelsAndBinary(language); + + AnalysisEngine engine = createEngine(TreeTaggerPosTagger.class, + TreeTaggerPosTagger.PARAM_PRINT_TAGSET, true); + + JCas aJCas = TestRunner.runTest(engine, language, testDocument); + + AssertAnnotations.assertLemma(lemmas, select(aJCas, Lemma.class)); + AssertAnnotations.assertPOS(tagClasses, tags, select(aJCas, POS.class)); + if (tagset != null) { + AssertAnnotations.assertTagset(POS.class, tagsetName, tagset, aJCas); + } + + return aJCas; + } + + /** + * Test using the same AnalysisEngine multiple times. + */ + @Test + public void longTokenTest() + throws Exception + { + checkModelsAndBinary("en"); + + AnalysisEngine engine = createEngine(TreeTaggerPosTagger.class); + JCas jcas = engine.newJCas(); + + try { + for (int n = 99990; n < 100000; n ++) { + System.out.println(n); + jcas.setDocumentLanguage("en"); + JCasBuilder builder = new JCasBuilder(jcas); + builder.add("Start", Token.class); + builder.add("with", Token.class); + builder.add("good", Token.class); + builder.add("tokens", Token.class); + builder.add(".", Token.class); + builder.add(StringUtils.repeat("b", n), Token.class); + builder.add("End", Token.class); + builder.add("with", Token.class); + builder.add("some", Token.class); + builder.add("good", Token.class); + builder.add("tokens", Token.class); + builder.add(".", Token.class); + builder.close(); + engine.process(jcas); + jcas.reset(); + } + } + finally { + engine.destroy(); + } + } + + /** + * Runs a small pipeline on a text containing quite odd characters such as + * Unicode LEFT-TO-RIGHT-MARKs. The BreakIteratorSegmenter creates tokens from these + * which are send to TreeTagger as tokens containing line breaks or only + * whitespace. TreeTaggerPosLemmaTT4J has to filter these tokens before + * they reach the TreeTaggerWrapper. + */ +// @Test +// public +// void testStrangeDocument() +// throws Exception +// { +// CollectionReader reader = createReader( +// FileSystemReader.class, +// createTypeSystemDescription(), +// FileSystemReader.PARAM_INPUTDIR, getTestResource( +// "test_files/annotator/TreeTaggerPosLemmaTT4J/strange")); +// +// AnalysisEngine sentenceSplitter = createEngine( +// BreakIteratorSegmenter.class, +// tsd); +// +// AnalysisEngine tt = createEngine(TreeTaggerPosLemmaTT4J.class, tsd, +// TreeTaggerTT4JBase.PARAM_LANGUAGE_CODE, "en"); +// +// runPipeline(reader, sentenceSplitter, tt); +// } + +// @Test +// @Ignore("This test should fail, however - due to fixes in the Tokenizer, " + +// "we can currently not provokate a failure with the given 'strange' " + +// "document.") +// public +// void testStrangeDocumentFail() +// throws Exception +// { +// CollectionReader reader = createReader( +// FileSystemReader.class, +// createTypeSystemDescription(), +// FileSystemReader.PARAM_INPUTDIR, getTestResource( +// "test_files/annotator/TreeTaggerPosLemmaTT4J/strange")); +// +// AnalysisEngine sentenceSplitter = createEngine( +// BreakIteratorSegmenter.class, +// tsd); +// +// AnalysisEngine tt = createEngine(TreeTaggerPosLemmaTT4J.class, tsd, +// TreeTaggerTT4JBase.PARAM_LANGUAGE_CODE, "en", +// TreeTaggerTT4JBase.PARAM_PERFORMANCE_MODE, true); +// +// runPipeline( +// reader, +// sentenceSplitter, +// tt); +// } + + /** + * When running this test, check manually if TreeTagger is restarted + * between the documents. If you jank up the log levels, that should be + * visible on the console. Unfortunately we cannot easily access the + * restartCount of the TreeTaggerWrapper. + */ +// @Test +// public +// void testRealMultiDocument() +// throws Exception +// { +// CollectionReader reader = createReader( +// FileSystemReader.class, +// createTypeSystemDescription(), +// FileSystemReader.PARAM_INPUTDIR, getTestResource( +// "test_files/annotator/TreeTaggerPosLemmaTT4J/multiDoc")); +// +// AnalysisEngine sentenceSplitter = createEngine( +// BreakIteratorSegmenter.class, +// tsd); +// +// AnalysisEngine tt = createEngine(TreeTaggerPosLemmaTT4J.class, tsd, +// TreeTaggerTT4JBase.PARAM_LANGUAGE_CODE, "en"); +// +// runPipeline( +// reader, +// sentenceSplitter, +// tt); +// } + + /* + * Uncomment to test explicitly setting model/binary locations + */ +// @Test +// public void testExplicitBinaryModel() throws Exception +// { +// AnalysisEngine tt = createEngine(TreeTaggerPosTagger.class, +// TreeTaggerPosTagger.PARAM_EXECUTABLE_PATH, +// "/Applications/tree-tagger-MacOSX-3.2-intel/bin/tree-tagger", +// TreeTaggerPosTagger.PARAM_MODEL_LOCATION, +// "/Applications/tree-tagger-MacOSX-3.2-intel/models/german-par-linux-3.2-utf8.bin", +// TreeTaggerPosTagger.PARAM_MODEL_ENCODING, "UTF-8"); +// +// JCas jcas = JCasFactory.createJCas(); +// jcas.setDocumentLanguage("de"); +// +// TokenBuilder<Token, Sentence> tb = new TokenBuilder<Token, Sentence>(Token.class, +// Sentence.class); +// tb.buildTokens(jcas, "Dies ist ein test ."); +// +// tt.process(jcas); +// } + + @Rule + public DkproTestContext testContext = new DkproTestContext(); +} diff --git a/dkpro-core-treetagger-asl/src/test/resources/log4j.properties b/dkpro-core-treetagger-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-treetagger-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-treetagger-asl/src/test/resources/log4j2.xml b/dkpro-core-treetagger-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-treetagger-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/dkpro-core-udpipe-asl/pom.xml b/dkpro-core-udpipe-asl/pom.xml index adbb22b8d4..919c6c3b68 100644 --- a/dkpro-core-udpipe-asl/pom.xml +++ b/dkpro-core-udpipe-asl/pom.xml @@ -15,19 +15,18 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core-asl</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-asl</artifactId> + <version>2.3.0-SNAPSHOT</version> <relativePath>../dkpro-core-asl</relativePath> </parent> - <groupId>org.dkpro.core</groupId> <artifactId>dkpro-core-udpipe-asl</artifactId> <packaging>jar</packaging> <name>DKPro Core ASL - UDPipe (v ${udpipe.version})</name> + <url>https://dkpro.github.io/dkpro-core/</url> <properties> <udpipe.version>1.1.0</udpipe.version> </properties> @@ -50,38 +49,42 @@ <artifactId>commons-lang3</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.syntax-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-syntax-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-lexmorph-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.parameter-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.resources-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-resources-asl</artifactId> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-api-segmentation-asl</artifactId> </dependency> <dependency> <groupId>org.dkpro.core</groupId> <artifactId>dkpro-core-udpipe-bin</artifactId> <version>20170425.0</version> </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core.testing-asl</artifactId> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-testing-asl</artifactId> <scope>test</scope> </dependency> <dependency> diff --git a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipeParser.java b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipeParser.java index 151062e3c6..1f77db3fe4 100644 --- a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipeParser.java +++ b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipeParser.java @@ -19,12 +19,12 @@ import static org.apache.uima.fit.util.JCasUtil.indexCovered; import static org.apache.uima.fit.util.JCasUtil.select; +import static org.dkpro.core.api.resources.MappingProviderFactory.createDependencyMappingProvider; import java.io.File; import java.io.IOException; import java.net.URL; import java.util.ArrayList; -import java.util.Collection; import java.util.List; import java.util.Map; @@ -37,19 +37,21 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; import org.dkpro.core.udpipe.internal.DKPro2UDPipe; import org.dkpro.core.udpipe.internal.UDPipe2DKPro; import org.dkpro.core.udpipe.internal.UDPipeUtils; import cz.cuni.mff.ufal.udpipe.Model; import cz.cuni.mff.ufal.udpipe.ProcessingError; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Dependency parser using UDPipe. UDPipe uses Parsito, a greedy transition-based parser @@ -57,7 +59,9 @@ * * @see <a href="https://github.com/ufal/udpipe/tree/master/src/parsito">Parsito in UDPipe</a> */ -@ResourceMetaData(name="UDPipe Parsito Dependency Parser") +@Component(OperationType.DEPENDENCY_PARSER) +@ResourceMetaData(name = "UDPipe Parsito Dependency Parser") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -84,6 +88,20 @@ public class UDPipeParser @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the model automatically. */ @@ -91,24 +109,23 @@ public class UDPipeParser @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the dependency to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; + public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = + ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) protected String dependencyMappingLocation; - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - private ModelProviderBase<Model> modelProvider; private MappingProvider mappingProvider; @@ -142,8 +159,8 @@ protected Model produceResource(URL aUrl) } }; - mappingProvider = MappingProviderFactory.createDependencyMappingProvider( - dependencyMappingLocation, language, modelProvider); + mappingProvider = createDependencyMappingProvider(this, dependencyMappingLocation, language, + modelProvider); } @Override @@ -159,7 +176,7 @@ public void process(JCas aJCas) // model metadata mappingProvider.configure(cas); - Map<Sentence, Collection<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); + Map<Sentence, List<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); for (Sentence sentence : select(aJCas, Sentence.class)) { List<Token> tokens = new ArrayList<>(index.get(sentence)); @@ -173,7 +190,7 @@ public void process(JCas aJCas) new IllegalStateException(error.getMessage())); } - UDPipe2DKPro.convertParse(udSent, tokens, aJCas, mappingProvider, internTags); + UDPipe2DKPro.convertParse(udSent, tokens, aJCas, mappingProvider); } } } diff --git a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipePosTagger.java b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipePosTagger.java index bc0e6ca271..cfa346fcba 100644 --- a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipePosTagger.java +++ b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipePosTagger.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.net.URL; import java.util.Collection; +import java.util.List; import java.util.Map; import org.apache.uima.UimaContext; @@ -35,19 +36,22 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.MappingProvider; +import org.dkpro.core.api.resources.MappingProviderFactory; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; import org.dkpro.core.udpipe.internal.DKPro2UDPipe; import org.dkpro.core.udpipe.internal.UDPipe2DKPro; import org.dkpro.core.udpipe.internal.UDPipeUtils; import cz.cuni.mff.ufal.udpipe.Model; import cz.cuni.mff.ufal.udpipe.ProcessingError; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import eu.openminted.share.annotations.api.Component; +import eu.openminted.share.annotations.api.DocumentationResource; +import eu.openminted.share.annotations.api.constants.OperationType; /** * Part-of-Speech, lemmatizer, and morphological analyzer using UDPipe. UDPipe uses MorphoDiTa for @@ -55,7 +59,9 @@ * * @see <a href="https://github.com/ufal/udpipe/tree/master/src/morphodita">MorphoDiTa in UDPipe</a> */ -@ResourceMetaData(name="UDPipe MorphoDiTa Morphological Analyzer") +@Component(OperationType.PART_OF_SPEECH_TAGGER) +@ResourceMetaData(name = "UDPipe MorphoDiTa Morphological Analyzer") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -81,6 +87,20 @@ public class UDPipePosTagger @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the model automatically. */ @@ -88,24 +108,23 @@ public class UDPipePosTagger @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; + /** + * Enable/disable type mapping. + */ + public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; + @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = + ComponentParameters.DEFAULT_MAPPING_ENABLED) + protected boolean mappingEnabled; + /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ - public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; + public static final String PARAM_POS_MAPPING_LOCATION = + ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; - /** - * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid - * spaming the heap with thousands of strings representing only a few different tags. - * - * Default: {@code true} - */ - public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; - @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") - private boolean internTags; - private ModelProviderBase<Model> modelProvider; private MappingProvider mappingProvider; @@ -139,7 +158,7 @@ protected Model produceResource(URL aUrl) } }; - mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, + mappingProvider = MappingProviderFactory.createPosMappingProvider(this, posMappingLocation, language, modelProvider); } @@ -156,7 +175,7 @@ public void process(JCas aJCas) // model metadata mappingProvider.configure(cas); - Map<Sentence, Collection<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); + Map<Sentence, List<Token>> index = indexCovered(aJCas, Sentence.class, Token.class); for (Sentence sentence : select(aJCas, Sentence.class)) { Collection<Token> tokens = index.get(sentence); @@ -170,7 +189,7 @@ public void process(JCas aJCas) new IllegalStateException(error.getMessage())); } - UDPipe2DKPro.convertPosLemmaMorph(udSent, tokens, aJCas, mappingProvider, internTags); + UDPipe2DKPro.convertPosLemmaMorph(udSent, tokens, aJCas, mappingProvider); } } } diff --git a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipeSegmenter.java b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipeSegmenter.java index 72cc5df4ee..3c4f0b2a2a 100644 --- a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipeSegmenter.java +++ b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/UDPipeSegmenter.java @@ -28,22 +28,23 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.api.parameter.ComponentParameters; +import org.dkpro.core.api.resources.ModelProviderBase; +import org.dkpro.core.api.resources.ResourceUtils; +import org.dkpro.core.api.segmentation.SegmenterBase; import org.dkpro.core.udpipe.internal.UDPipeUtils; import cz.cuni.mff.ufal.udpipe.InputFormat; import cz.cuni.mff.ufal.udpipe.Model; import cz.cuni.mff.ufal.udpipe.Word; import cz.cuni.mff.ufal.udpipe.Words; -import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; -import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; +import eu.openminted.share.annotations.api.DocumentationResource; /** - * Tokenizer and sentence splitter using OpenNLP. - * + * Tokenizer and sentence splitter using UDPipe. */ -@ResourceMetaData(name="UDPipe Segmenter") +@ResourceMetaData(name = "UDPipe Segmenter") +@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", @@ -65,6 +66,20 @@ public class UDPipeSegmenter @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; + /** + * URI of the model artifact. This can be used to override the default model resolving + * mechanism and directly address a particular model. + * + * <p>The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set + * the variant parameter to match the artifact. If the artifact contains the model in + * a non-default location, you also have to specify the model location parameter, e.g. + * {@code classpath:/model/path/in/artifact/model.bin}.</p> + */ + public static final String PARAM_MODEL_ARTIFACT_URI = + ComponentParameters.PARAM_MODEL_ARTIFACT_URI; + @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) + protected String modelArtifactUri; + /** * Load the model from this location instead of locating the model automatically. */ diff --git a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/DKPro2UDPipe.java b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/DKPro2UDPipe.java index 51dc7695e6..11faf9dcc9 100644 --- a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/DKPro2UDPipe.java +++ b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/DKPro2UDPipe.java @@ -31,18 +31,19 @@ public static void convert(Collection<Token> tokens, Sentence sentence) Word w = sentence.addWord(t.getText()); if (t.getPos() != null) { w.setXpostag(t.getPosValue()); - if (t.getPos().getCoarseValue()==null || t.getPos().getCoarseValue().trim().length()==0) { + if (t.getPos().getCoarseValue() == null + || t.getPos().getCoarseValue().trim().length() == 0) { w.setUpostag(t.getPos().getPosValue()); } else { w.setUpostag(t.getPos().getCoarseValue()); } } - + if (t.getLemma() != null) { w.setLemma(t.getLemmaValue()); } - + if (t.getMorph() != null) { w.setFeats(t.getMorph().getValue()); } diff --git a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/UDPipe2DKPro.java b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/UDPipe2DKPro.java index 5ab7024b71..12f874c217 100644 --- a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/UDPipe2DKPro.java +++ b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/UDPipe2DKPro.java @@ -24,13 +24,13 @@ import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.lexmorph.pos.POSUtils; +import org.dkpro.core.api.resources.MappingProvider; import cz.cuni.mff.ufal.udpipe.Sentence; import cz.cuni.mff.ufal.udpipe.Word; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.pos.POSUtils; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; @@ -40,7 +40,7 @@ public class UDPipe2DKPro { public static void convertPosLemmaMorph(Sentence sentence, Collection<Token> tokens, JCas aJCas, - MappingProvider mappingProvider, boolean internTags) + MappingProvider mappingProvider) { CAS cas = aJCas.getCas(); @@ -52,20 +52,21 @@ public static void convertPosLemmaMorph(Sentence sentence, Collection<Token> tok // For Norwegian xtag is not provided. It is a blank string. // So the value of Utag is used as an replacement. - if (xtag.length() == 0 && utag.length() > 0) + if (xtag.length() == 0 && utag.length() > 0) { xtag = utag; + } // Convert the tag produced by the tagger to an UIMA type, create an annotation // of this type, and add it to the document. Type posTag = mappingProvider.getTagType(xtag); POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); // To save memory, we typically intern() tag strings - posAnno.setPosValue(internTags ? xtag.intern() : xtag); + posAnno.setPosValue(xtag != null ? xtag.intern() : null); if (utag == null) { POSUtils.assignCoarseValue(posAnno); } else { - posAnno.setCoarseValue(internTags ? utag.intern() : utag); + posAnno.setCoarseValue(utag != null ? utag.intern() : null); } posAnno.addToIndexes(); @@ -80,7 +81,8 @@ public static void convertPosLemmaMorph(Sentence sentence, Collection<Token> tok } if (StringUtils.isNotBlank(w.getForm())) { - MorphologicalFeatures morph = new MorphologicalFeatures(aJCas, t.getBegin(), t.getEnd()); + MorphologicalFeatures morph = new MorphologicalFeatures(aJCas, t.getBegin(), + t.getEnd()); morph.setValue(w.getFeats()); morph.addToIndexes(); t.setMorph(morph); @@ -91,7 +93,7 @@ public static void convertPosLemmaMorph(Sentence sentence, Collection<Token> tok } public static void convertParse(Sentence sentence, List<Token> tokens, JCas aJCas, - MappingProvider mappingProvider, boolean internTags) + MappingProvider mappingProvider) { for (int i = 1; i < sentence.getWords().size(); i++) { Word w = sentence.getWords().get(i); diff --git a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/UDPipeUtils.java b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/UDPipeUtils.java index e058822990..968507d3f5 100644 --- a/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/UDPipeUtils.java +++ b/dkpro-core-udpipe-asl/src/main/java/org/dkpro/core/udpipe/internal/UDPipeUtils.java @@ -18,13 +18,15 @@ package org.dkpro.core.udpipe.internal; import java.io.IOException; + +import org.dkpro.core.api.resources.RuntimeProvider; + import cz.cuni.mff.ufal.udpipe.udpipe_java; -import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider; public class UDPipeUtils { private static boolean initialized = false; - + public static void init() throws IOException { if (initialized) { diff --git a/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipeParserTest.java b/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipeParserTest.java index 8852bb8355..65f3524409 100644 --- a/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipeParserTest.java +++ b/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipeParserTest.java @@ -23,14 +23,14 @@ import org.apache.commons.lang3.ArrayUtils; import org.apache.uima.fit.factory.AggregateBuilder; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class UDPipeParserTest { diff --git a/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipePosTaggerTest.java b/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipePosTaggerTest.java index d019581f1a..9bdd90227c 100644 --- a/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipePosTaggerTest.java +++ b/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipePosTaggerTest.java @@ -18,19 +18,18 @@ package org.dkpro.core.udpipe; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; - import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.AssumeResource; +import org.dkpro.core.testing.DkproTestContext; +import org.dkpro.core.testing.TestRunner; import org.junit.Rule; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.AssumeResource; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; -import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner; public class UDPipePosTaggerTest { diff --git a/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipeSegmenterTest.java b/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipeSegmenterTest.java index ce1649227d..093999e1f9 100644 --- a/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipeSegmenterTest.java +++ b/dkpro-core-udpipe-asl/src/test/java/org/dkpro/core/udpipe/UDPipeSegmenterTest.java @@ -18,19 +18,16 @@ package org.dkpro.core.udpipe; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; - - import static org.apache.uima.fit.util.JCasUtil.select; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.dkpro.core.testing.AssertAnnotations; +import org.dkpro.core.testing.DkproTestContext; import org.junit.Rule; import org.junit.Test; -import de.tudarmstadt.ukp.dkpro.core.testing.AssertAnnotations; -import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; - import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; @@ -42,29 +39,14 @@ public void testNorwegian() throws Exception { runTest("no", null, "Storbritannia drøyer ikke. Storbritannia starter den formelle prosessen for utmelding av EU 29. mars, opplyser statsminister Theresa Mays kontor.", - new String[] { "Storbritannia drøyer ikke.", "Storbritannia starter den formelle prosessen for utmelding av EU 29. mars, opplyser statsminister Theresa Mays kontor."}, - new String[] { "Storbritannia", - "drøyer", - "ikke", - ".", - "Storbritannia", - "starter", - "den", - "formelle", - "prosessen", - "for", - "utmelding", - "av", - "EU", - "29.", - "mars", - ",", - "opplyser", - "statsminister", - "Theresa", - "Mays", - "kontor", - "."}); + new String[] { + "Storbritannia drøyer ikke.", + "Storbritannia starter den formelle prosessen for utmelding av EU 29. " + + "mars, opplyser statsminister Theresa Mays kontor." }, + new String[] { "Storbritannia", "drøyer", "ikke", ".", "Storbritannia", "starter", + "den", "formelle", "prosessen", "for", "utmelding", "av", "EU", "29.", + "mars", ",", "opplyser", "statsminister", "Theresa", "Mays", "kontor", + "." }); } @@ -72,14 +54,17 @@ public void testNorwegian() public void testEnglish() throws Exception { - runTest("en", null, "Good morning Mr. President. I would love to welcome you to S.H.I.E.L.D. 2.0.", - new String[] { "Good morning Mr. President.", "I would love to welcome you to S.H.I.E.L.D. 2.0."}, - new String[] { "Good", "morning", "Mr.", "President", ".", "I", "would", "love", "to", "welcome", "you", "to", - "S.H.I.E.L.D.", "2.0","."}); + runTest("en", null, + "Good morning Mr. President. I would love to welcome you to S.H.I.E.L.D. 2.0.", + new String[] { "Good morning Mr. President.", + "I would love to welcome you to S.H.I.E.L.D. 2.0." }, + new String[] { "Good", "morning", "Mr.", "President", ".", "I", "would", "love", + "to", "welcome", "you", "to", "S.H.I.E.L.D.", "2.0", "." }); } - private void runTest(String language, String aVariant,String testDocument, String[] sExpected, String[] tExpected) + private void runTest(String language, String aVariant, String testDocument, String[] sExpected, + String[] tExpected) throws Exception { String variant = aVariant != null ? aVariant : "ud"; diff --git a/dkpro-core-udpipe-asl/src/test/resources/log4j.properties b/dkpro-core-udpipe-asl/src/test/resources/log4j.properties deleted file mode 100644 index 9ef9876f5c..0000000000 --- a/dkpro-core-udpipe-asl/src/test/resources/log4j.properties +++ /dev/null @@ -1,7 +0,0 @@ -log4j.rootLogger=WARN,development - -log4j.appender.development=org.apache.log4j.ConsoleAppender -log4j.appender.development.layout=org.apache.log4j.PatternLayout -log4j.appender.development.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %5p [%t] (%C{1}) - %m%n - -log4j.logger.de.tudarmstadt.ukp = DEBUG diff --git a/dkpro-core-udpipe-asl/src/test/resources/log4j2.xml b/dkpro-core-udpipe-asl/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..19bf03b585 --- /dev/null +++ b/dkpro-core-udpipe-asl/src/test/resources/log4j2.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<Configuration status="WARN"> + <Appenders> + <Console name="ConsoleAppender" target="SYSTEM_OUT"> + <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} %level{length=5} %logger{1} - %msg%n" /> + </Console> + </Appenders> + + <Loggers> + <Logger name="org.dkpro.core" level="DEBUG"/> + <Root level="WARN"> + <AppenderRef ref="ConsoleAppender" /> + </Root> + </Loggers> +</Configuration> diff --git a/pom.xml b/pom.xml index 60f96d3334..09c8f0c131 100644 --- a/pom.xml +++ b/pom.xml @@ -15,20 +15,20 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> <groupId>org.dkpro</groupId> <artifactId>dkpro-parent-pom</artifactId> - <version>17</version> + <version>25</version> </parent> - <groupId>de.tudarmstadt.ukp.dkpro.core</groupId> - <artifactId>de.tudarmstadt.ukp.dkpro.core</artifactId> - <version>1.10.0-SNAPSHOT</version> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core</artifactId> + <version>2.3.0-SNAPSHOT</version> <packaging>pom</packaging> <name>DKPro Core</name> - <description>DKPro Core is a collection of software components for natural language processing (NLP) based on the Apache UIMA framework.</description> <!-- The description tag must be present for antrun to work!! --> + <!-- The description tag must be present for antrun to work!! --> + <description>DKPro Core is a collection of software components for natural language processing (NLP) based on the Apache UIMA framework.</description> <url>https://dkpro.github.io/dkpro-core/</url> <organization> <name>Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt</name> @@ -36,117 +36,130 @@ </organization> <inceptionYear>2007</inceptionYear> <properties> - <currentYear>2017</currentYear> + <currentYear>2019</currentYear> <maven.build.timestamp.format>yyyy-MM-dd HH:mm</maven.build.timestamp.format> - <uima.version>2.10.2</uima.version> - <uimafit.version>2.4.0</uimafit.version> - <uimafit.plugin.version>2.4.0</uimafit.plugin.version> + <uima.version>3.2.0</uima.version> + <uimafit.version>3.2.0</uimafit.version> + <uimafit.plugin.version>${uimafit.version}</uimafit.plugin.version> + <omtd.version>3.0.2.7</omtd.version> <lucene.version>4.4.0</lucene.version> <!-- The Spring version should be at least whatever uimaFIT requires --> - <spring.version>3.2.16.RELEASE</spring.version> - <slf4j.version>1.7.25</slf4j.version> + <spring.version>4.3.30.RELEASE</spring.version> + <slf4j.version>1.7.30</slf4j.version> + <log4j.version>2.16.0</log4j.version> + <maven.surefire.heap>6g</maven.surefire.heap> + <activation.version>1.2.0</activation.version> + <icu4j.version>68.2</icu4j.version> + <snakeyaml.version>1.27</snakeyaml.version> + <groovy.version>3.0.7</groovy.version> + <jackson.version>2.12.1</jackson.version> + <jena.version>3.17.0</jena.version> </properties> <repositories> <repository> <id>ukp-oss-model-releases</id> - <url>http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local</url> + <url>https://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local</url> <snapshots> <enabled>false</enabled> </snapshots> </repository> - <repository> - <id>jcenter</id> - <url>http://jcenter.bintray.com</url> - <snapshots> - <enabled>false</enabled> - </snapshots> - </repository> - <!-- For UIMA/uimaFIT RCs --> - <!-- - <repository> - <id>ext-staging</id> - <name>Staging repo</name> - <url>https://oss.sonatype.org/content/repositories/orgdkpro-1027</url> - <releases> - <enabled>true</enabled> - </releases> - </repository> - --> - <!-- For UIMA/uimaFIT SNAPSHOT --> - <!-- - <repository> - <id>apache.snapshots</id> - <name>Apache Snapshot Repository</name> - <url>http://repository.apache.org/snapshots</url> - <releases> - <enabled>false</enabled> - </releases> - <snapshots> - <enabled>true</enabled> - </snapshots> - </repository> - --> - <!-- For SNAPSHOTs from the DKPro family --> - <!-- - <repository> - <id>ukp-oss-snapshots</id> - <url>http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-snapshots</url> - <releases> - <enabled>false</enabled> - </releases> - <snapshots> + <!-- For UIMA/uimaFIT RCs --> + <!-- + <repository> + <id>ext-staging</id> + <name>Staging repo</name> + <url>https://repository.apache.org/content/repositories/orgapacheuima-1205/</url> + <releases> <enabled>true</enabled> - </snapshots> - </repository> - --> + </releases> + </repository> + --> + <!-- For UIMA/uimaFIT SNAPSHOT + <repository> + <id>apache.snapshots</id> + <name>Apache Snapshot Repository</name> + <url>https://repository.apache.org/snapshots</url> + <releases> + <enabled>false</enabled> + </releases> + <snapshots> + <enabled>true</enabled> + </snapshots> + </repository> + --> + <!-- For SNAPSHOTs from the DKPro family --> + <!-- + <repository> + <id>ukp-oss-snapshots</id> + <url>https://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-snapshots</url> + <releases> + <enabled>false</enabled> + </releases> + <snapshots> + <enabled>true</enabled> + </snapshots> + </repository> + --> </repositories> <pluginRepositories> <!-- For UIMA/uimaFIT RCs --> - <!-- + <!-- <pluginRepository> - <id>ext-staging</id> - <name>Staging repo</name> - <url>https://repository.apache.org/content/repositories/orgapacheuima-1134</url> - <releases> - <enabled>true</enabled> - </releases> + <id>ext-staging</id> + <name>Staging repo</name> + <url>https://repository.apache.org/content/repositories/orgapacheuima-1205/</url> + <releases> + <enabled>true</enabled> + </releases> </pluginRepository> --> <!-- For UIMA/uimaFIT SNAPSHOT maven plugin --> - <!-- <pluginRepository> <id>apache.snapshots</id> <name>Apache Snapshot Repository</name> - <url>http://repository.apache.org/snapshots</url> + <url>https://repository.apache.org/snapshots</url> + <releases> + <enabled>false</enabled> + </releases> + <snapshots> + <enabled>true</enabled> + </snapshots> + </pluginRepository> + <!-- For SNAPSHOTs from OpenMinTeD --> + <!-- + <pluginRepository> + <id>omtd-snapshots</id> + <layout>default</layout> + <url>https://repo.openminted.eu/content/repositories/snapshots</url> <releases> <enabled>false</enabled> </releases> - <snapshots> - <enabled>true</enabled> - </snapshots> + <snapshots> + <enabled>true</enabled> + </snapshots> </pluginRepository> --> </pluginRepositories> - <mailingLists> - <mailingList> - <name>DKPro Core user mailing list</name> - <archive>https://groups.google.com/forum/#!forum/dkpro-core-user</archive> - <post>dkpro-core-user@googlegroups.com</post> - <subscribe>dkpro-core-user+subscribe@googlegroups.com</subscribe> - <unsubscribe>dkpro-core-user+unsubscribe@googlegroups.com</unsubscribe> - </mailingList> - <mailingList> - <name>DKPro Core developer mailing list</name> - <archive>https://groups.google.com/forum/#!forum/dkpro-core-developers</archive> - <post>dkpro-core-developers@googlegroups.com</post> - <subscribe>dkpro-core-developers+subscribe@googlegroups.com</subscribe> - <unsubscribe>dkpro-core-developers+unsubscribe@googlegroups.com</unsubscribe> - </mailingList> - </mailingLists> - <ciManagement> - <system>Jenkins</system> - <url>https://zoidberg.ukp.informatik.tu-darmstadt.de/jenkins/</url> - </ciManagement> + <mailingLists> + <mailingList> + <name>DKPro Core user mailing list</name> + <archive>https://groups.google.com/forum/#!forum/dkpro-core-user</archive> + <post>dkpro-core-user@googlegroups.com</post> + <subscribe>dkpro-core-user+subscribe@googlegroups.com</subscribe> + <unsubscribe>dkpro-core-user+unsubscribe@googlegroups.com</unsubscribe> + </mailingList> + <mailingList> + <name>DKPro Core developer mailing list</name> + <archive>https://groups.google.com/forum/#!forum/dkpro-core-developers</archive> + <post>dkpro-core-developers@googlegroups.com</post> + <subscribe>dkpro-core-developers+subscribe@googlegroups.com</subscribe> + <unsubscribe>dkpro-core-developers+unsubscribe@googlegroups.com</unsubscribe> + </mailingList> + </mailingLists> + <ciManagement> + <system>Jenkins</system> + <url>https://zoidberg.ukp.informatik.tu-darmstadt.de/jenkins/</url> + </ciManagement> <issueManagement> <system>GitHub Issues</system> <url>https://github.com/dkpro/dkpro-core/issues</url> @@ -158,6 +171,7 @@ <tag>HEAD</tag> </scm> <modules> + <module>dkpro-core-build</module> <module>dkpro-core-asl</module> <module>dkpro-core-gpl</module> <!-- Documentation and examples modules --> @@ -168,7 +182,12 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.12</version> + <version>4.13.1</version> + </dependency> + <dependency> + <groupId>org.assertj</groupId> + <artifactId>assertj-core</artifactId> + <version>3.18.1</version> </dependency> <dependency> <groupId>org.apache.uima</groupId> @@ -208,12 +227,12 @@ <dependency> <groupId>org.yaml</groupId> <artifactId>snakeyaml</artifactId> - <version>1.19</version> + <version>${snakeyaml.version}</version> </dependency> <dependency> <groupId>xerces</groupId> <artifactId>xercesImpl</artifactId> - <version>2.9.1</version> + <version>2.12.0</version> </dependency> <dependency> <groupId>xalan</groupId> @@ -225,6 +244,11 @@ <artifactId>serializer</artifactId> <version>2.7.2</version> </dependency> + <dependency> + <groupId>xml-apis</groupId> + <artifactId>xml-apis</artifactId> + <version>1.4.01</version> + </dependency> <dependency> <groupId>stax</groupId> <artifactId>stax-api</artifactId> @@ -248,7 +272,7 @@ <dependency> <groupId>com.ibm.icu</groupId> <artifactId>icu4j</artifactId> - <version>60.1</version> + <version>${icu4j.version}</version> </dependency> <dependency> <groupId>commons-logging</groupId> @@ -278,17 +302,17 @@ <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-collections4</artifactId> - <version>4.1</version> + <version>4.4</version> </dependency> <dependency> <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> - <version>1.11</version> + <version>1.15</version> </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> - <version>2.6</version> + <version>2.11.0</version> </dependency> <dependency> <groupId>commons-lang</groupId> @@ -298,32 +322,27 @@ <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> - <version>3.7</version> + <version>3.12.0</version> </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> - <version>1.15</version> + <version>1.21</version> </dependency> <dependency> <groupId>org.tukaani</groupId> <artifactId>xz</artifactId> - <version>1.6</version> - </dependency> - <dependency> - <groupId>com.github.haifengl</groupId> - <artifactId>smile-nlp</artifactId> - <version>1.3.1</version> + <version>1.8</version> </dependency> <dependency> <groupId>org.apache.ant</groupId> <artifactId>ant</artifactId> - <version>1.10.1</version> + <version>1.10.12</version> </dependency> <dependency> <groupId>jaxen</groupId> <artifactId>jaxen</artifactId> - <version>1.1.1</version> + <version>1.2.0</version> <exclusions> <exclusion> <artifactId>dom4j</artifactId> @@ -340,9 +359,14 @@ </exclusions> </dependency> <dependency> - <groupId>dom4j</groupId> + <groupId>org.dom4j</groupId> <artifactId>dom4j</artifactId> - <version>1.6.1</version> + <version>2.1.3</version> + </dependency> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>30.1-jre</version> </dependency> <dependency> <groupId>org.springframework</groupId> @@ -380,9 +404,19 @@ <version>1.4.0</version> </dependency> <dependency> - <groupId>log4j</groupId> - <artifactId>log4j</artifactId> - <version>1.2.17</version> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-api</artifactId> + <version>${log4j.version}</version> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <version>${log4j.version}</version> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j-impl</artifactId> + <version>${log4j.version}</version> </dependency> <dependency> <groupId>org.slf4j</groupId> @@ -394,30 +428,117 @@ <artifactId>slf4j-api</artifactId> <version>${slf4j.version}</version> </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>log4j-over-slf4j</artifactId> + <version>${slf4j.version}</version> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>jcl-over-slf4j</artifactId> + <version>${slf4j.version}</version> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>jul-to-slf4j</artifactId> + <version>${slf4j.version}</version> + </dependency> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>1.2</version> + </dependency> <dependency> <groupId>xmlunit</groupId> <artifactId>xmlunit</artifactId> <version>1.6</version> </dependency> - <dependency> - <groupId>org.hamcrest</groupId> - <artifactId>hamcrest-core</artifactId> - <version>1.3</version> - </dependency> <dependency> <groupId>it.unimi.dsi</groupId> <artifactId>fastutil</artifactId> - <version>7.0.9</version> + <version>8.4.4</version> </dependency> <dependency> <groupId>org.apache.ivy</groupId> <artifactId>ivy</artifactId> - <version>2.4.0</version> + <version>2.5.0</version> + </dependency> + <dependency> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-api</artifactId> + <version>${omtd.version}</version> + </dependency> + <dependency> + <groupId>javax.xml.bind</groupId> + <artifactId>jaxb-api</artifactId> + <version>2.3.1</version> + </dependency> + <dependency> + <groupId>com.sun.xml.bind</groupId> + <artifactId>jaxb-core</artifactId> + <version>2.3.0.1</version> + </dependency> + <dependency> + <groupId>com.sun.xml.bind</groupId> + <artifactId>jaxb-impl</artifactId> + <version>2.3.2</version> + </dependency> + <dependency> + <groupId>javax.activation</groupId> + <artifactId>javax.activation-api</artifactId> + <version>1.2.0</version> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson</groupId> + <artifactId>jackson-bom</artifactId> + <version>${jackson.version}</version> + <type>pom</type> + <scope>import</scope> + </dependency> + <dependency> + <groupId>org.codehaus.groovy</groupId> + <artifactId>groovy-bom</artifactId> + <version>${groovy.version}</version> + <type>pom</type> + <scope>import</scope> </dependency> </dependencies> </dependencyManagement> <build> <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <executions> + <execution> + <id>default</id> + <phase>verify</phase> + <goals> + <goal>analyze-only</goal> + </goals> + </execution> + </executions> + <configuration> + <failOnWarning>true</failOnWarning> + <usedDependencies combine.children="append"> + <!-- + - Logging is used via reflection and cannot be detected by Maven + <usedDependency>org.apache.logging.log4j:log4j-slf4j-impl</usedDependency> + <usedDependency>org.apache.logging.log4j:log4j-core</usedDependency> + <usedDependency>org.slf4j:log4j-over-slf4j</usedDependency> + <usedDependency>org.slf4j:jcl-over-slf4j</usedDependency> + <usedDependency>commons-logging:commons-logging</usedDependency> + --> + <!-- + - JAXB is used via reflection and cannot be detected by Maven + <usedDependency>javax.xml.bind:jaxb-api</usedDependency> + <usedDependency>com.sun.xml.bind:jaxb-core</usedDependency> + <usedDependency>com.sun.xml.bind:jaxb-impl</usedDependency> + <usedDependency>javax.activation:javax.activation-api</usedDependency> + --> + </usedDependencies> + </configuration> + </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-javadoc-plugin</artifactId> @@ -473,38 +594,139 @@ <plugin> <groupId>org.apache.uima</groupId> <artifactId>uimafit-maven-plugin</artifactId> + <configuration> + <componentVendor>DKPro Core Project</componentVendor> + <componentCopyright> + Copyright ${project.inceptionYear}-${currentYear} + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt + </componentCopyright> + <failOnMissingMetaData>true</failOnMissingMetaData> + </configuration> + <executions> + <execution> + <id>default</id> + <phase>process-classes</phase> + <goals> + <goal>enhance</goal> + <goal>generate</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-maven-plugin</artifactId> + <executions> + <execution> + <id>default</id> + <phase>process-classes</phase> + <goals> + <goal>generate</goal> + </goals> + </execution> + </executions> + <configuration> + <validateXml>false</validateXml> + <descriptorLocation>withClasses</descriptorLocation> + <uimaTypeMappings> + <uimaTypeMapping>META-INF/eu.openminted.share/uimaTypeMapping.map</uimaTypeMapping> + </uimaTypeMappings> + <mimeTypeMappings> + <mimeTypeMapping>META-INF/eu.openminted.share/mimeTypeMapping.map</mimeTypeMapping> + </mimeTypeMappings> + <properties> + <docbase>https://dkpro.github.io/dkpro-core/releases/${project.version}/docs</docbase> + <resourceNameAddon>(DKPro Core)</resourceNameAddon> + <resourceCopyright> + Copyright ${project.inceptionYear}-${currentYear} + Ubiquitous Knowledge Processing (UKP) Lab + Technische Universität Darmstadt + (Applies only to the DKPro Core UIMA wrapper. The copyright for wrapped tools and underlying + libraries remains with the respective copyright owners.) + </resourceCopyright> + </properties> + </configuration> + <dependencies> + <dependency> + <groupId>javax.xml.bind</groupId> + <artifactId>jaxb-api</artifactId> + <version>2.3.1</version> + </dependency> + <dependency> + <groupId>com.sun.xml.bind</groupId> + <artifactId>jaxb-core</artifactId> + <version>2.3.0.1</version> + </dependency> + <dependency> + <groupId>com.sun.xml.bind</groupId> + <artifactId>jaxb-impl</artifactId> + <version>2.3.2</version> + </dependency> + <dependency> + <groupId>javax.activation</groupId> + <artifactId>javax.activation-api</artifactId> + <version>1.2.0</version> + </dependency> + </dependencies> </plugin> </plugins> <pluginManagement> <plugins> + <plugin> + <groupId>org.apache.uima</groupId> + <artifactId>uimafit-maven-plugin</artifactId> + <version>${uimafit.plugin.version}</version> + </plugin> + <plugin> + <groupId>eu.openminted.share.annotations</groupId> + <artifactId>omtd-share-annotations-maven-plugin</artifactId> + <version>${omtd.version}</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-release-plugin</artifactId> + <configuration> + <autoVersionSubmodules>true</autoVersionSubmodules> + <useReleaseProfile>false</useReleaseProfile> + <arguments>${arguments} -Dmaven.surefire.heap=${maven.surefire.heap} -Pdkpro-release</arguments> + </configuration> + </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> - <version>2.20</version> + <configuration> + <systemPropertyVariables> + <dkpro.core.testCachePath>${dkpro.core.testCachePath}</dkpro.core.testCachePath> + <!-- + UIMAv3 changes the default string serialization of feature structures which we rely + on in the CasDumpWriter. To minimize the differences between the DKPro Core UIMAv2 + and UIMAv3 branches, we force UIMAv3 to use the old serialization. We also do this + in the DkproTestContext, in particular since your IDE usually won't pick up the + property we set here. But we also set it here because whether or not setting the + value in DkproTestContext has an effect depends on the order in which the tests + are executed. I.e. if a loading the FeatureStructureImplC class is executed while + the property has not been set yet, then it is impossible for a later test to turn + it on. + --> + <uima.v2_pretty_print_format>true</uima.v2_pretty_print_format> + </systemPropertyVariables> + </configuration> </plugin> <plugin> - <groupId>org.apache.uima</groupId> - <artifactId>uimafit-maven-plugin</artifactId> - <version>${uimafit.plugin.version}</version> + <groupId>org.codehaus.mojo</groupId> + <artifactId>versions-maven-plugin</artifactId> + <version>2.8.1</version> <configuration> - <componentVendor>DKPro Core Project</componentVendor> - <componentCopyright> - Copyright ${project.inceptionYear}-${currentYear} - Ubiquitous Knowledge Processing (UKP) Lab - Technische Universität Darmstadt - </componentCopyright> - <failOnMissingMetaData>false</failOnMissingMetaData> + <rulesUri>file:${session.executionRootDirectory}/dkpro-core-build/src/main/resources/dkpro-core/version-rules.xml</rulesUri> </configuration> - <executions> - <execution> - <id>default</id> - <phase>process-classes</phase> - <goals> - <goal>enhance</goal> - <goal>generate</goal> - </goals> - </execution> - </executions> + <dependencies> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-build</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> </plugin> </plugins> </pluginManagement> @@ -595,37 +817,65 @@ </build> </profile> <profile> - <id>run-omtd-plugin</id> - <pluginRepositories> - <pluginRepository> - <id>omtd.snapshots</id> - <name>OpenMinTeD SNAPSHOTs</name> - <url>https://repo.openminted.eu/content/repositories/snapshots</url> - <releases> - <enabled>false</enabled> - </releases> - <snapshots> - <enabled>true</enabled> - </snapshots> - </pluginRepository> - </pluginRepositories> + <id>checkstyle</id> + <activation> + <file> + <exists>src</exists> + </file> + </activation> <build> <plugins> <plugin> - <groupId>eu.openminted.share.annotations</groupId> - <artifactId>omtd-share-annotations-maven-plugin</artifactId> - <version>0.0.1-SNAPSHOT</version> - <executions> - <execution> - <id>default</id> - <phase>process-classes</phase> - <goals> - <goal>generate</goal> - </goals> - </execution> - </executions> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-checkstyle-plugin</artifactId> </plugin> </plugins> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-checkstyle-plugin</artifactId> + <version>3.1.1</version> + <inherited>true</inherited> + <dependencies> + <dependency> + <groupId>org.dkpro.core</groupId> + <artifactId>dkpro-core-build</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>com.puppycrawl.tools</groupId> + <artifactId>checkstyle</artifactId> + <version>8.31</version> + </dependency> + </dependencies> + <configuration> + <!-- See: https://github.com/m2e-code-quality/m2e-code-quality/issues/117#issuecomment-380022879 --> + <sourceDirectories>${project.compileSourceRoots}</sourceDirectories> + <testSourceDirectories>${project.testCompileSourceRoots}</testSourceDirectories> + <configLocation>dkpro-core/checkstyle.xml</configLocation> + <propertyExpansion>basedir=${project.basedir}</propertyExpansion> + <consoleOutput>true</consoleOutput> + <failOnViolation>true</failOnViolation> + <includeTestSourceDirectory>true</includeTestSourceDirectory> + <includeResources>false</includeResources> + <includeTestResources>false</includeTestResources> + <logViolationsToConsole>true</logViolationsToConsole> + <maxAllowedViolations>0</maxAllowedViolations> + <violationSeverity>error</violationSeverity> + </configuration> + <executions> + <execution> + <id>checkstyle-check</id> + <phase>verify</phase> + <goals> + <goal>check</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </pluginManagement> </build> </profile> </profiles>